# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
# |         | 203.2                 | 453.5 / +123% | sign/s      |
# |---------+-----------------------+---------------+-------------|

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1`
        =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx512ifma = ($2>=7.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction
# step specified in the original algorithm as, according to Lemma 1 from the
# paper [2], the result will always be < 2*m and can be used as a direct input
# to the next AMM iteration.  This post-condition is true, provided the correct
# parameter |s| (in the notion of Lemma 1 from [2]) is chosen, i.e.
# s >= n + 2 * k, which matches our case: 1560 > 1536 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
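#
# For reference, a minimal, unoptimized C model of the word-by-word AMM
# described above (illustrative only, not part of the build; it assumes a
# compiler with unsigned __int128 support, works on the 30 data digits and
# ignores the 2 padding QWs; the name amm52_ref is hypothetical):
#
#   #include <stdint.h>
#
#   #define N      30                      /* 52-bit data digits */
#   #define MASK52 0xfffffffffffffULL
#
#   static void amm52_ref(uint64_t res[N], const uint64_t a[N],
#                         const uint64_t b[N], const uint64_t m[N],
#                         uint64_t k0)
#   {
#       unsigned __int128 acc[N + 1] = {0};
#
#       for (int i = 0; i < N; i++) {
#           uint64_t yi;
#           int j;
#
#           for (j = 0; j < N; j++)        /* acc += a * b[i] */
#               acc[j] += (unsigned __int128)a[j] * b[i];
#           yi = ((uint64_t)acc[0] * k0) & MASK52;
#           for (j = 0; j < N; j++)        /* acc += m * yi */
#               acc[j] += (unsigned __int128)m[j] * yi;
#           acc[1] += acc[0] >> 52;        /* low digit is now 0 mod 2^52 */
#           for (j = 0; j < N; j++)        /* acc >>= 52 (drop one digit) */
#               acc[j] = acc[j + 1];
#           acc[N] = 0;
#       }
#       /* normalize back to 52-bit digits; the result is < 2*m and no
#        * final conditional subtraction is done (see the NB above) */
#       unsigned __int128 c = 0;
#       for (int i = 0; i < N; i++) {
#           c += acc[i];
#           res[i] = (uint64_t)c & MASK52;
#           c >>= 52;
#       }
#   }
#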
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));

# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));

sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for the corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
	movq	$_b_offset($b_ptr), %r13	# b[i]

	vpbroadcastq	%r13, $Bi		# broadcast b[i]
	movq	$_data_offset($a), %rdx
	mulx	%r13, %r13, %r12		# a[0]*b[i] = (t0,t2)
	addq	%r13, $_acc			# acc += t0
	movq	%r12, %r10
	adcq	\$0, %r10			# t2 += CF

	movq	$_k0, %r13
	imulq	$_acc, %r13			# acc * k0
	andq	$mask52, %r13			# yi = (acc * k0) & mask52

	vpbroadcastq	%r13, $Yi		# broadcast y[i]
	movq	$_data_offset($m), %rdx
	mulx	%r13, %r13, %r12		# yi * m[0] = (t0,t1)
	addq	%r13, $_acc			# acc += t0
	adcq	%r12, %r10			# t2 += (t1 + CF)

	shrq	\$52, $_acc
	salq	\$12, %r10
	or	%r10, $_acc			# acc = ((acc >> 52) | (t2 << 12))

	vpmadd52luq	`$_data_offset+64*0`($a), $Bi, $_R0
	vpmadd52luq	`$_data_offset+64*0+32`($a), $Bi, $_R0h
	vpmadd52luq	`$_data_offset+64*1`($a), $Bi, $_R1
	vpmadd52luq	`$_data_offset+64*1+32`($a), $Bi, $_R1h
	vpmadd52luq	`$_data_offset+64*2`($a), $Bi, $_R2
	vpmadd52luq	`$_data_offset+64*2+32`($a), $Bi, $_R2h
	vpmadd52luq	`$_data_offset+64*3`($a), $Bi, $_R3
	vpmadd52luq	`$_data_offset+64*3+32`($a), $Bi, $_R3h

	vpmadd52luq	`$_data_offset+64*0`($m), $Yi, $_R0
	vpmadd52luq	`$_data_offset+64*0+32`($m), $Yi, $_R0h
	vpmadd52luq	`$_data_offset+64*1`($m), $Yi, $_R1
	vpmadd52luq	`$_data_offset+64*1+32`($m), $Yi, $_R1h
	vpmadd52luq	`$_data_offset+64*2`($m), $Yi, $_R2
	vpmadd52luq	`$_data_offset+64*2+32`($m), $Yi, $_R2h
	vpmadd52luq	`$_data_offset+64*3`($m), $Yi, $_R3
	vpmadd52luq	`$_data_offset+64*3+32`($m), $Yi, $_R3h

	# Shift accumulators right by 1 qword, zero extending the highest one
	valignq	\$1, $_R0, $_R0h, $_R0
	valignq	\$1, $_R0h, $_R1, $_R0h
	valignq	\$1, $_R1, $_R1h, $_R1
	valignq	\$1, $_R1h, $_R2, $_R1h
	valignq	\$1, $_R2, $_R2h, $_R2
	valignq	\$1, $_R2h, $_R3, $_R2h
	valignq	\$1, $_R3, $_R3h, $_R3
	valignq	\$1, $_R3h, $zero, $_R3h

	vmovq	$_R0_xmm, %r13
	addq	%r13, $_acc			# acc += R0[0]
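	# vpmadd52huq multiplies 52-bit lanes and accumulates the high 52
	# bits of each 104-bit product (vpmadd52luq above took the low 52).
	# The valignq block has already shifted the accumulator digits down
	# by one, so each high half below lands one digit above its matching
	# low half, i.e. per digit j:
	#   acc[j] += lo52(a[j]*b[i]);  acc[j+1] += hi52(a[j]*b[i])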
	vpmadd52huq	`$_data_offset+64*0`($a), $Bi, $_R0
	vpmadd52huq	`$_data_offset+64*0+32`($a), $Bi, $_R0h
	vpmadd52huq	`$_data_offset+64*1`($a), $Bi, $_R1
	vpmadd52huq	`$_data_offset+64*1+32`($a), $Bi, $_R1h
	vpmadd52huq	`$_data_offset+64*2`($a), $Bi, $_R2
	vpmadd52huq	`$_data_offset+64*2+32`($a), $Bi, $_R2h
	vpmadd52huq	`$_data_offset+64*3`($a), $Bi, $_R3
	vpmadd52huq	`$_data_offset+64*3+32`($a), $Bi, $_R3h

	vpmadd52huq	`$_data_offset+64*0`($m), $Yi, $_R0
	vpmadd52huq	`$_data_offset+64*0+32`($m), $Yi, $_R0h
	vpmadd52huq	`$_data_offset+64*1`($m), $Yi, $_R1
	vpmadd52huq	`$_data_offset+64*1+32`($m), $Yi, $_R1h
	vpmadd52huq	`$_data_offset+64*2`($m), $Yi, $_R2
	vpmadd52huq	`$_data_offset+64*2+32`($m), $Yi, $_R2h
	vpmadd52huq	`$_data_offset+64*3`($m), $Yi, $_R3
	vpmadd52huq	`$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}

# Normalization routine: handles carry bits and brings bignum qwords to the
# normalized 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
	# Put accumulator to low qword in R0
	vpbroadcastq	$_acc, $T0
	vpblendd	\$3, $T0, $_R0, $_R0

	# Extract "carries" (12 high bits) from each QW of the bignum
	# Save them to LSB of QWs in T0..Tn
	vpsrlq	\$52, $_R0, $T0
	vpsrlq	\$52, $_R0h, $T0h
	vpsrlq	\$52, $_R1, $T1
	vpsrlq	\$52, $_R1h, $T1h
	vpsrlq	\$52, $_R2, $T2
	vpsrlq	\$52, $_R2h, $T2h
	vpsrlq	\$52, $_R3, $T3
	vpsrlq	\$52, $_R3h, $T3h

	# "Shift left" T0..Tn by 1 QW
	valignq	\$3, $T3, $T3h, $T3h
	valignq	\$3, $T2h, $T3, $T3
	valignq	\$3, $T2, $T2h, $T2h
	valignq	\$3, $T1h, $T2, $T2
	valignq	\$3, $T1, $T1h, $T1h
	valignq	\$3, $T0h, $T1, $T1
	valignq	\$3, $T0, $T0h, $T0h
	valignq	\$3, .Lzeros(%rip), $T0, $T0

	# Drop "carries" from R0..Rn QWs
	vpandq	.Lmask52x4(%rip), $_R0, $_R0
	vpandq	.Lmask52x4(%rip), $_R0h, $_R0h
	vpandq	.Lmask52x4(%rip), $_R1, $_R1
	vpandq	.Lmask52x4(%rip), $_R1h, $_R1h
	vpandq	.Lmask52x4(%rip), $_R2, $_R2
	vpandq	.Lmask52x4(%rip), $_R2h, $_R2h
	vpandq	.Lmask52x4(%rip), $_R3, $_R3
	vpandq	.Lmask52x4(%rip), $_R3h, $_R3h

	# Sum R0..Rn with corresponding adjusted carries
	vpaddq	$T0, $_R0, $_R0
	vpaddq	$T0h, $_R0h, $_R0h
	vpaddq	$T1, $_R1, $_R1
	vpaddq	$T1h, $_R1h, $_R1h
	vpaddq	$T2, $_R2, $_R2
	vpaddq	$T2h, $_R2h, $_R2h
	vpaddq	$T3, $_R3, $_R3
	vpaddq	$T3h, $_R3h, $_R3h

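	# The scalar code below propagates these carries with a classic
	# generate/propagate trick: the GT masks ("52-bit digit overflowed")
	# are carry-generate bits, and the EQ masks ("digit equals 2^52-1,
	# so an incoming carry ripples through") are carry-propagate bits.
	# With g = gt << 1 (the addb/adcb doubling) and p = eq, the digits
	# that receive a carry are ((g + p) ^ p).  Worked example on 4
	# digits (bit i = digit i): gt = 0001b, eq = 0110b -> g = 0010b,
	# (g + p) ^ p = 1000b ^ 0110b = 1110b: the carry out of digit 0
	# ripples through saturated digits 1 and 2 into digit 3.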
	# Now handle carry bits from this addition
	# Get mask of QWs whose 52-bit parts overflow
	vpcmpuq	\$6,.Lmask52x4(%rip),${_R0},%k1		# OP=nle (i.e. gt)
	vpcmpuq	\$6,.Lmask52x4(%rip),${_R0h},%k2
	kmovb	%k1,%r14d
	kmovb	%k2,%r13d
	shl	\$4,%r13b
	or	%r13b,%r14b

	vpcmpuq	\$6,.Lmask52x4(%rip),${_R1},%k1
	vpcmpuq	\$6,.Lmask52x4(%rip),${_R1h},%k2
	kmovb	%k1,%r13d
	kmovb	%k2,%r12d
	shl	\$4,%r12b
	or	%r12b,%r13b

	vpcmpuq	\$6,.Lmask52x4(%rip),${_R2},%k1
	vpcmpuq	\$6,.Lmask52x4(%rip),${_R2h},%k2
	kmovb	%k1,%r12d
	kmovb	%k2,%r11d
	shl	\$4,%r11b
	or	%r11b,%r12b

	vpcmpuq	\$6,.Lmask52x4(%rip),${_R3},%k1
	vpcmpuq	\$6,.Lmask52x4(%rip),${_R3h},%k2
	kmovb	%k1,%r11d
	kmovb	%k2,%r10d
	shl	\$4,%r10b
	or	%r10b,%r11b

	addb	%r14b,%r14b
	adcb	%r13b,%r13b
	adcb	%r12b,%r12b
	adcb	%r11b,%r11b

	# Get mask of QWs whose 52-bit parts saturated
	vpcmpuq	\$0,.Lmask52x4(%rip),${_R0},%k1		# OP=eq
	vpcmpuq	\$0,.Lmask52x4(%rip),${_R0h},%k2
	kmovb	%k1,%r9d
	kmovb	%k2,%r8d
	shl	\$4,%r8b
	or	%r8b,%r9b

	vpcmpuq	\$0,.Lmask52x4(%rip),${_R1},%k1
	vpcmpuq	\$0,.Lmask52x4(%rip),${_R1h},%k2
	kmovb	%k1,%r8d
	kmovb	%k2,%edx
	shl	\$4,%dl
	or	%dl,%r8b

	vpcmpuq	\$0,.Lmask52x4(%rip),${_R2},%k1
	vpcmpuq	\$0,.Lmask52x4(%rip),${_R2h},%k2
	kmovb	%k1,%edx
	kmovb	%k2,%ecx
	shl	\$4,%cl
	or	%cl,%dl

	vpcmpuq	\$0,.Lmask52x4(%rip),${_R3},%k1
	vpcmpuq	\$0,.Lmask52x4(%rip),${_R3h},%k2
	kmovb	%k1,%ecx
	kmovb	%k2,%ebx
	shl	\$4,%bl
	or	%bl,%cl

	addb	%r9b,%r14b
	adcb	%r8b,%r13b
	adcb	%dl,%r12b
	adcb	%cl,%r11b

	xor	%r9b,%r14b
	xor	%r8b,%r13b
	xor	%dl,%r12b
	xor	%cl,%r11b

	kmovb	%r14d,%k1
	shr	\$4,%r14b
	kmovb	%r14d,%k2
	kmovb	%r13d,%k3
	shr	\$4,%r13b
	kmovb	%r13d,%k4
	kmovb	%r12d,%k5
	shr	\$4,%r12b
	kmovb	%r12d,%k6
	kmovb	%r11d,%k7

	vpsubq	.Lmask52x4(%rip), $_R0, ${_R0}{%k1}
	vpsubq	.Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
	vpsubq	.Lmask52x4(%rip), $_R1, ${_R1}{%k3}
	vpsubq	.Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
	vpsubq	.Lmask52x4(%rip), $_R2, ${_R2}{%k5}
	vpsubq	.Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
	vpsubq	.Lmask52x4(%rip), $_R3, ${_R3}{%k7}

	vpandq	.Lmask52x4(%rip), $_R0, $_R0
	vpandq	.Lmask52x4(%rip), $_R0h, $_R0h
	vpandq	.Lmask52x4(%rip), $_R1, $_R1
	vpandq	.Lmask52x4(%rip), $_R1h, $_R1h
	vpandq	.Lmask52x4(%rip), $_R2, $_R2
	vpandq	.Lmask52x4(%rip), $_R2h, $_R2h
	vpandq	.Lmask52x4(%rip), $_R3, $_R3

	shr	\$4,%r11b
	kmovb	%r11d,%k1

	vpsubq	.Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}

	vpandq	.Lmask52x4(%rip), $_R3h, $_R3h
___
}

$code.=<<___;
.text

.globl	ossl_rsaz_amm52x30_x1_ifma256
.type	ossl_rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x1_ifma256:
.cfi_startproc
	endbranch
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-168(%rsp),%rsp		# 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
	vmovdqa64	%xmm6, `0*16`(%rsp)	# save non-volatile registers
	vmovdqa64	%xmm7, `1*16`(%rsp)
	vmovdqa64	%xmm8, `2*16`(%rsp)
	vmovdqa64	%xmm9, `3*16`(%rsp)
	vmovdqa64	%xmm10,`4*16`(%rsp)
	vmovdqa64	%xmm11,`5*16`(%rsp)
	vmovdqa64	%xmm12,`6*16`(%rsp)
	vmovdqa64	%xmm13,`7*16`(%rsp)
	vmovdqa64	%xmm14,`8*16`(%rsp)
	vmovdqa64	%xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
	# Zeroing accumulators
	vpxord	$zero, $zero, $zero
	vmovdqa64	$zero, $R0_0
	vmovdqa64	$zero, $R0_0h
	vmovdqa64	$zero, $R1_0
	vmovdqa64	$zero, $R1_0h
	vmovdqa64	$zero, $R2_0
	vmovdqa64	$zero, $R2_0h
	vmovdqa64	$zero, $R3_0
	vmovdqa64	$zero, $R3_0h

	xorl	$acc0_0_low, $acc0_0_low

	movq	$b, $b_ptr			# backup address of b
	movq	\$0xfffffffffffff, $mask52	# 52-bit mask

	# Loop over 30 digits, unrolled by 4: 7 iterations of 4 digits below,
	# plus a 2-digit tail after the loop
	mov	\$7, $iter

.align 32
.Lloop7:
___
	foreach my $idx (0..3) {
		&amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
	}
$code.=<<___;
	lea	`4*8`($b_ptr), $b_ptr
	dec	$iter
	jne	.Lloop7
___
	&amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
	&amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);

	&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;

	vmovdqu64	$R0_0, `0*32`($res)
	vmovdqu64	$R0_0h, `1*32`($res)
	vmovdqu64	$R1_0, `2*32`($res)
	vmovdqu64	$R1_0h, `3*32`($res)
	vmovdqu64	$R2_0, `4*32`($res)
	vmovdqu64	$R2_0h, `5*32`($res)
	vmovdqu64	$R3_0, `6*32`($res)
	vmovdqu64	$R3_0h, `7*32`($res)

	vzeroupper
	lea	(%rsp),%rax
.cfi_def_cfa_register	%rax
___
$code.=<<___ if ($win64);
	vmovdqa64	`0*16`(%rax),%xmm6
	vmovdqa64	`1*16`(%rax),%xmm7
	vmovdqa64	`2*16`(%rax),%xmm8
	vmovdqa64	`3*16`(%rax),%xmm9
	vmovdqa64	`4*16`(%rax),%xmm10
	vmovdqa64	`5*16`(%rax),%xmm11
	vmovdqa64	`6*16`(%rax),%xmm12
	vmovdqa64	`7*16`(%rax),%xmm13
	vmovdqa64	`8*16`(%rax),%xmm14
	vmovdqa64	`9*16`(%rax),%xmm15
	lea	168(%rsp),%rax
___
$code.=<<___;
	mov	0(%rax),%r15
.cfi_restore	%r15
	mov	8(%rax),%r14
.cfi_restore	%r14
	mov	16(%rax),%r13
.cfi_restore	%r13
	mov	24(%rax),%r12
.cfi_restore	%r12
	mov	32(%rax),%rbp
.cfi_restore	%rbp
	mov	40(%rax),%rbx
.cfi_restore	%rbx
	lea	48(%rax),%rsp		# restore rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
	ret
.cfi_endproc
.size	ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___

$code.=<<___;
.data
.align 32
.Lmask52x4:
	.quad	0xfffffffffffff
	.quad	0xfffffffffffff
	.quad	0xfffffffffffff
	.quad	0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52
#
# See the description of ossl_rsaz_amm52x30_x1_ifma256() above for details
# about the Almost Montgomery Multiplication algorithm and the function input
# parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
#                                    const BN_ULONG a[2][32],
#                                    const BN_ULONG b[2][32],
#                                    const BN_ULONG m[2][32],
#                                    const BN_ULONG k0[2]);
###############################################################################
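#
# In outline (an illustrative pseudo-C sketch only; amm_step stands for one
# iteration of the single-input digit loop implemented by amm52x30_x1()):
#
#   for (i = 0; i < 30; i++) {
#       amm_step(acc_0, a[0], b[0][i], m[0], k0[0]);   /* first input set  */
#       amm_step(acc_1, a[1], b[1][i], m[1], k0[1]);   /* second input set */
#   }
#
# The two dependency chains are independent, which helps hide instruction
# latency when they are interleaved.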
$code.=<<___;
.text

.globl	ossl_rsaz_amm52x30_x2_ifma256
.type	ossl_rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x2_ifma256:
.cfi_startproc
	endbranch
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-168(%rsp),%rsp
	vmovdqa64	%xmm6, `0*16`(%rsp)	# save non-volatile registers
	vmovdqa64	%xmm7, `1*16`(%rsp)
	vmovdqa64	%xmm8, `2*16`(%rsp)
	vmovdqa64	%xmm9, `3*16`(%rsp)
	vmovdqa64	%xmm10,`4*16`(%rsp)
	vmovdqa64	%xmm11,`5*16`(%rsp)
	vmovdqa64	%xmm12,`6*16`(%rsp)
	vmovdqa64	%xmm13,`7*16`(%rsp)
	vmovdqa64	%xmm14,`8*16`(%rsp)
	vmovdqa64	%xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
	# Zeroing accumulators
	vpxord	$zero, $zero, $zero
	vmovdqa64	$zero, $R0_0
	vmovdqa64	$zero, $R0_0h
	vmovdqa64	$zero, $R1_0
	vmovdqa64	$zero, $R1_0h
	vmovdqa64	$zero, $R2_0
	vmovdqa64	$zero, $R2_0h
	vmovdqa64	$zero, $R3_0
	vmovdqa64	$zero, $R3_0h

	vmovdqa64	$zero, $R0_1
	vmovdqa64	$zero, $R0_1h
	vmovdqa64	$zero, $R1_1
	vmovdqa64	$zero, $R1_1h
	vmovdqa64	$zero, $R2_1
	vmovdqa64	$zero, $R2_1h
	vmovdqa64	$zero, $R3_1
	vmovdqa64	$zero, $R3_1h

	xorl	$acc0_0_low, $acc0_0_low
	xorl	$acc0_1_low, $acc0_1_low

	movq	$b, $b_ptr			# backup address of b
	movq	\$0xfffffffffffff, $mask52	# 52-bit mask

	mov	\$30, $iter

.align 32
.Lloop30:
___
	&amm52x30_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
	# 32*8 = offset of the next dimension in two-dimension array
	&amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
	lea	8($b_ptr), $b_ptr
	dec	$iter
	jne	.Lloop30
___
	&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
	&amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;

	vmovdqu64	$R0_0, `0*32`($res)
	vmovdqu64	$R0_0h, `1*32`($res)
	vmovdqu64	$R1_0, `2*32`($res)
	vmovdqu64	$R1_0h, `3*32`($res)
	vmovdqu64	$R2_0, `4*32`($res)
	vmovdqu64	$R2_0h, `5*32`($res)
	vmovdqu64	$R3_0, `6*32`($res)
	vmovdqu64	$R3_0h, `7*32`($res)

	vmovdqu64	$R0_1, `8*32`($res)
	vmovdqu64	$R0_1h, `9*32`($res)
	vmovdqu64	$R1_1, `10*32`($res)
	vmovdqu64	$R1_1h, `11*32`($res)
	vmovdqu64	$R2_1, `12*32`($res)
	vmovdqu64	$R2_1h, `13*32`($res)
	vmovdqu64	$R3_1, `14*32`($res)
	vmovdqu64	$R3_1h, `15*32`($res)

	vzeroupper
	lea	(%rsp),%rax
.cfi_def_cfa_register	%rax
___
$code.=<<___ if ($win64);
	vmovdqa64	`0*16`(%rax),%xmm6
	vmovdqa64	`1*16`(%rax),%xmm7
	vmovdqa64	`2*16`(%rax),%xmm8
	vmovdqa64	`3*16`(%rax),%xmm9
	vmovdqa64	`4*16`(%rax),%xmm10
	vmovdqa64	`5*16`(%rax),%xmm11
	vmovdqa64	`6*16`(%rax),%xmm12
	vmovdqa64	`7*16`(%rax),%xmm13
	vmovdqa64	`8*16`(%rax),%xmm14
	vmovdqa64	`9*16`(%rax),%xmm15
	lea	168(%rsp),%rax
___
$code.=<<___;
	mov	0(%rax),%r15
.cfi_restore	%r15
	mov	8(%rax),%r14
.cfi_restore	%r14
	mov	16(%rax),%r13
.cfi_restore	%r13
	mov	24(%rax),%r12
.cfi_restore	%r12
	mov	32(%rax),%rbp
.cfi_restore	%rbp
	mov	40(%rax),%rbx
.cfi_restore	%rbx
	lea	48(%rax),%rsp
.cfi_def_cfa	%rsp,8
.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
	ret
.cfi_endproc
.size	ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#   i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base
# values.  |red_table_idx1| and |red_table_idx2| are the corresponding power
# indexes.
#
# The extracted value (output) is two (30 + 2)-digit numbers in 2^52 radix
# (the 2 high QWs are zero padding).
#
# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
#                                        int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
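#
# A scalar C model of the same constant-time selection (illustrative only;
# the name extract_ref is hypothetical): every table entry is read and the
# wanted entries are merged in under a mask, so the memory access pattern is
# independent of the secret indexes.  (In real C the mask derivation would
# have to be guaranteed branchless; the compare below is only a model.)
#
#   #include <stdint.h>
#   #include <string.h>
#
#   static void extract_ref(uint64_t out[2][32],
#                           const uint64_t tbl[32][2][32],
#                           uint64_t idx1, uint64_t idx2)
#   {
#       memset(out, 0, 2 * 32 * sizeof(uint64_t));
#       for (uint64_t i = 0; i < 32; i++) {
#           uint64_t m1 = 0 - (uint64_t)(i == idx1);  /* all-ones on match */
#           uint64_t m2 = 0 - (uint64_t)(i == idx2);
#           for (int j = 0; j < 32; j++) {
#               out[0][j] |= tbl[i][0][j] & m1;
#               out[1][j] |= tbl[i][1][j] & m2;
#           }
#       }
#   }
#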
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                                        ("%rdi","%rsi","%rdx","%rcx");	# Unix order

my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));

my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;

$code.=<<___;
.text

.align 32
.globl	ossl_extract_multiplier_2x30_win5
.type	ossl_extract_multiplier_2x30_win5,\@abi-omnipotent
ossl_extract_multiplier_2x30_win5:
.cfi_startproc
	endbranch
	vmovdqa64	.Lones(%rip), $ones	# broadcast ones
	vpbroadcastq	$red_tbl_idx1, $idx1
	vpbroadcastq	$red_tbl_idx2, $idx2
	leaq	`(1<<5)*2*32*8`($red_tbl), %rax	# holds end of the tbl

	# zeroing t0..n, cur_idx
	vpxor	$t0xmm, $t0xmm, $t0xmm
	vmovdqa64	$t0, $cur_idx
___
foreach (1..15) {
	$code.="	vmovdqa64	$t0, $t[$_] \n";
}
$code.=<<___;

.align 32
.Lloop:
	vpcmpq	\$0, $cur_idx, $idx1, %k1	# mask of (idx1 == cur_idx)
	vpcmpq	\$0, $cur_idx, $idx2, %k2	# mask of (idx2 == cur_idx)
___
foreach (0..15) {
	my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
	vmovdqu64	`${_}*32`($red_tbl), $tmp	# load data from red_tbl
	vpblendmq	$tmp, $t[$_], ${t[$_]}{$mask}	# extract data when mask is not zero
___
}
$code.=<<___;
	vpaddq	$ones, $cur_idx, $cur_idx	# increment cur_idx
	addq	\$`2*32*8`, $red_tbl
	cmpq	$red_tbl, %rax
	jne	.Lloop
___
# store t0..n
foreach (0..15) {
	$code.="	vmovdqu64	$t[$_], `${_}*32`($out) \n";
}
$code.=<<___;

	ret
.cfi_endproc
.size	ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
___
$code.=<<___;
.data
.align 32
.Lones:
	.quad	1,1,1,1
.Lzeros:
	.quad	0,0,0,0
___
}

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
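# rsaz_avx_handler is the Win64 SEH unwind handler shared by both AMM entry
# points: for a fault with RIP inside a function body it restores the
# xmm6-15 and GPR values saved by the prologues above from the stack frame,
# then forwards to RtlVirtualUnwind to continue unwinding.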
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	rsaz_avx_handler,\@abi-omnipotent
.align	16
rsaz_avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	lea	`48+168`(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_avx_handler,.-rsaz_avx_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256
	.rva	.LSEH_end_ossl_rsaz_amm52x30_x1_ifma256
	.rva	.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256

	.rva	.LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256
	.rva	.LSEH_end_ossl_rsaz_amm52x30_x2_ifma256
	.rva	.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256

.section	.xdata
.align	8
.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256:
	.byte	9,0,0,0
	.rva	rsaz_avx_handler
	.rva	.Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue
.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256:
	.byte	9,0,0,0
	.rva	rsaz_avx_handler
	.rva	.Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue
___
}
}}} else {{{	# fallback for old assembler
$code.=<<___;
.text

.globl	ossl_rsaz_amm52x30_x1_ifma256
.globl	ossl_rsaz_amm52x30_x2_ifma256
.globl	ossl_extract_multiplier_2x30_win5
.type	ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x30_x1_ifma256:
ossl_rsaz_amm52x30_x2_ifma256:
ossl_extract_multiplier_2x30_win5:
	.byte	0x0f,0x0b	# ud2
	ret
.size	ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";