1# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved. 2# Copyright (c) 2020, Intel Corporation. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# 10# Originally written by Sergey Kirillov and Andrey Matyukov. 11# Special thanks to Ilya Albrekht for his valuable hints. 12# Intel Corporation 13# 14# December 2020 15# 16# Initial release. 17# 18# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues. 19# 20# IceLake-Client @ 1.3GHz 21# |---------+----------------------+--------------+-------------| 22# | | OpenSSL 3.0.0-alpha9 | this | Unit | 23# |---------+----------------------+--------------+-------------| 24# | rsa2048 | 2 127 659 | 1 015 625 | cycles/sign | 25# | | 611 | 1280 / +109% | sign/s | 26# |---------+----------------------+--------------+-------------| 27# 28 29# $output is the last argument if it looks like a file (it has an extension) 30# $flavour is the first argument if it doesn't look like a file 31$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 32$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 33 34$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 35$avx512ifma=0; 36 37$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 38( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 39( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 40die "can't locate x86_64-xlate.pl"; 41 42if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 43 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 44 $avx512ifma = ($1>=2.26); 45} 46 47if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 48 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { 49 $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12); 50} 51 52if (!$avx512ifma && `$ENV{CC} -v 2>&1` 53 =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) { 54 my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 55 if ($1) { 56 # Apple conditions, they use a different version series, see 57 # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2 58 # clang 7.0.0 is Apple clang 10.0.1 59 $avx512ifma = ($ver>=10.0001) 60 } else { 61 $avx512ifma = ($ver>=7.0); 62 } 63} 64 65open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 66 or die "can't call $xlate: $!"; 67*STDOUT=*OUT; 68 69if ($avx512ifma>0) {{{ 70@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); 71 72$code.=<<___; 73.extern OPENSSL_ia32cap_P 74.globl ossl_rsaz_avx512ifma_eligible 75.type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent 76.align 32 77ossl_rsaz_avx512ifma_eligible: 78 mov OPENSSL_ia32cap_P+8(%rip), %ecx 79 xor %eax,%eax 80 and \$`1<<31|1<<21|1<<17|1<<16`, %ecx # avx512vl + avx512ifma + avx512dq + avx512f 81 cmp \$`1<<31|1<<21|1<<17|1<<16`, %ecx 82 cmove %ecx,%eax 83 ret 84.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible 85___ 86 87############################################################################### 88# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52. 89# 90# AMM is defined as presented in the paper [1]. 91# 92# The input and output are presented in 2^52 radix domain, i.e. 93# |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed. 94# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 95# 96# NB: the AMM implementation does not perform "conditional" subtraction step 97# specified in the original algorithm as according to the Lemma 1 from the paper 98# [2], the result will be always < 2*m and can be used as a direct input to 99# the next AMM iteration. This post-condition is true, provided the correct 100# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, 101# which matches our case: 1040 > 1024 + 2 * 1. 102# 103# [1] Gueron, S. Efficient software implementations of modular exponentiation. 104# DOI: 10.1007/s13389-012-0031-5 105# [2] Gueron, S. Enhanced Montgomery Multiplication. 106# DOI: 10.1007/3-540-36400-5_5 107# 108# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, 109# const BN_ULONG *a, 110# const BN_ULONG *b, 111# const BN_ULONG *m, 112# BN_ULONG k0); 113############################################################################### 114{ 115# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") 116my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; 117 118my $mask52 = "%rax"; 119my $acc0_0 = "%r9"; 120my $acc0_0_low = "%r9d"; 121my $acc0_1 = "%r15"; 122my $acc0_1_low = "%r15d"; 123my $b_ptr = "%r11"; 124 125my $iter = "%ebx"; 126 127my $zero = "%ymm0"; 128my $Bi = "%ymm1"; 129my $Yi = "%ymm2"; 130my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19))); 131my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23))); 132 133# Registers mapping for normalization. 134my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26))); 135 136sub amm52x20_x1() { 137# _data_offset - offset in the |a| or |m| arrays pointing to the beginning 138# of data for corresponding AMM operation; 139# _b_offset - offset in the |b| array pointing to the next qword digit; 140my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_; 141my $_R0_xmm = $_R0; 142$_R0_xmm =~ s/%y/%x/; 143$code.=<<___; 144 movq $_b_offset($b_ptr), %r13 # b[i] 145 146 vpbroadcastq %r13, $Bi # broadcast b[i] 147 movq $_data_offset($a), %rdx 148 mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2) 149 addq %r13, $_acc # acc += t0 150 movq %r12, %r10 151 adcq \$0, %r10 # t2 += CF 152 153 movq $_k0, %r13 154 imulq $_acc, %r13 # acc * k0 155 andq $mask52, %r13 # yi = (acc * k0) & mask52 156 157 vpbroadcastq %r13, $Yi # broadcast y[i] 158 movq $_data_offset($m), %rdx 159 mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1) 160 addq %r13, $_acc # acc += t0 161 adcq %r12, %r10 # t2 += (t1 + CF) 162 163 shrq \$52, $_acc 164 salq \$12, %r10 165 or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12)) 166 167 vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0 168 vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h 169 vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1 170 vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h 171 vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2 172 173 vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0 174 vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h 175 vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1 176 vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h 177 vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2 178 179 # Shift accumulators right by 1 qword, zero extending the highest one 180 valignq \$1, $_R0, $_R0h, $_R0 181 valignq \$1, $_R0h, $_R1, $_R0h 182 valignq \$1, $_R1, $_R1h, $_R1 183 valignq \$1, $_R1h, $_R2, $_R1h 184 valignq \$1, $_R2, $zero, $_R2 185 186 vmovq $_R0_xmm, %r13 187 addq %r13, $_acc # acc += R0[0] 188 189 vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0 190 vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h 191 vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1 192 vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h 193 vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2 194 195 vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0 196 vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h 197 vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1 198 vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h 199 vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2 200___ 201} 202 203# Normalization routine: handles carry bits and gets bignum qwords to normalized 204# 2^52 representation. 205# 206# Uses %r8-14,%e[bcd]x 207sub amm52x20_x1_norm { 208my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_; 209$code.=<<___; 210 # Put accumulator to low qword in R0 211 vpbroadcastq $_acc, $T0 212 vpblendd \$3, $T0, $_R0, $_R0 213 214 # Extract "carries" (12 high bits) from each QW of R0..R2 215 # Save them to LSB of QWs in T0..T2 216 vpsrlq \$52, $_R0, $T0 217 vpsrlq \$52, $_R0h, $T0h 218 vpsrlq \$52, $_R1, $T1 219 vpsrlq \$52, $_R1h, $T1h 220 vpsrlq \$52, $_R2, $T2 221 222 # "Shift left" T0..T2 by 1 QW 223 valignq \$3, $T1h, $T2, $T2 224 valignq \$3, $T1, $T1h, $T1h 225 valignq \$3, $T0h, $T1, $T1 226 valignq \$3, $T0, $T0h, $T0h 227 valignq \$3, .Lzeros(%rip), $T0, $T0 228 229 # Drop "carries" from R0..R2 QWs 230 vpandq .Lmask52x4(%rip), $_R0, $_R0 231 vpandq .Lmask52x4(%rip), $_R0h, $_R0h 232 vpandq .Lmask52x4(%rip), $_R1, $_R1 233 vpandq .Lmask52x4(%rip), $_R1h, $_R1h 234 vpandq .Lmask52x4(%rip), $_R2, $_R2 235 236 # Sum R0..R2 with corresponding adjusted carries 237 vpaddq $T0, $_R0, $_R0 238 vpaddq $T0h, $_R0h, $_R0h 239 vpaddq $T1, $_R1, $_R1 240 vpaddq $T1h, $_R1h, $_R1h 241 vpaddq $T2, $_R2, $_R2 242 243 # Now handle carry bits from this addition 244 # Get mask of QWs which 52-bit parts overflow... 245 vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt) 246 vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2 247 vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3 248 vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4 249 vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5 250 kmovb %k1, %r14d # k1 251 kmovb %k2, %r13d # k1h 252 kmovb %k3, %r12d # k2 253 kmovb %k4, %r11d # k2h 254 kmovb %k5, %r10d # k3 255 256 # ...or saturated 257 vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq 258 vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2 259 vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3 260 vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4 261 vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5 262 kmovb %k1, %r9d # k4 263 kmovb %k2, %r8d # k4h 264 kmovb %k3, %ebx # k5 265 kmovb %k4, %ecx # k5h 266 kmovb %k5, %edx # k6 267 268 # Get mask of QWs where carries shall be propagated to. 269 # Merge 4-bit masks to 8-bit values to use add with carry. 270 shl \$4, %r13b 271 or %r13b, %r14b 272 shl \$4, %r11b 273 or %r11b, %r12b 274 275 add %r14b, %r14b 276 adc %r12b, %r12b 277 adc %r10b, %r10b 278 279 shl \$4, %r8b 280 or %r8b,%r9b 281 shl \$4, %cl 282 or %cl, %bl 283 284 add %r9b, %r14b 285 adc %bl, %r12b 286 adc %dl, %r10b 287 288 xor %r9b, %r14b 289 xor %bl, %r12b 290 xor %dl, %r10b 291 292 kmovb %r14d, %k1 293 shr \$4, %r14b 294 kmovb %r14d, %k2 295 kmovb %r12d, %k3 296 shr \$4, %r12b 297 kmovb %r12d, %k4 298 kmovb %r10d, %k5 299 300 # Add carries according to the obtained mask 301 vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1} 302 vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2} 303 vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3} 304 vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4} 305 vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5} 306 307 vpandq .Lmask52x4(%rip), $_R0, $_R0 308 vpandq .Lmask52x4(%rip), $_R0h, $_R0h 309 vpandq .Lmask52x4(%rip), $_R1, $_R1 310 vpandq .Lmask52x4(%rip), $_R1h, $_R1h 311 vpandq .Lmask52x4(%rip), $_R2, $_R2 312___ 313} 314 315$code.=<<___; 316.text 317 318.globl ossl_rsaz_amm52x20_x1_ifma256 319.type ossl_rsaz_amm52x20_x1_ifma256,\@function,5 320.align 32 321ossl_rsaz_amm52x20_x1_ifma256: 322.cfi_startproc 323 endbranch 324 push %rbx 325.cfi_push %rbx 326 push %rbp 327.cfi_push %rbp 328 push %r12 329.cfi_push %r12 330 push %r13 331.cfi_push %r13 332 push %r14 333.cfi_push %r14 334 push %r15 335.cfi_push %r15 336.Lossl_rsaz_amm52x20_x1_ifma256_body: 337 338 # Zeroing accumulators 339 vpxord $zero, $zero, $zero 340 vmovdqa64 $zero, $R0_0 341 vmovdqa64 $zero, $R0_0h 342 vmovdqa64 $zero, $R1_0 343 vmovdqa64 $zero, $R1_0h 344 vmovdqa64 $zero, $R2_0 345 346 xorl $acc0_0_low, $acc0_0_low 347 348 movq $b, $b_ptr # backup address of b 349 movq \$0xfffffffffffff, $mask52 # 52-bit mask 350 351 # Loop over 20 digits unrolled by 4 352 mov \$5, $iter 353 354.align 32 355.Lloop5: 356___ 357 foreach my $idx (0..3) { 358 &amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0); 359 } 360$code.=<<___; 361 lea `4*8`($b_ptr), $b_ptr 362 dec $iter 363 jne .Lloop5 364___ 365 &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0); 366$code.=<<___; 367 368 vmovdqu64 $R0_0, `0*32`($res) 369 vmovdqu64 $R0_0h, `1*32`($res) 370 vmovdqu64 $R1_0, `2*32`($res) 371 vmovdqu64 $R1_0h, `3*32`($res) 372 vmovdqu64 $R2_0, `4*32`($res) 373 374 vzeroupper 375 mov 0(%rsp),%r15 376.cfi_restore %r15 377 mov 8(%rsp),%r14 378.cfi_restore %r14 379 mov 16(%rsp),%r13 380.cfi_restore %r13 381 mov 24(%rsp),%r12 382.cfi_restore %r12 383 mov 32(%rsp),%rbp 384.cfi_restore %rbp 385 mov 40(%rsp),%rbx 386.cfi_restore %rbx 387 lea 48(%rsp),%rsp 388.cfi_adjust_cfa_offset -48 389.Lossl_rsaz_amm52x20_x1_ifma256_epilogue: 390 ret 391.cfi_endproc 392.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 393___ 394 395$code.=<<___; 396.section .rodata align=32 397.align 32 398.Lmask52x4: 399 .quad 0xfffffffffffff 400 .quad 0xfffffffffffff 401 .quad 0xfffffffffffff 402 .quad 0xfffffffffffff 403___ 404 405############################################################################### 406# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52 407# 408# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost 409# Montgomery Multiplication algorithm and function input parameters description. 410# 411# This function does two AMMs for two independent inputs, hence dual. 412# 413# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20], 414# const BN_ULONG a[2][20], 415# const BN_ULONG b[2][20], 416# const BN_ULONG m[2][20], 417# const BN_ULONG k0[2]); 418############################################################################### 419 420$code.=<<___; 421.text 422 423.globl ossl_rsaz_amm52x20_x2_ifma256 424.type ossl_rsaz_amm52x20_x2_ifma256,\@function,5 425.align 32 426ossl_rsaz_amm52x20_x2_ifma256: 427.cfi_startproc 428 endbranch 429 push %rbx 430.cfi_push %rbx 431 push %rbp 432.cfi_push %rbp 433 push %r12 434.cfi_push %r12 435 push %r13 436.cfi_push %r13 437 push %r14 438.cfi_push %r14 439 push %r15 440.cfi_push %r15 441.Lossl_rsaz_amm52x20_x2_ifma256_body: 442 443 # Zeroing accumulators 444 vpxord $zero, $zero, $zero 445 vmovdqa64 $zero, $R0_0 446 vmovdqa64 $zero, $R0_0h 447 vmovdqa64 $zero, $R1_0 448 vmovdqa64 $zero, $R1_0h 449 vmovdqa64 $zero, $R2_0 450 vmovdqa64 $zero, $R0_1 451 vmovdqa64 $zero, $R0_1h 452 vmovdqa64 $zero, $R1_1 453 vmovdqa64 $zero, $R1_1h 454 vmovdqa64 $zero, $R2_1 455 456 xorl $acc0_0_low, $acc0_0_low 457 xorl $acc0_1_low, $acc0_1_low 458 459 movq $b, $b_ptr # backup address of b 460 movq \$0xfffffffffffff, $mask52 # 52-bit mask 461 462 mov \$20, $iter 463 464.align 32 465.Lloop20: 466___ 467 &amm52x20_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)"); 468 # 20*8 = offset of the next dimension in two-dimension array 469 &amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)"); 470$code.=<<___; 471 lea 8($b_ptr), $b_ptr 472 dec $iter 473 jne .Lloop20 474___ 475 &amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0); 476 &amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1); 477$code.=<<___; 478 479 vmovdqu64 $R0_0, `0*32`($res) 480 vmovdqu64 $R0_0h, `1*32`($res) 481 vmovdqu64 $R1_0, `2*32`($res) 482 vmovdqu64 $R1_0h, `3*32`($res) 483 vmovdqu64 $R2_0, `4*32`($res) 484 485 vmovdqu64 $R0_1, `5*32`($res) 486 vmovdqu64 $R0_1h, `6*32`($res) 487 vmovdqu64 $R1_1, `7*32`($res) 488 vmovdqu64 $R1_1h, `8*32`($res) 489 vmovdqu64 $R2_1, `9*32`($res) 490 491 vzeroupper 492 mov 0(%rsp),%r15 493.cfi_restore %r15 494 mov 8(%rsp),%r14 495.cfi_restore %r14 496 mov 16(%rsp),%r13 497.cfi_restore %r13 498 mov 24(%rsp),%r12 499.cfi_restore %r12 500 mov 32(%rsp),%rbp 501.cfi_restore %rbp 502 mov 40(%rsp),%rbx 503.cfi_restore %rbx 504 lea 48(%rsp),%rsp 505.cfi_adjust_cfa_offset -48 506.Lossl_rsaz_amm52x20_x2_ifma256_epilogue: 507 ret 508.cfi_endproc 509.size ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256 510___ 511} 512 513############################################################################### 514# Constant time extraction from the precomputed table of powers base^i, where 515# i = 0..2^EXP_WIN_SIZE-1 516# 517# The input |red_table| contains precomputations for two independent base values. 518# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. 519# 520# Extracted value (output) is 2 20 digit numbers in 2^52 radix. 521# 522# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, 523# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20], 524# int red_table_idx1, int red_table_idx2); 525# 526# EXP_WIN_SIZE = 5 527############################################################################### 528{ 529# input parameters 530my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 531 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 532 533my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5)); 534my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19)); 535my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24)); 536 537my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9); 538my $t0xmm = $t0; 539$t0xmm =~ s/%y/%x/; 540 541$code.=<<___; 542.text 543 544.align 32 545.globl ossl_extract_multiplier_2x20_win5 546.type ossl_extract_multiplier_2x20_win5,\@abi-omnipotent 547ossl_extract_multiplier_2x20_win5: 548.cfi_startproc 549 endbranch 550 vmovdqa64 .Lones(%rip), $ones # broadcast ones 551 vpbroadcastq $red_tbl_idx1, $idx1 552 vpbroadcastq $red_tbl_idx2, $idx2 553 leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl 554 555 # zeroing t0..n, cur_idx 556 vpxor $t0xmm, $t0xmm, $t0xmm 557 vmovdqa64 $t0, $cur_idx 558___ 559foreach (1..9) { 560 $code.="vmovdqa64 $t0, $t[$_] \n"; 561} 562$code.=<<___; 563 564.align 32 565.Lloop: 566 vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx) 567 vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx) 568___ 569foreach (0..9) { 570 my $mask = $_<5?"%k1":"%k2"; 571$code.=<<___; 572 vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl 573 vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero 574___ 575} 576$code.=<<___; 577 vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx 578 addq \$`2*20*8`, $red_tbl 579 cmpq $red_tbl, %rax 580 jne .Lloop 581___ 582# store t0..n 583foreach (0..9) { 584 $code.="vmovdqu64 $t[$_], `${_}*32`($out) \n"; 585} 586$code.=<<___; 587 ret 588.cfi_endproc 589.size ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5 590___ 591$code.=<<___; 592.section .rodata align=32 593.align 32 594.Lones: 595 .quad 1,1,1,1 596.Lzeros: 597 .quad 0,0,0,0 598___ 599} 600 601if ($win64) { 602$rec="%rcx"; 603$frame="%rdx"; 604$context="%r8"; 605$disp="%r9"; 606 607$code.=<<___; 608.extern __imp_RtlVirtualUnwind 609.type rsaz_def_handler,\@abi-omnipotent 610.align 16 611rsaz_def_handler: 612 push %rsi 613 push %rdi 614 push %rbx 615 push %rbp 616 push %r12 617 push %r13 618 push %r14 619 push %r15 620 pushfq 621 sub \$64,%rsp 622 623 mov 120($context),%rax # pull context->Rax 624 mov 248($context),%rbx # pull context->Rip 625 626 mov 8($disp),%rsi # disp->ImageBase 627 mov 56($disp),%r11 # disp->HandlerData 628 629 mov 0(%r11),%r10d # HandlerData[0] 630 lea (%rsi,%r10),%r10 # prologue label 631 cmp %r10,%rbx # context->Rip<.Lprologue 632 jb .Lcommon_seh_tail 633 634 mov 152($context),%rax # pull context->Rsp 635 636 mov 4(%r11),%r10d # HandlerData[1] 637 lea (%rsi,%r10),%r10 # epilogue label 638 cmp %r10,%rbx # context->Rip>=.Lepilogue 639 jae .Lcommon_seh_tail 640 641 lea 48(%rax),%rax 642 643 mov -8(%rax),%rbx 644 mov -16(%rax),%rbp 645 mov -24(%rax),%r12 646 mov -32(%rax),%r13 647 mov -40(%rax),%r14 648 mov -48(%rax),%r15 649 mov %rbx,144($context) # restore context->Rbx 650 mov %rbp,160($context) # restore context->Rbp 651 mov %r12,216($context) # restore context->R12 652 mov %r13,224($context) # restore context->R13 653 mov %r14,232($context) # restore context->R14 654 mov %r15,240($context) # restore context->R14 655 656.Lcommon_seh_tail: 657 mov 8(%rax),%rdi 658 mov 16(%rax),%rsi 659 mov %rax,152($context) # restore context->Rsp 660 mov %rsi,168($context) # restore context->Rsi 661 mov %rdi,176($context) # restore context->Rdi 662 663 mov 40($disp),%rdi # disp->ContextRecord 664 mov $context,%rsi # context 665 mov \$154,%ecx # sizeof(CONTEXT) 666 .long 0xa548f3fc # cld; rep movsq 667 668 mov $disp,%rsi 669 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 670 mov 8(%rsi),%rdx # arg2, disp->ImageBase 671 mov 0(%rsi),%r8 # arg3, disp->ControlPc 672 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 673 mov 40(%rsi),%r10 # disp->ContextRecord 674 lea 56(%rsi),%r11 # &disp->HandlerData 675 lea 24(%rsi),%r12 # &disp->EstablisherFrame 676 mov %r10,32(%rsp) # arg5 677 mov %r11,40(%rsp) # arg6 678 mov %r12,48(%rsp) # arg7 679 mov %rcx,56(%rsp) # arg8, (NULL) 680 call *__imp_RtlVirtualUnwind(%rip) 681 682 mov \$1,%eax # ExceptionContinueSearch 683 add \$64,%rsp 684 popfq 685 pop %r15 686 pop %r14 687 pop %r13 688 pop %r12 689 pop %rbp 690 pop %rbx 691 pop %rdi 692 pop %rsi 693 ret 694.size rsaz_def_handler,.-rsaz_def_handler 695 696.section .pdata 697.align 4 698 .rva .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256 699 .rva .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256 700 .rva .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256 701 702 .rva .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256 703 .rva .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256 704 .rva .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256 705 706.section .xdata 707.align 8 708.LSEH_info_ossl_rsaz_amm52x20_x1_ifma256: 709 .byte 9,0,0,0 710 .rva rsaz_def_handler 711 .rva .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue 712.LSEH_info_ossl_rsaz_amm52x20_x2_ifma256: 713 .byte 9,0,0,0 714 .rva rsaz_def_handler 715 .rva .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue 716___ 717} 718}}} else {{{ # fallback for old assembler 719$code.=<<___; 720.text 721 722.globl ossl_rsaz_avx512ifma_eligible 723.type ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent 724ossl_rsaz_avx512ifma_eligible: 725 xor %eax,%eax 726 ret 727.size ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible 728 729.globl ossl_rsaz_amm52x20_x1_ifma256 730.globl ossl_rsaz_amm52x20_x2_ifma256 731.globl ossl_extract_multiplier_2x20_win5 732.type ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent 733ossl_rsaz_amm52x20_x1_ifma256: 734ossl_rsaz_amm52x20_x2_ifma256: 735ossl_extract_multiplier_2x20_win5: 736 .byte 0x0f,0x0b # ud2 737 ret 738.size ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256 739___ 740}}} 741 742$code =~ s/\`([^\`]*)\`/eval $1/gem; 743print $code; 744close STDOUT or die "error closing STDOUT: $!"; 745