# Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
# |         | 203.2                 | 453.5 / +123% | sign/s      |
# |---------+-----------------------+---------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# A Windows target is recognized either from an explicit nasm/masm/mingw64
# flavour or from an .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

# Locate the perlasm translator next to this script or in the shared
# perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe whether the assembler can encode AVX-512 IFMA instructions.
# First check: GNU assembler version 2.26 or newer.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

# Second check (Windows): nasm 2.11.8+ or 2.12+.
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

# Third check: LLVM/clang 7.0+ (Apple clang uses its own version series;
# Apple clang 10.0.1 corresponds to upstream clang 7.0.0).
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512ifma = ($ver>=10.0001)
    } else {
        $avx512ifma = ($ver>=7.0);
    }
}

# All generated code is piped through the translator, which converts the
# perlasm dialect to the requested assembler flavour.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Real IFMA code is emitted only when the assembler supports it; otherwise
# a ud2 stub is emitted by the else-branch at the bottom of the file.
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
#
# NOTE: the function uses zero-padded data - 2 high QWs is a padding.
#
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform "conditional" subtraction step
# specified in the original algorithm as according to the Lemma 1 from the paper
# [2], the result will be always < 2*m and can be used as a direct input to
# the next AMM iteration. This post-condition is true, provided the correct
# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k,
# which matches our case: 1560 > 1536 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

# Scalar registers: 52-bit mask, the low-qword accumulators (acc0_0 for the
# first AMM, acc0_1 for the second one in the _x2 variant) and a copy of the
# |b| pointer ($b enters in %rdx, which mulx clobbers).
my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";      # loop counter

# Vector registers: $Bi/$Yi hold broadcast b[i]/y[i]; each AMM instance keeps
# its 32-qword accumulator in a bank of eight ymm registers (R0..R3 + "h" halves).
my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));

# Registers mapping for normalization
# ($zero, $Bi and $Yi are dead by normalization time and are reused as T0..T2)
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));

# Emit the text of one AMM iteration (consuming one qword digit of |b|):
# accumulate a*b[i] and y[i]*m into the R-register bank and fold the freed
# low qword into the scalar accumulator |_acc|.
sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
# xmm alias of R0, needed for the scalar vmovq extraction below
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13            # b[i]

    vpbroadcastq    %r13, $Bi                   # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                    # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                         # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                           # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                         # acc * k0
    andq    $mask52, %r13                       # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi                   # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                    # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                         # acc += t0
    adcq    %r12, %r10                          # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                         # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq \$1, $_R0, $_R0h, $_R0
    valignq \$1, $_R0h, $_R1, $_R0h
    valignq \$1, $_R1, $_R1h, $_R1
    valignq \$1, $_R1h, $_R2, $_R1h
    valignq \$1, $_R2, $_R2h, $_R2
    valignq \$1, $_R2h, $_R3, $_R2h
    valignq \$1, $_R3, $_R3h, $_R3
    valignq \$1, $_R3h, $zero, $_R3h

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc    # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}

# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $T0
    vpblendd \$3, $T0, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of the bignum
    # Save them to LSB of QWs in T0..Tn
    vpsrlq    \$52, $_R0, $T0
    vpsrlq    \$52, $_R0h, $T0h
    vpsrlq    \$52, $_R1, $T1
    vpsrlq    \$52, $_R1h, $T1h
    vpsrlq    \$52, $_R2, $T2
    vpsrlq    \$52, $_R2h, $T2h
    vpsrlq    \$52, $_R3, $T3
    vpsrlq    \$52, $_R3h, $T3h

    # "Shift left" T0..Tn by 1 QW
    valignq \$3, $T3, $T3h, $T3h
    valignq \$3, $T2h, $T3, $T3
    valignq \$3, $T2, $T2h, $T2h
    valignq \$3, $T1h, $T2, $T2
    valignq \$3, $T1, $T1h, $T1h
    valignq \$3, $T0h, $T1, $T1
    valignq \$3, $T0, $T0h, $T0h
    valignq \$3, .Lzeros(%rip), $T0, $T0

    # Drop "carries" from R0..Rn QWs
    vpandq    .Lmask52x4(%rip), $_R0, $_R0
    vpandq    .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq    .Lmask52x4(%rip), $_R1, $_R1
    vpandq    .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq    .Lmask52x4(%rip), $_R2, $_R2
    vpandq    .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq    .Lmask52x4(%rip), $_R3, $_R3
    vpandq    .Lmask52x4(%rip), $_R3h, $_R3h

    # Sum R0..Rn with corresponding adjusted carries
    vpaddq    $T0, $_R0, $_R0
    vpaddq    $T0h, $_R0h, $_R0h
    vpaddq    $T1, $_R1, $_R1
    vpaddq    $T1h, $_R1h, $_R1h
    vpaddq    $T2, $_R2, $_R2
    vpaddq    $T2h, $_R2h, $_R2h
    vpaddq    $T3, $_R3, $_R3
    vpaddq    $T3h, $_R3h, $_R3h

    # Now handle carry bits from this addition
    # Get mask of QWs whose 52-bit parts overflow
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0h},%k2
    kmovb    %k1,%r14d
    kmovb    %k2,%r13d
    shl    \$4,%r13b
    or    %r13b,%r14b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1h},%k2
    kmovb    %k1,%r13d
    kmovb    %k2,%r12d
    shl    \$4,%r12b
    or    %r12b,%r13b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2h},%k2
    kmovb    %k1,%r12d
    kmovb    %k2,%r11d
    shl    \$4,%r11b
    or    %r11b,%r12b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3h},%k2
    kmovb    %k1,%r11d
    kmovb    %k2,%r10d
    shl    \$4,%r10b
    or    %r10b,%r11b

    addb    %r14b,%r14b
    adcb    %r13b,%r13b
    adcb    %r12b,%r12b
    adcb    %r11b,%r11b

    # Get mask of QWs whose 52-bit parts saturated
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0h},%k2
    kmovb    %k1,%r9d
    kmovb    %k2,%r8d
    shl    \$4,%r8b
    or    %r8b,%r9b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1h},%k2
    kmovb    %k1,%r8d
    kmovb    %k2,%edx
    shl    \$4,%dl
    or    %dl,%r8b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2h},%k2
    kmovb    %k1,%edx
    kmovb    %k2,%ecx
    shl    \$4,%cl
    or    %cl,%dl

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3h},%k2
    kmovb    %k1,%ecx
    kmovb    %k2,%ebx
    shl    \$4,%bl
    or    %bl,%cl

    addb    %r9b,%r14b
    adcb    %r8b,%r13b
    adcb    %dl,%r12b
    adcb    %cl,%r11b

    xor    %r9b,%r14b
    xor    %r8b,%r13b
    xor    %dl,%r12b
    xor    %cl,%r11b

    kmovb    %r14d,%k1
    shr    \$4,%r14b
    kmovb    %r14d,%k2
    kmovb    %r13d,%k3
    shr    \$4,%r13b
    kmovb    %r13d,%k4
    kmovb    %r12d,%k5
    shr    \$4,%r12b
    kmovb    %r12d,%k6
    kmovb    %r11d,%k7

    vpsubq    .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
    vpsubq    .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
    vpsubq    .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
    vpsubq    .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
    vpsubq    .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
    vpsubq    .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
    vpsubq    .Lmask52x4(%rip), $_R3, ${_R3}{%k7}

    vpandq    .Lmask52x4(%rip), $_R0, $_R0
    vpandq    .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq    .Lmask52x4(%rip), $_R1, $_R1
    vpandq    .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq    .Lmask52x4(%rip), $_R2, $_R2
    vpandq    .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq    .Lmask52x4(%rip), $_R3, $_R3

    shr    \$4,%r11b
    kmovb    %r11d,%k1

    vpsubq    .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}

    vpandq    .Lmask52x4(%rip), $_R3h, $_R3h
___
}

# Emit ossl_rsaz_amm52x30_x1_ifma256: prologue, 30 AMM iterations
# (7 loop passes unrolled by 4, plus 2 trailing iterations),
# normalization, store of the 32-qword result, and epilogue.
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.type   ossl_rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x1_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push    %rbx
    push    %rbp
.cfi_push    %rbp
    push    %r12
.cfi_push    %r12
    push    %r13
.cfi_push    %r13
    push    %r14
.cfi_push    %r14
    push    %r15
.cfi_push    %r15
___
$code.=<<___ if ($win64);
    lea    -168(%rsp),%rsp                  # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
    vmovdqa64    %xmm6, `0*16`(%rsp)        # save non-volatile registers
    vmovdqa64    %xmm7, `1*16`(%rsp)
    vmovdqa64    %xmm8, `2*16`(%rsp)
    vmovdqa64    %xmm9, `3*16`(%rsp)
    vmovdqa64    %xmm10,`4*16`(%rsp)
    vmovdqa64    %xmm11,`5*16`(%rsp)
    vmovdqa64    %xmm12,`6*16`(%rsp)
    vmovdqa64    %xmm13,`7*16`(%rsp)
    vmovdqa64    %xmm14,`8*16`(%rsp)
    vmovdqa64    %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord    $zero, $zero, $zero
    vmovdqa64    $zero, $R0_0
    vmovdqa64    $zero, $R0_0h
    vmovdqa64    $zero, $R1_0
    vmovdqa64    $zero, $R1_0h
    vmovdqa64    $zero, $R2_0
    vmovdqa64    $zero, $R2_0h
    vmovdqa64    $zero, $R3_0
    vmovdqa64    $zero, $R3_0h

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    # Loop over 30 digits unrolled by 4
    mov    \$7, $iter

.align 32
.Lloop7:
___
    foreach my $idx (0..3) {
        &amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
    }
$code.=<<___;
    lea    `4*8`($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop7
___
    # 7*4 = 28 digits processed above; two trailing iterations make it 30.
    &amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
    &amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);

    &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;

    vmovdqu64    $R0_0, `0*32`($res)
    vmovdqu64    $R0_0h, `1*32`($res)
    vmovdqu64    $R1_0, `2*32`($res)
    vmovdqu64    $R1_0h, `3*32`($res)
    vmovdqu64    $R2_0, `4*32`($res)
    vmovdqu64    $R2_0h, `5*32`($res)
    vmovdqu64    $R3_0, `6*32`($res)
    vmovdqu64    $R3_0h, `7*32`($res)

    vzeroupper
    lea    (%rsp),%rax
.cfi_def_cfa_register    %rax
___
$code.=<<___ if ($win64);
    vmovdqa64    `0*16`(%rax),%xmm6
    vmovdqa64    `1*16`(%rax),%xmm7
    vmovdqa64    `2*16`(%rax),%xmm8
    vmovdqa64    `3*16`(%rax),%xmm9
    vmovdqa64    `4*16`(%rax),%xmm10
    vmovdqa64    `5*16`(%rax),%xmm11
    vmovdqa64    `6*16`(%rax),%xmm12
    vmovdqa64    `7*16`(%rax),%xmm13
    vmovdqa64    `8*16`(%rax),%xmm14
    vmovdqa64    `9*16`(%rax),%xmm15
    lea    168(%rsp),%rax
___
$code.=<<___;
    mov    0(%rax),%r15
.cfi_restore    %r15
    mov    8(%rax),%r14
.cfi_restore    %r14
    mov    16(%rax),%r13
.cfi_restore    %r13
    mov    24(%rax),%r12
.cfi_restore    %r12
    mov    32(%rax),%rbp
.cfi_restore    %rbp
    mov    40(%rax),%rbx
.cfi_restore    %rbx
    lea    48(%rax),%rsp       # restore rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___

# Four copies of the 52-bit mask, used by the normalization routine.
$code.=<<___;
.section .rodata align=32
.align 32
.Lmask52x4:
    .quad    0xfffffffffffff
    .quad    0xfffffffffffff
    .quad    0xfffffffffffff
    .quad    0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52
#
# See description of ossl_rsaz_amm52x30_x1_ifma256() above for details about Almost
# Montgomery Multiplication algorithm and function input parameters description.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# NOTE: the function uses zero-padded data - 2 high QWs is a padding.
#
# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
#                                    const BN_ULONG a[2][32],
#                                    const BN_ULONG b[2][32],
#                                    const BN_ULONG m[2][32],
#                                    const BN_ULONG k0[2]);
###############################################################################

# Emit ossl_rsaz_amm52x30_x2_ifma256: same structure as the _x1 variant, but
# interleaves two independent AMMs (separate accumulator banks and k0 values)
# in a single 30-iteration loop.
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x2_ifma256
.type   ossl_rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x2_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push    %rbx
    push    %rbp
.cfi_push    %rbp
    push    %r12
.cfi_push    %r12
    push    %r13
.cfi_push    %r13
    push    %r14
.cfi_push    %r14
    push    %r15
.cfi_push    %r15
___
$code.=<<___ if ($win64);
    lea    -168(%rsp),%rsp
    vmovdqa64    %xmm6, `0*16`(%rsp)        # save non-volatile registers
    vmovdqa64    %xmm7, `1*16`(%rsp)
    vmovdqa64    %xmm8, `2*16`(%rsp)
    vmovdqa64    %xmm9, `3*16`(%rsp)
    vmovdqa64    %xmm10,`4*16`(%rsp)
    vmovdqa64    %xmm11,`5*16`(%rsp)
    vmovdqa64    %xmm12,`6*16`(%rsp)
    vmovdqa64    %xmm13,`7*16`(%rsp)
    vmovdqa64    %xmm14,`8*16`(%rsp)
    vmovdqa64    %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord    $zero, $zero, $zero
    vmovdqa64    $zero, $R0_0
    vmovdqa64    $zero, $R0_0h
    vmovdqa64    $zero, $R1_0
    vmovdqa64    $zero, $R1_0h
    vmovdqa64    $zero, $R2_0
    vmovdqa64    $zero, $R2_0h
    vmovdqa64    $zero, $R3_0
    vmovdqa64    $zero, $R3_0h

    vmovdqa64    $zero, $R0_1
    vmovdqa64    $zero, $R0_1h
    vmovdqa64    $zero, $R1_1
    vmovdqa64    $zero, $R1_1h
    vmovdqa64    $zero, $R2_1
    vmovdqa64    $zero, $R2_1h
    vmovdqa64    $zero, $R3_1
    vmovdqa64    $zero, $R3_1h


    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    mov    \$30, $iter

.align 32
.Lloop30:
___
    &amm52x30_x1( 0,  0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
    # 32*8 = offset of the next dimension in two-dimension array
    &amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
    lea    8($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop30
___
    &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
    &amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;

    vmovdqu64    $R0_0, `0*32`($res)
    vmovdqu64    $R0_0h, `1*32`($res)
    vmovdqu64    $R1_0, `2*32`($res)
    vmovdqu64    $R1_0h, `3*32`($res)
    vmovdqu64    $R2_0, `4*32`($res)
    vmovdqu64    $R2_0h, `5*32`($res)
    vmovdqu64    $R3_0, `6*32`($res)
    vmovdqu64    $R3_0h, `7*32`($res)

    vmovdqu64    $R0_1, `8*32`($res)
    vmovdqu64    $R0_1h, `9*32`($res)
    vmovdqu64    $R1_1, `10*32`($res)
    vmovdqu64    $R1_1h, `11*32`($res)
    vmovdqu64    $R2_1, `12*32`($res)
    vmovdqu64    $R2_1h, `13*32`($res)
    vmovdqu64    $R3_1, `14*32`($res)
    vmovdqu64    $R3_1h, `15*32`($res)

    vzeroupper
    lea    (%rsp),%rax
.cfi_def_cfa_register    %rax
___
$code.=<<___ if ($win64);
    vmovdqa64    `0*16`(%rax),%xmm6
    vmovdqa64    `1*16`(%rax),%xmm7
    vmovdqa64    `2*16`(%rax),%xmm8
    vmovdqa64    `3*16`(%rax),%xmm9
    vmovdqa64    `4*16`(%rax),%xmm10
    vmovdqa64    `5*16`(%rax),%xmm11
    vmovdqa64    `6*16`(%rax),%xmm12
    vmovdqa64    `7*16`(%rax),%xmm13
    vmovdqa64    `8*16`(%rax),%xmm14
    vmovdqa64    `9*16`(%rax),%xmm15
    lea    168(%rsp),%rax
___
$code.=<<___;
    mov    0(%rax),%r15
.cfi_restore    %r15
    mov    8(%rax),%r14
.cfi_restore    %r14
    mov    16(%rax),%r13
.cfi_restore    %r13
    mov    24(%rax),%r12
.cfi_restore    %r12
    mov    32(%rax),%rbp
.cfi_restore    %rbp
    mov    40(%rax),%rbx
.cfi_restore    %rbx
    lea    48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
#
# Extracted value (output) is 2 (30 + 2) digits numbers in 2^52 radix.
# (2 high QW is zero padding)
#
# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
#                                        int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                                        ("%rdi","%rsi","%rdx","%rcx");  # Unix order

# 16 ymm accumulators (2*32 qwords = one table entry), ymm6..ymm15 are
# deliberately skipped so no Win64 non-volatile xmm register is touched.
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));

my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
# xmm alias of t0 for the cheap zeroing idiom below
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;

# The routine scans the WHOLE table regardless of the requested indexes and
# merges each entry under an equality mask, so memory access pattern does not
# depend on the (secret) indexes - hence "constant time extraction".
$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_extract_multiplier_2x30_win5,\@abi-omnipotent
ossl_extract_multiplier_2x30_win5:
.cfi_startproc
    endbranch
    vmovdqa64    .Lones(%rip), $ones         # broadcast ones
    vpbroadcastq    $red_tbl_idx1, $idx1
    vpbroadcastq    $red_tbl_idx2, $idx2
    leaq    `(1<<5)*2*32*8`($red_tbl), %rax  # holds end of the tbl

    # zeroing t0..n, cur_idx
    vpxor    $t0xmm, $t0xmm, $t0xmm
    vmovdqa64    $t0, $cur_idx
___
foreach (1..15) {
    $code.="vmovdqa64    $t0, $t[$_] \n";
}
$code.=<<___;

.align 32
.Lloop:
    vpcmpq    \$0, $cur_idx, $idx1, %k1      # mask of (idx1 == cur_idx)
    vpcmpq    \$0, $cur_idx, $idx2, %k2      # mask of (idx2 == cur_idx)
___
# First half of the entry (t0..t7) is selected by idx1, second half
# (t8..t15) by idx2 - the table holds two independent bases per entry.
foreach (0..15) {
    my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
    vmovdqu64    `${_}*32`($red_tbl), $tmp             # load data from red_tbl
    vpblendmq    $tmp, $t[$_], ${t[$_]}{$mask}         # extract data when mask is not zero
___
}
$code.=<<___;
    vpaddq    $ones, $cur_idx, $cur_idx      # increment cur_idx
    addq    \$`2*32*8`, $red_tbl
    cmpq    $red_tbl, %rax
    jne    .Lloop
___
# store t0..n
foreach (0..15) {
    $code.="vmovdqu64    $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;

    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
___
# Constants: all-ones for the index increment, all-zeros shared with
# the normalization routine's "shift in zero" valignq.
$code.=<<___;
.section .rodata align=32
.align 32
.Lones:
    .quad    1,1,1,1
.Lzeros:
    .quad    0,0,0,0
___
}

# Win64 structured exception handling: unwinds the stack frame laid out by
# the amm52x30 prologues (6 pushed GPRs + 168-byte xmm save area).
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern  __imp_RtlVirtualUnwind
.type    rsaz_avx_handler,\@abi-omnipotent
.align   16
rsaz_avx_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub    \$64,%rsp

    mov    120($context),%rax    # pull context->Rax
    mov    248($context),%rbx    # pull context->Rip

    mov    8($disp),%rsi         # disp->ImageBase
    mov    56($disp),%r11        # disp->HandlerData

    mov    0(%r11),%r10d         # HandlerData[0]
    lea    (%rsi,%r10),%r10      # prologue label
    cmp    %r10,%rbx             # context->Rip<.Lprologue
    jb    .Lcommon_seh_tail

    mov    4(%r11),%r10d         # HandlerData[1]
    lea    (%rsi,%r10),%r10      # epilogue label
    cmp    %r10,%rbx             # context->Rip>=.Lepilogue
    jae    .Lcommon_seh_tail

    mov    152($context),%rax    # pull context->Rsp

    lea    (%rax),%rsi           # %xmm save area
    lea    512($context),%rdi    # & context.Xmm6
    mov    \$20,%ecx             # 10*sizeof(%xmm0)/sizeof(%rax)
    .long    0xa548f3fc          # cld; rep movsq

    lea    `48+168`(%rax),%rax

    mov    -8(%rax),%rbx
    mov    -16(%rax),%rbp
    mov    -24(%rax),%r12
    mov    -32(%rax),%r13
    mov    -40(%rax),%r14
    mov    -48(%rax),%r15
    mov    %rbx,144($context)    # restore context->Rbx
    mov    %rbp,160($context)    # restore context->Rbp
    mov    %r12,216($context)    # restore context->R12
    mov    %r13,224($context)    # restore context->R13
    mov    %r14,232($context)    # restore context->R14
    mov    %r15,240($context)    # restore context->R15

.Lcommon_seh_tail:
    mov    8(%rax),%rdi
    mov    16(%rax),%rsi
    mov    %rax,152($context)    # restore context->Rsp
    mov    %rsi,168($context)    # restore context->Rsi
    mov    %rdi,176($context)    # restore context->Rdi

    mov    40($disp),%rdi        # disp->ContextRecord
    mov    $context,%rsi         # context
    mov    \$154,%ecx            # sizeof(CONTEXT)
    .long    0xa548f3fc          # cld; rep movsq

    mov    $disp,%rsi
    xor    %rcx,%rcx             # arg1, UNW_FLAG_NHANDLER
    mov    8(%rsi),%rdx          # arg2, disp->ImageBase
    mov    0(%rsi),%r8           # arg3, disp->ControlPc
    mov    16(%rsi),%r9          # arg4, disp->FunctionEntry
    mov    40(%rsi),%r10         # disp->ContextRecord
    lea    56(%rsi),%r11         # &disp->HandlerData
    lea    24(%rsi),%r12         # &disp->EstablisherFrame
    mov    %r10,32(%rsp)         # arg5
    mov    %r11,40(%rsp)         # arg6
    mov    %r12,48(%rsp)         # arg7
    mov    %rcx,56(%rsp)         # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov    \$1,%eax              # ExceptionContinueSearch
    add    \$64,%rsp
    popfq
    pop    %r15
    pop    %r14
    pop    %r13
    pop    %r12
    pop    %rbp
    pop    %rbx
    pop    %rdi
    pop    %rsi
    ret
.size    rsaz_avx_handler,.-rsaz_avx_handler

.section    .pdata
.align    4
    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256

    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256

.section    .xdata
.align    8
.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256:
    .byte    9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue
.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256:
    .byte    9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue
___
}
}}} else {{{ # fallback for old assembler
# Assembler cannot encode AVX-512 IFMA: emit ud2 stubs under all three public
# names so a mislinked call traps immediately instead of executing garbage.
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.globl  ossl_rsaz_amm52x30_x2_ifma256
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x30_x1_ifma256:
ossl_rsaz_amm52x30_x2_ifma256:
ossl_extract_multiplier_2x30_win5:
    .byte    0x0f,0x0b    # ud2
    ret
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
}}}

# Expand `...` arithmetic in the generated text, send it through the
# translator pipe and make sure the pipe drained successfully.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";