# Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# The implementation utilizes 256-bit (ymm) registers to avoid frequency
# scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
# |---------+-----------------------+---------------+-------------|
# | rsa4096 | 14 301 430            | 5 813 953     | cycles/sign |
# |         | 90.9                  | 223.6 / +146% | sign/s      |
# |---------+-----------------------+---------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0;   # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions; they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512ifma = ($ver>=10.0001)
    } else {
        $avx512ifma = ($ver>=7.0);
    }
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for a 40-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in the 2^52-radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with the 12 high bits
#   zeroed;
#   |k0| is a Montgomery coefficient, which here is k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction
# step specified in the original algorithm, since according to Lemma 1 from
# paper [2] the result is always < 2*m and can be used as a direct input to
# the next AMM iteration. This post-condition holds provided the correct
# parameter |s| (in the notation of Lemma 1 from [2]) is chosen, i.e.
# s >= n + 2*k, which matches our case: 2080 > 2048 + 2*1.
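#
# For reference, a hedged scalar model of the AMM loop implemented below
# (illustrative pseudocode only, not part of the generated code; R is the
# 40-digit accumulator, MASK52 = 2^52 - 1):
#
#     for (i = 0; i < 40; i++) {
#         R += a * b[i];                // schoolbook multiply-accumulate
#         y  = (R[0] * k0) & MASK52;   // low digit of R * (-1/m)
#         R += m * y;                   // forces R[0] == 0 (mod 2^52)
#         R >>= 52;                     // drop the zeroed low digit
#     }
#     // R < 2*m here, so no conditional subtraction (Lemma 1 of [2])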
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h) = map("%ymm$_",(3..12));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h) = map("%ymm$_",(13..22));

# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h,$T4,$T4h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (23..29)));

sub amm52x40_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13            # b[i]

    vpbroadcastq    %r13, $Bi                   # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                    # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                         # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                           # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                         # acc * k0
    andq    $mask52, %r13                       # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi                   # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                    # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                         # acc += t0
    adcq    %r12, %r10                          # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                         # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
    vpmadd52luq `$_data_offset+64*4`($a), $Bi, $_R4
    vpmadd52luq `$_data_offset+64*4+32`($a), $Bi, $_R4h

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
    vpmadd52luq `$_data_offset+64*4`($m), $Yi, $_R4
    vpmadd52luq `$_data_offset+64*4+32`($m), $Yi, $_R4h
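
    # Descriptive note: the vpmadd52luq instructions above accumulated the
    # low 52 bits of every 104-bit product a[j]*b[i] and m[j]*y[i] into
    # lane j. The matching high 52 bits belong to digit j+1, so they are
    # added with vpmadd52huq only after the one-qword shift below, which
    # re-aligns lane j with digit j+1.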
    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq \$1, $_R0, $_R0h, $_R0
    valignq \$1, $_R0h, $_R1, $_R0h
    valignq \$1, $_R1, $_R1h, $_R1
    valignq \$1, $_R1h, $_R2, $_R1h
    valignq \$1, $_R2, $_R2h, $_R2
    valignq \$1, $_R2h, $_R3, $_R2h
    valignq \$1, $_R3, $_R3h, $_R3
    valignq \$1, $_R3h, $_R4, $_R3h
    valignq \$1, $_R4, $_R4h, $_R4
    valignq \$1, $_R4h, $zero, $_R4h

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc    # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
    vpmadd52huq `$_data_offset+64*4`($a), $Bi, $_R4
    vpmadd52huq `$_data_offset+64*4+32`($a), $Bi, $_R4h

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
    vpmadd52huq `$_data_offset+64*4`($m), $Yi, $_R4
    vpmadd52huq `$_data_offset+64*4+32`($m), $Yi, $_R4h
___
}

# Normalization routine: handles carry bits and brings the bignum qwords
# back to the normalized 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
sub amm52x40_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h) = @_;
$code.=<<___;
    # Put the accumulator into the low qword of R0
    vpbroadcastq    $_acc, $T0
    vpblendd \$3, $T0, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of the bignum
    # Save them to the LSB of the QWs in T0..Tn
    vpsrlq    \$52, $_R0,  $T0
    vpsrlq    \$52, $_R0h, $T0h
    vpsrlq    \$52, $_R1,  $T1
    vpsrlq    \$52, $_R1h, $T1h
    vpsrlq    \$52, $_R2,  $T2
    vpsrlq    \$52, $_R2h, $T2h
    vpsrlq    \$52, $_R3,  $T3
    vpsrlq    \$52, $_R3h, $T3h
    vpsrlq    \$52, $_R4,  $T4
    vpsrlq    \$52, $_R4h, $T4h

    # "Shift left" T0..Tn by 1 QW
    valignq \$3, $T4,  $T4h, $T4h
    valignq \$3, $T3h, $T4,  $T4
    valignq \$3, $T3,  $T3h, $T3h
    valignq \$3, $T2h, $T3,  $T3
    valignq \$3, $T2,  $T2h, $T2h
    valignq \$3, $T1h, $T2,  $T2
    valignq \$3, $T1,  $T1h, $T1h
    valignq \$3, $T0h, $T1,  $T1
    valignq \$3, $T0,  $T0h, $T0h
    valignq \$3, .Lzeros(%rip), $T0, $T0

    # Drop "carries" from the R0..Rn QWs
    vpandq .Lmask52x4(%rip), $_R0,  $_R0
    vpandq .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq .Lmask52x4(%rip), $_R1,  $_R1
    vpandq .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq .Lmask52x4(%rip), $_R2,  $_R2
    vpandq .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq .Lmask52x4(%rip), $_R3,  $_R3
    vpandq .Lmask52x4(%rip), $_R3h, $_R3h
    vpandq .Lmask52x4(%rip), $_R4,  $_R4
    vpandq .Lmask52x4(%rip), $_R4h, $_R4h

    # Sum R0..Rn with the corresponding shifted carries
    vpaddq $T0,  $_R0,  $_R0
    vpaddq $T0h, $_R0h, $_R0h
    vpaddq $T1,  $_R1,  $_R1
    vpaddq $T1h, $_R1h, $_R1h
    vpaddq $T2,  $_R2,  $_R2
    vpaddq $T2h, $_R2h, $_R2h
    vpaddq $T3,  $_R3,  $_R3
    vpaddq $T3h, $_R3h, $_R3h
    vpaddq $T4,  $_R4,  $_R4
    vpaddq $T4h, $_R4h, $_R4h

    # Now handle the carry bits from this addition
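    # Descriptive note on the branch-free carry ripple that follows: let
    # g be the mask of lanes whose digit now exceeds 52 bits ("generate",
    # predicate 6 = nle below) and p the mask of lanes equal to 2^52-1
    # ("propagate", predicate 0 = eq below). The addb/adcb chains compute
    # ((g << 1) + p) ^ p with the CPU carry flag rippling across all 40
    # lanes, leaving set exactly the lanes that must absorb a carry-in;
    # each such lane gets +1 in effect, implemented as vpsubq of 2^52-1
    # with the final vpandq dropping the remaining high bits.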
    # Get mask of QWs whose 52-bit parts overflow
    vpcmpuq   \$6,.Lmask52x4(%rip),${_R0},%k1    # OP=nle (i.e. gt)
    vpcmpuq   \$6,.Lmask52x4(%rip),${_R0h},%k2
    kmovb     %k1,%r14d
    kmovb     %k2,%r13d
    shl       \$4,%r13b
    or        %r13b,%r14b

    vpcmpuq   \$6,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq   \$6,.Lmask52x4(%rip),${_R1h},%k2
    kmovb     %k1,%r13d
    kmovb     %k2,%r12d
    shl       \$4,%r12b
    or        %r12b,%r13b

    vpcmpuq   \$6,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq   \$6,.Lmask52x4(%rip),${_R2h},%k2
    kmovb     %k1,%r12d
    kmovb     %k2,%r11d
    shl       \$4,%r11b
    or        %r11b,%r12b

    vpcmpuq   \$6,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq   \$6,.Lmask52x4(%rip),${_R3h},%k2
    kmovb     %k1,%r11d
    kmovb     %k2,%r10d
    shl       \$4,%r10b
    or        %r10b,%r11b

    vpcmpuq   \$6,.Lmask52x4(%rip),${_R4},%k1
    vpcmpuq   \$6,.Lmask52x4(%rip),${_R4h},%k2
    kmovb     %k1,%r10d
    kmovb     %k2,%r9d
    shl       \$4,%r9b
    or        %r9b,%r10b

    addb      %r14b,%r14b
    adcb      %r13b,%r13b
    adcb      %r12b,%r12b
    adcb      %r11b,%r11b
    adcb      %r10b,%r10b

    # Get mask of QWs whose 52-bit parts saturated
    vpcmpuq   \$0,.Lmask52x4(%rip),${_R0},%k1    # OP=eq
    vpcmpuq   \$0,.Lmask52x4(%rip),${_R0h},%k2
    kmovb     %k1,%r9d
    kmovb     %k2,%r8d
    shl       \$4,%r8b
    or        %r8b,%r9b

    vpcmpuq   \$0,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq   \$0,.Lmask52x4(%rip),${_R1h},%k2
    kmovb     %k1,%r8d
    kmovb     %k2,%edx
    shl       \$4,%dl
    or        %dl,%r8b

    vpcmpuq   \$0,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq   \$0,.Lmask52x4(%rip),${_R2h},%k2
    kmovb     %k1,%edx
    kmovb     %k2,%ecx
    shl       \$4,%cl
    or        %cl,%dl

    vpcmpuq   \$0,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq   \$0,.Lmask52x4(%rip),${_R3h},%k2
    kmovb     %k1,%ecx
    kmovb     %k2,%ebx
    shl       \$4,%bl
    or        %bl,%cl

    vpcmpuq   \$0,.Lmask52x4(%rip),${_R4},%k1
    vpcmpuq   \$0,.Lmask52x4(%rip),${_R4h},%k2
    kmovb     %k1,%ebx
    kmovb     %k2,%eax
    shl       \$4,%al
    or        %al,%bl

    addb      %r9b,%r14b
    adcb      %r8b,%r13b
    adcb      %dl,%r12b
    adcb      %cl,%r11b
    adcb      %bl,%r10b

    xor       %r9b,%r14b
    xor       %r8b,%r13b
    xor       %dl,%r12b
    xor       %cl,%r11b
    xor       %bl,%r10b

    kmovb     %r14d,%k1
    shr       \$4,%r14b
    kmovb     %r14d,%k2
    kmovb     %r13d,%k3
    shr       \$4,%r13b
    kmovb     %r13d,%k4
    kmovb     %r12d,%k5
    shr       \$4,%r12b
    kmovb     %r12d,%k6
    kmovb     %r11d,%k7

    vpsubq .Lmask52x4(%rip), $_R0,  ${_R0}{%k1}
    vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
    vpsubq .Lmask52x4(%rip), $_R1,  ${_R1}{%k3}
    vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
    vpsubq .Lmask52x4(%rip), $_R2,  ${_R2}{%k5}
    vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
    vpsubq .Lmask52x4(%rip), $_R3,  ${_R3}{%k7}

    vpandq .Lmask52x4(%rip), $_R0,  $_R0
    vpandq .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq .Lmask52x4(%rip), $_R1,  $_R1
    vpandq .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq .Lmask52x4(%rip), $_R2,  $_R2
    vpandq .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq .Lmask52x4(%rip), $_R3,  $_R3

    shr       \$4,%r11b
    kmovb     %r11d,%k1
    kmovb     %r10d,%k2
    shr       \$4,%r10b
    kmovb     %r10d,%k3

    vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
    vpsubq .Lmask52x4(%rip), $_R4,  ${_R4}{%k2}
    vpsubq .Lmask52x4(%rip), $_R4h, ${_R4h}{%k3}

    vpandq .Lmask52x4(%rip), $_R3h, $_R3h
    vpandq .Lmask52x4(%rip), $_R4,  $_R4
    vpandq .Lmask52x4(%rip), $_R4h, $_R4h
___
}

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x40_x1_ifma256
.type   ossl_rsaz_amm52x40_x1_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x40_x1_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp                 # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
    vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x40_x1_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h
    vmovdqa64   $zero, $R4_0
    vmovdqa64   $zero, $R4_0h

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    # Loop over 40 digits unrolled by 4
    mov     \$10, $iter

.align 32
.Lloop10:
___
    foreach my $idx (0..3) {
        &amm52x40_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,$k0);
    }
$code.=<<___;
    lea     `4*8`($b_ptr), $b_ptr
    dec     $iter
    jne     .Lloop10
___
    &amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)
    vmovdqu64   $R4_0,  `8*32`($res)
    vmovdqu64   $R4_0h, `9*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea     168(%rsp),%rax
___
$code.=<<___;
    mov     0(%rax),%r15
.cfi_restore    %r15
    mov     8(%rax),%r14
.cfi_restore    %r14
    mov     16(%rax),%r13
.cfi_restore    %r13
    mov     24(%rax),%r12
.cfi_restore    %r12
    mov     32(%rax),%rbp
.cfi_restore    %rbp
    mov     40(%rax),%rbx
.cfi_restore    %rbx
    lea     48(%rax),%rsp       # restore rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x40_x1_ifma256_epilogue:

    ret
.cfi_endproc
.size   ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
___
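
# Callers must supply operands already split into 40 x 52-bit digits, as
# described above.  A hedged illustration of that packing for a 2048-bit
# (32-qword) little-endian input; the helper name and the caller-side
# layout are our assumptions, not part of this file's interface:
#
#     static void to_radix52(BN_ULONG out[40], const uint64_t in[32])
#     {
#         for (int i = 0, bit = 0; i < 40; i++, bit += 52) {
#             int w = bit >> 6, s = bit & 63;
#             uint64_t lo = in[w] >> s;
#             uint64_t hi = (s && w + 1 < 32) ? in[w + 1] << (64 - s) : 0;
#             out[i] = (lo | hi) & ((1ULL << 52) - 1);
#         }
#     }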
$code.=<<___;
.section .rodata align=32
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for 40-digit numbers in radix 2^52
#
# See the description of ossl_rsaz_amm52x40_x1_ifma256() above for details
# of the Almost Montgomery Multiplication algorithm and of the function
# input parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
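#
# A hedged model of the semantics (the two digit loops are interleaved in a
# single pass over |b|, presumably to keep more independent IFMA work in
# flight):
#
#     ossl_rsaz_amm52x40_x1_ifma256(out[0], a[0], b[0], m[0], k0[0]);
#     ossl_rsaz_amm52x40_x1_ifma256(out[1], a[1], b[1], m[1], k0[1]);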
#
# void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
#                                    const BN_ULONG a[2][40],
#                                    const BN_ULONG b[2][40],
#                                    const BN_ULONG m[2][40],
#                                    const BN_ULONG k0[2]);
###############################################################################

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x40_x2_ifma256
.type   ossl_rsaz_amm52x40_x2_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x40_x2_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp
    vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x40_x2_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h
    vmovdqa64   $zero, $R4_0
    vmovdqa64   $zero, $R4_0h

    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1
    vmovdqa64   $zero, $R2_1h
    vmovdqa64   $zero, $R3_1
    vmovdqa64   $zero, $R3_1h
    vmovdqa64   $zero, $R4_1
    vmovdqa64   $zero, $R4_1h


    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    mov     \$40, $iter

.align 32
.Lloop40:
___
    &amm52x40_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,"($k0)");
    # 40*8 = offset of the second operand in the two-dimensional array
    &amm52x40_x1(40*8,40*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h,"8($k0)");
$code.=<<___;
    lea     8($b_ptr), $b_ptr
    dec     $iter
    jne     .Lloop40
___
    &amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
    &amm52x40_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)
    vmovdqu64   $R4_0,  `8*32`($res)
    vmovdqu64   $R4_0h, `9*32`($res)

    vmovdqu64   $R0_1,  `10*32`($res)
    vmovdqu64   $R0_1h, `11*32`($res)
    vmovdqu64   $R1_1,  `12*32`($res)
    vmovdqu64   $R1_1h, `13*32`($res)
    vmovdqu64   $R2_1,  `14*32`($res)
    vmovdqu64   $R2_1h, `15*32`($res)
    vmovdqu64   $R3_1,  `16*32`($res)
    vmovdqu64   $R3_1h, `17*32`($res)
    vmovdqu64   $R4_1,  `18*32`($res)
    vmovdqu64   $R4_1h, `19*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea     168(%rsp),%rax
___
$code.=<<___;
    mov     0(%rax),%r15
.cfi_restore    %r15
    mov     8(%rax),%r14
.cfi_restore    %r14
    mov     16(%rax),%r13
.cfi_restore    %r13
    mov     24(%rax),%r12
.cfi_restore    %r12
    mov     32(%rax),%rbp
.cfi_restore    %rbp
    mov     40(%rax),%rbx
.cfi_restore    %rbx
    lea     48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x40_x2_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x40_x2_ifma256, .-ossl_rsaz_amm52x40_x2_ifma256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base
# values.  |red_table_idx1| and |red_table_idx2| are the corresponding power
# indexes.
#
# The extracted value (output) is two 40-digit numbers in radix 2^52.
#
# void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
#                                        int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
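
# A hedged scalar model of the scan below: every table entry is read and the
# wanted one is selected branch-free, so the memory access pattern does not
# depend on the secret index (bounds follow the prototype above; shown for
# the first of the two interleaved numbers):
#
#     for (cur = 0; cur < (1 << 5); cur++)       // vpcmpq builds the mask
#         for (j = 0; j < 40; j++)               // vpblendmq selects
#             red_Y[j] = (cur == red_table_idx1) ? red_table[cur][0][j]
#                                                : red_Y[j];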
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                                        ("%rdi","%rsi","%rdx","%rcx");  # Unix order

my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));

my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;

sub get_table_value_consttime() {
my ($_idx,$_offset) = @_;
$code.=<<___;
    vpxorq   $cur_idx, $cur_idx, $cur_idx
.align 32
.Lloop_$_offset:
    vpcmpq  \$0, $cur_idx, $_idx, %k1           # mask of (idx == cur_idx)
___
foreach (0..9) {
$code.=<<___;
    vmovdqu64  `$_offset+${_}*32`($red_tbl), $tmp   # load data from red_tbl
    vpblendmq  $tmp, $t[$_], ${t[$_]}{%k1}          # extract data when mask is not zero
___
}
$code.=<<___;
    vpaddq  $ones, $cur_idx, $cur_idx           # increment cur_idx
    addq    \$`2*40*8`, $red_tbl
    cmpq    $red_tbl, %rax
    jne .Lloop_$_offset
___
}

$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x40_win5
.type   ossl_extract_multiplier_2x40_win5,\@abi-omnipotent
ossl_extract_multiplier_2x40_win5:
.cfi_startproc
    endbranch
    vmovdqa64   .Lones(%rip), $ones             # broadcast ones
    vpbroadcastq    $red_tbl_idx1, $idx1
    vpbroadcastq    $red_tbl_idx2, $idx2
    leaq    `(1<<5)*2*40*8`($red_tbl), %rax     # holds end of the tbl

    # backup red_tbl address
    movq    $red_tbl, %r10

    # zeroing t0..n, cur_idx
    vpxor   $t0xmm, $t0xmm, $t0xmm
___
foreach (1..9) {
    $code.="vmovdqa64   $t0, $t[$_] \n";
}

&get_table_value_consttime($idx1, 0);
foreach (0..9) {
    $code.="vmovdqu64   $t[$_], `(0+$_)*32`($out) \n";
}
$code.="movq    %r10, $red_tbl \n";
&get_table_value_consttime($idx2, 40*8);
foreach (0..9) {
    $code.="vmovdqu64   $t[$_], `(10+$_)*32`($out) \n";
}
$code.=<<___;

    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x40_win5, .-ossl_extract_multiplier_2x40_win5
___
$code.=<<___;
.section .rodata align=32
.align 32
.Lones:
    .quad   1,1,1,1
.Lzeros:
    .quad   0,0,0,0
___
}

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern     __imp_RtlVirtualUnwind
.type   rsaz_avx_handler,\@abi-omnipotent
.align  16
rsaz_avx_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax      # pull context->Rax
    mov     248($context),%rbx      # pull context->Rip

    mov     8($disp),%rsi           # disp->ImageBase
    mov     56($disp),%r11          # disp->HandlerData

    mov     0(%r11),%r10d           # HandlerData[0]
    lea     (%rsi,%r10),%r10        # prologue label
    cmp     %r10,%rbx               # context->Rip<.Lprologue
    jb      .Lcommon_seh_tail

    mov     4(%r11),%r10d           # HandlerData[1]
    lea     (%rsi,%r10),%r10        # epilogue label
    cmp     %r10,%rbx               # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    mov     152($context),%rax      # pull context->Rsp

    lea     (%rax),%rsi             # %xmm save area
    lea     512($context),%rdi      # & context.Xmm6
    mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
    .long   0xa548f3fc              # cld; rep movsq

    lea     `48+168`(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context)      # restore context->Rbx
    mov     %rbp,160($context)      # restore context->Rbp
    mov     %r12,216($context)      # restore context->R12
    mov     %r13,224($context)      # restore context->R13
    mov     %r14,232($context)      # restore context->R14
    mov     %r15,240($context)      # restore context->R15

.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context)      # restore context->Rsp
    mov     %rsi,168($context)      # restore context->Rsi
    mov     %rdi,176($context)      # restore context->Rdi

    mov     40($disp),%rdi          # disp->ContextRecord
    mov     $context,%rsi           # context
    mov     \$154,%ecx              # sizeof(CONTEXT)
    .long   0xa548f3fc              # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx            # arg2, disp->ImageBase
    mov     0(%rsi),%r8             # arg3, disp->ControlPc
    mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10           # disp->ContextRecord
    lea     56(%rsi),%r11           # &disp->HandlerData
    lea     24(%rsi),%r12           # &disp->EstablisherFrame
    mov     %r10,32(%rsp)           # arg5
    mov     %r11,40(%rsp)           # arg6
    mov     %r12,48(%rsp)           # arg7
    mov     %rcx,56(%rsp)           # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax                # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_avx_handler,.-rsaz_avx_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x40_x1_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x40_x1_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x40_x1_ifma256

    .rva    .LSEH_begin_ossl_rsaz_amm52x40_x2_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x40_x2_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x40_x2_ifma256

.section    .xdata
.align  8
.LSEH_info_ossl_rsaz_amm52x40_x1_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x40_x1_ifma256_body,.Lossl_rsaz_amm52x40_x1_ifma256_epilogue
.LSEH_info_ossl_rsaz_amm52x40_x2_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x40_x2_ifma256_body,.Lossl_rsaz_amm52x40_x2_ifma256_epilogue
___
}
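
# When the assembler probes at the top of this file found no AVX512IFMA
# support, the fallback branch below still defines the three exported
# symbols so that linking succeeds; the stubs trap (ud2) if ever reached,
# and run-time capability dispatch is expected to keep them unreachable.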
}}} else {{{    # fallback for old assembler
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x40_x1_ifma256
.globl  ossl_rsaz_amm52x40_x2_ifma256
.globl  ossl_extract_multiplier_2x40_win5
.type   ossl_rsaz_amm52x40_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x40_x1_ifma256:
ossl_rsaz_amm52x40_x2_ifma256:
ossl_extract_multiplier_2x40_win5:
    .byte   0x0f,0x0b           # ud2
    ret
.size   ossl_rsaz_amm52x40_x1_ifma256, .-ossl_rsaz_amm52x40_x1_ifma256
___
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";