# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
# |         | 203.2                 | 453.5 / +123% | sign/s      |
# |---------+-----------------------+---------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple clang uses a different version numbering scheme, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512ifma = ($ver>=10.0001)
    } else {
        $avx512ifma = ($ver>=7.0);
    }
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for a 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The inputs and output are represented in the radix-2^52 domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords, each with the 12 high
#   bits zeroed.
#
#   NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
#   |k0| is the Montgomery coefficient, here k0 = -1/m mod 2^64.
#
# NB: the AMM implementation does not perform the "conditional" subtraction
# step specified in the original algorithm: according to Lemma 1 of [2], the
# result is always < 2*m and can therefore be used directly as input to the
# next AMM iteration.  This post-condition holds provided the parameter |s|
# (in the notation of Lemma 1 from [2]) is chosen such that s >= n + 2*k,
# which matches our case: 1560 > 1536 + 2*1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
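
# For exposition only: a word-level reference model of the AMM recurrence
# that the vectorized code below computes with IFMA52 multiply-add
# instructions.  This is a sketch under stated assumptions - it is defined
# here but never called by the generator, the name is ours (not part of any
# OpenSSL API), and $A, $B, $M are assumed to be Math::BigInt objects with
# $k0 a plain scalar and $n the digit count (30 here).
sub _amm_reference_model_unused {
    my ($A, $B, $M, $k0, $n) = @_;
    require Math::BigInt;
    my $mask52 = Math::BigInt->new(1)->blsft(52)->bsub(1);
    my $X = Math::BigInt->bzero();
    for my $i (0 .. $n-1) {
        my $bi = $B->copy()->brsft(52*$i)->band($mask52); # i-th 52-bit digit of b
        $X->badd($A->copy()->bmul($bi));                  # X += a * b[i]
        # k0 = -1/m mod 2^64 also satisfies k0 = -1/m mod 2^52, so this
        # choice of y makes the low 52 bits of X + y*m vanish
        my $y = $X->copy()->bmul($k0)->band($mask52);
        $X->badd($M->copy()->bmul($y));                   # X += y * m
        $X->brsft(52);                                    # retire the low digit
    }
    return $X;  # X == a*b*2^(-52*n) mod m and X < 2*m (no final subtraction)
}
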
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));

# Register mapping for the normalization step
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));

sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of the data for the corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13             # b[i]

    vpbroadcastq    %r13, $Bi                    # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                     # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                          # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                            # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                          # acc * k0
    andq    $mask52, %r13                        # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi                    # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                     # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                          # acc += t0
    adcq    %r12, %r10                           # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                          # acc = ((acc >> 52) | (t2 << 12))
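    # %r10 accumulates the high 64 bits of both 128-bit products; (acc >> 52)
    # occupies bits 0..11 and (t2 << 12) bits 12 and up, so the OR is an
    # exact add: acc becomes floor((acc + t2 * 2^64) / 2^52), retiring the
    # fully reduced low digit.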

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq     \$1, $_R0, $_R0h, $_R0
    valignq     \$1, $_R0h, $_R1, $_R0h
    valignq     \$1, $_R1, $_R1h, $_R1
    valignq     \$1, $_R1h, $_R2, $_R1h
    valignq     \$1, $_R2, $_R2h, $_R2
    valignq     \$1, $_R2h, $_R3, $_R2h
    valignq     \$1, $_R3, $_R3h, $_R3
    valignq     \$1, $_R3h, $zero, $_R3h

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc    # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}

# Normalization routine: propagates the carry bits and brings the bignum
# qwords to the normalized radix-2^52 representation.
#
# Uses %r8-14,%e[abcd]x
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $T0
    vpblendd \$3, $T0, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of the bignum
    # Save them to LSB of QWs in T0..Tn
    vpsrlq    \$52, $_R0,   $T0
    vpsrlq    \$52, $_R0h,  $T0h
    vpsrlq    \$52, $_R1,   $T1
    vpsrlq    \$52, $_R1h,  $T1h
    vpsrlq    \$52, $_R2,   $T2
    vpsrlq    \$52, $_R2h,  $T2h
    vpsrlq    \$52, $_R3,   $T3
    vpsrlq    \$52, $_R3h,  $T3h

    # "Shift left" T0..Tn by 1 QW
    valignq \$3, $T3,  $T3h,  $T3h
    valignq \$3, $T2h,  $T3,  $T3
    valignq \$3, $T2,  $T2h,  $T2h
    valignq \$3, $T1h,  $T2,  $T2
    valignq \$3, $T1,   $T1h, $T1h
    valignq \$3, $T0h,  $T1,  $T1
    valignq \$3, $T0,   $T0h, $T0h
    valignq \$3, .Lzeros(%rip), $T0,  $T0

    # Drop "carries" from R0..Rn QWs
    vpandq    .Lmask52x4(%rip), $_R0,  $_R0
    vpandq    .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq    .Lmask52x4(%rip), $_R1,  $_R1
    vpandq    .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq    .Lmask52x4(%rip), $_R2,  $_R2
    vpandq    .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq    .Lmask52x4(%rip), $_R3,  $_R3
    vpandq    .Lmask52x4(%rip), $_R3h, $_R3h

    # Sum R0..Rn with corresponding adjusted carries
    vpaddq  $T0,  $_R0,  $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1,  $_R1,  $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2,  $_R2,  $_R2
    vpaddq  $T2h, $_R2h, $_R2h
    vpaddq  $T3,  $_R3,  $_R3
    vpaddq  $T3h, $_R3h, $_R3h

    # Now handle carry bits from this addition
    # Get mask of QWs whose 52-bit parts overflow
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0},%k1    # OP=nle (i.e. gt)
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0h},%k2
    kmovb      %k1,%r14d
    kmovb      %k2,%r13d
    shl        \$4,%r13b
    or         %r13b,%r14b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1h},%k2
    kmovb      %k1,%r13d
    kmovb      %k2,%r12d
    shl        \$4,%r12b
    or         %r12b,%r13b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2h},%k2
    kmovb      %k1,%r12d
    kmovb      %k2,%r11d
    shl        \$4,%r11b
    or         %r11b,%r12b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3h},%k2
    kmovb      %k1,%r11d
    kmovb      %k2,%r10d
    shl        \$4,%r10b
    or         %r10b,%r11b

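    # Shift the overflow masks left by one digit position: a carry out of
    # digit j must be added to digit j+1.  adcb forwards the top bit of each
    # 8-digit block into the low bit of the next block.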
    addb       %r14b,%r14b
    adcb       %r13b,%r13b
    adcb       %r12b,%r12b
    adcb       %r11b,%r11b

    # Get mask of QWs whose 52-bit parts saturated
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0},%k1    # OP=eq
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0h},%k2
    kmovb      %k1,%r9d
    kmovb      %k2,%r8d
    shl        \$4,%r8b
    or         %r8b,%r9b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1h},%k2
    kmovb      %k1,%r8d
    kmovb      %k2,%edx
    shl        \$4,%dl
    or         %dl,%r8b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2h},%k2
    kmovb      %k1,%edx
    kmovb      %k2,%ecx
    shl        \$4,%cl
    or         %cl,%dl

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3h},%k2
    kmovb      %k1,%ecx
    kmovb      %k2,%ebx
    shl        \$4,%bl
    or         %bl,%cl

    addb     %r9b,%r14b
    adcb     %r8b,%r13b
    adcb     %dl,%r12b
    adcb     %cl,%r11b

    xor      %r9b,%r14b
    xor      %r8b,%r13b
    xor      %dl,%r12b
    xor      %cl,%r11b
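    # (carry_mask + saturated_mask) ^ saturated_mask flags every digit that
    # must absorb a carry, including digits reached only by a carry rippling
    # through a run of saturated (== 2^52-1) digits.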

    kmovb    %r14d,%k1
    shr      \$4,%r14b
    kmovb    %r14d,%k2
    kmovb    %r13d,%k3
    shr      \$4,%r13b
    kmovb    %r13d,%k4
    kmovb    %r12d,%k5
    shr      \$4,%r12b
    kmovb    %r12d,%k6
    kmovb    %r11d,%k7

    vpsubq  .Lmask52x4(%rip), $_R0,  ${_R0}{%k1}
    vpsubq  .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
    vpsubq  .Lmask52x4(%rip), $_R1,  ${_R1}{%k3}
    vpsubq  .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
    vpsubq  .Lmask52x4(%rip), $_R2,  ${_R2}{%k5}
    vpsubq  .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
    vpsubq  .Lmask52x4(%rip), $_R3,  ${_R3}{%k7}

    vpandq  .Lmask52x4(%rip), $_R0,  $_R0
    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq  .Lmask52x4(%rip), $_R1,  $_R1
    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq  .Lmask52x4(%rip), $_R2,  $_R2
    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq  .Lmask52x4(%rip), $_R3,  $_R3

    shr    \$4,%r11b
    kmovb   %r11d,%k1

    vpsubq  .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}

    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
___
}

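
# For exposition only: a scalar model of the carry propagation performed by
# amm52x30_x1_norm() above.  This is a sketch, defined here but never called
# by the generator; it assumes a 64-bit Perl, and the name is ours, not part
# of the OpenSSL API.
sub _norm_reference_model_unused {
    my (@d) = @_;                     # radix-2^52 digits, possibly carrying
                                      # up to 12 extra bits above bit 51
    my $carry = 0;
    for my $i (0 .. $#d) {
        $d[$i] += $carry;             # absorb the carry from the digit below
        $carry  = $d[$i] >> 52;       # peel off the new carry bits
        $d[$i] &= 0xfffffffffffff;    # keep the low 52 bits
    }
    return @d;                        # $carry ends up 0 for in-range inputs
}
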
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.type   ossl_rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x1_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp                 # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
    vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    # Loop over 30 digits unrolled by 4
    mov     \$7, $iter

.align 32
.Lloop7:
___
    foreach my $idx (0..3) {
        &amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
    }
$code.=<<___;
    lea    `4*8`($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop7
___
    &amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
    &amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);

    &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea  168(%rsp),%rax
___
$code.=<<___;
    mov  0(%rax),%r15
.cfi_restore    %r15
    mov  8(%rax),%r14
.cfi_restore    %r14
    mov  16(%rax),%r13
.cfi_restore    %r13
    mov  24(%rax),%r12
.cfi_restore    %r12
    mov  32(%rax),%rbp
.cfi_restore    %rbp
    mov  40(%rax),%rbx
.cfi_restore    %rbx
    lea  48(%rax),%rsp       # restore rsp
.cfi_def_cfa %rsp,8
.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___

$code.=<<___;
.data
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for 30-digit numbers in radix 2^52
#
# See the description of ossl_rsaz_amm52x30_x1_ifma256() above for details of
# the Almost Montgomery Multiplication algorithm and the function input
# parameters.
#
# This function performs two AMMs on two independent inputs, hence "dual".
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
#                                    const BN_ULONG a[2][32],
#                                    const BN_ULONG b[2][32],
#                                    const BN_ULONG m[2][32],
#                                    const BN_ULONG k0[2]);
###############################################################################

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x2_ifma256
.type   ossl_rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x2_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp
    vmovdqa64   %xmm6, `0*16`(%rsp)        # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h

    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1
    vmovdqa64   $zero, $R2_1h
    vmovdqa64   $zero, $R3_1
    vmovdqa64   $zero, $R3_1h

    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    mov    \$30, $iter

.align 32
.Lloop30:
___
    &amm52x30_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
    # 32*8 = offset of the next dimension in the two-dimensional array
    &amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
    lea    8($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop30
___
    &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
    &amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)

    vmovdqu64   $R0_1,  `8*32`($res)
    vmovdqu64   $R0_1h, `9*32`($res)
    vmovdqu64   $R1_1,  `10*32`($res)
    vmovdqu64   $R1_1h, `11*32`($res)
    vmovdqu64   $R2_1,  `12*32`($res)
    vmovdqu64   $R2_1h, `13*32`($res)
    vmovdqu64   $R3_1,  `14*32`($res)
    vmovdqu64   $R3_1h, `15*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea     168(%rsp),%rax
___
$code.=<<___;
    mov  0(%rax),%r15
.cfi_restore    %r15
    mov  8(%rax),%r14
.cfi_restore    %r14
    mov  16(%rax),%r13
.cfi_restore    %r13
    mov  24(%rax),%r12
.cfi_restore    %r12
    mov  32(%rax),%rbp
.cfi_restore    %rbp
    mov  40(%rax),%rbx
.cfi_restore    %rbx
    lea  48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are the corresponding power indexes.
#
# The extracted value (output) is two (30 + 2)-digit numbers in radix 2^52
# (the 2 high QWs are zero padding).
#
# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
#                                        int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
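
# For exposition only: a scalar model of the constant-time scan implemented
# below.  It is a sketch, defined here but never called by the generator, and
# the name is ours rather than part of any OpenSSL API.  Every table entry is
# read and conditionally blended in, so the memory access pattern does not
# depend on the secret index; the assembly performs the branch-free select
# with vpcmpq/vpblendmq, whereas the ternary below is for readability only.
sub _ct_extract_reference_model_unused {
    my ($tbl, $idx) = @_;                 # $tbl: ref to array of entry array-refs
    my @out = (0) x scalar(@{$tbl->[0]});
    for my $i (0 .. $#{$tbl}) {
        my $mask = ($i == $idx) ? ~0 : 0; # all-ones selects this entry
        for my $j (0 .. $#out) {
            $out[$j] = ($out[$j] & ~$mask) | ($tbl->[$i][$j] & $mask);
        }
    }
    return \@out;
}
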
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                                        ("%rdi","%rsi","%rdx","%rcx");  # Unix order

my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));

my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;

$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_extract_multiplier_2x30_win5,\@abi-omnipotent
ossl_extract_multiplier_2x30_win5:
.cfi_startproc
    endbranch
    vmovdqa64   .Lones(%rip), $ones         # broadcast ones
    vpbroadcastq    $red_tbl_idx1, $idx1
    vpbroadcastq    $red_tbl_idx2, $idx2
    leaq   `(1<<5)*2*32*8`($red_tbl), %rax  # holds end of the tbl

    # zeroing t0..n, cur_idx
    vpxor   $t0xmm, $t0xmm, $t0xmm
    vmovdqa64   $t0, $cur_idx
___
foreach (1..15) {
    $code.="vmovdqa64   $t0, $t[$_] \n";
}
$code.=<<___;

.align 32
.Lloop:
    vpcmpq  \$0, $cur_idx, $idx1, %k1      # mask of (idx1 == cur_idx)
    vpcmpq  \$0, $cur_idx, $idx2, %k2      # mask of (idx2 == cur_idx)
___
foreach (0..15) {
    my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
    vmovdqu64  `${_}*32`($red_tbl), $tmp     # load data from red_tbl
    vpblendmq  $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
    vpaddq  $ones, $cur_idx, $cur_idx      # increment cur_idx
    addq    \$`2*32*8`, $red_tbl
    cmpq    $red_tbl, %rax
    jne .Lloop
___
# store t0..n
foreach (0..15) {
    $code.="vmovdqu64   $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;

    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
___
$code.=<<___;
.data
.align 32
.Lones:
    .quad   1,1,1,1
.Lzeros:
    .quad   0,0,0,0
___
}

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern     __imp_RtlVirtualUnwind
.type   rsaz_avx_handler,\@abi-omnipotent
.align  16
rsaz_avx_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax # pull context->Rax
    mov     248($context),%rbx # pull context->Rip

    mov     8($disp),%rsi      # disp->ImageBase
    mov     56($disp),%r11     # disp->HandlerData

    mov     0(%r11),%r10d      # HandlerData[0]
    lea     (%rsi,%r10),%r10   # prologue label
    cmp     %r10,%rbx          # context->Rip<.Lprologue
    jb  .Lcommon_seh_tail

    mov     4(%r11),%r10d      # HandlerData[1]
    lea     (%rsi,%r10),%r10   # epilogue label
    cmp     %r10,%rbx          # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    mov     152($context),%rax # pull context->Rsp

    lea     (%rax),%rsi         # %xmm save area
    lea     512($context),%rdi  # & context.Xmm6
    mov     \$20,%ecx           # 10*sizeof(%xmm0)/sizeof(%rax)
    .long   0xa548f3fc          # cld; rep movsq

    lea     `48+168`(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context) # restore context->Rbx
    mov     %rbp,160($context) # restore context->Rbp
    mov     %r12,216($context) # restore context->R12
    mov     %r13,224($context) # restore context->R13
    mov     %r14,232($context) # restore context->R14
    mov     %r15,240($context) # restore context->R15

.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context) # restore context->Rsp
    mov     %rsi,168($context) # restore context->Rsi
    mov     %rdi,176($context) # restore context->Rdi

    mov     40($disp),%rdi     # disp->ContextRecord
    mov     $context,%rsi      # context
    mov     \$154,%ecx         # sizeof(CONTEXT)
    .long   0xa548f3fc         # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx          # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx       # arg2, disp->ImageBase
    mov     0(%rsi),%r8        # arg3, disp->ControlPc
    mov     16(%rsi),%r9       # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10      # disp->ContextRecord
    lea     56(%rsi),%r11      # &disp->HandlerData
    lea     24(%rsi),%r12      # &disp->EstablisherFrame
    mov     %r10,32(%rsp)      # arg5
    mov     %r11,40(%rsp)      # arg6
    mov     %r12,48(%rsp)      # arg7
    mov     %rcx,56(%rsp)      # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax           # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_avx_handler,.-rsaz_avx_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256

    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256

.section    .xdata
.align  8
.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue
.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue
___
}
}}} else {{{                # fallback for old assembler
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.globl  ossl_rsaz_amm52x30_x2_ifma256
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x30_x1_ifma256:
ossl_rsaz_amm52x30_x2_ifma256:
ossl_extract_multiplier_2x30_win5:
    .byte   0x0f,0x0b    # ud2
    ret
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";