# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
# |         | 203.2                 | 453.5 / +123% | sign/s      |
# |---------+-----------------------+---------------+-------------|
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
    $avx512ifma = ($2>=7.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for a 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in the 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
#
#   NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction step
# specified in the original algorithm because, according to Lemma 1 from paper
# [2], the result is always < 2*m and can be used as a direct input to
# the next AMM iteration.  This post-condition holds provided the correct
# parameter |s| (in the notation of Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k,
# which matches our case: 1560 > 1536 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
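#
# For reference, a minimal scalar C sketch of one AMM pass under the same
# conventions (30 digits in radix 2^52, k0 = -1/m mod 2^64, no conditional
# subtraction).  It is an illustrative model only, kept as a comment so the
# generator below stays unchanged; the helper name and the use of
# unsigned __int128 are assumptions, not OpenSSL API, and it models the
# arithmetic rather than the register-level strategy of the IFMA code
# (which keeps digits in redundant form and normalizes once at the end):
#
#   #include <stdint.h>
#
#   #define DIGITS 30
#   #define MASK52 ((1ULL << 52) - 1)
#   typedef unsigned __int128 u128;
#
#   /* res = a * b * 2^(-52*DIGITS) "almost" mod m */
#   static void amm52x30_ref(uint64_t *res, const uint64_t *a,
#                            const uint64_t *b, const uint64_t *m, uint64_t k0)
#   {
#       uint64_t r[DIGITS + 1] = { 0 };
#
#       for (int i = 0; i < DIGITS; i++) {
#           uint64_t carry = 0, yi;
#           u128 t;
#
#           /* r += a * b[i] */
#           for (int j = 0; j < DIGITS; j++) {
#               t = (u128)a[j] * b[i] + r[j] + carry;
#               r[j] = (uint64_t)t & MASK52;
#               carry = (uint64_t)(t >> 52);
#           }
#           r[DIGITS] += carry;
#
#           /* yi makes the low digit divisible by 2^52;
#              only the low 52 bits of k0 matter here */
#           yi = (r[0] * k0) & MASK52;
#
#           /* r += m * yi */
#           carry = 0;
#           for (int j = 0; j < DIGITS; j++) {
#               t = (u128)m[j] * yi + r[j] + carry;
#               r[j] = (uint64_t)t & MASK52;
#               carry = (uint64_t)(t >> 52);
#           }
#           r[DIGITS] += carry;
#
#           /* r /= 2^52: drop the (now zero) low digit */
#           for (int j = 0; j < DIGITS; j++)
#               r[j] = r[j + 1];
#           r[DIGITS] = 0;
#       }
#
#       for (int j = 0; j < DIGITS; j++)
#           res[j] = r[j];
#   }
#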
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));

# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));

sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13             # b[i]

    vpbroadcastq    %r13, $Bi                    # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                     # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                          # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                            # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                          # acc * k0
    andq    $mask52, %r13                        # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi                    # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                     # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                          # acc += t0
    adcq    %r12, %r10                           # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                          # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq     \$1, $_R0, $_R0h, $_R0
    valignq     \$1, $_R0h, $_R1, $_R0h
    valignq     \$1, $_R1, $_R1h, $_R1
    valignq     \$1, $_R1h, $_R2, $_R1h
    valignq     \$1, $_R2, $_R2h, $_R2
    valignq     \$1, $_R2h, $_R3, $_R2h
    valignq     \$1, $_R3, $_R3h, $_R3
    valignq     \$1, $_R3h, $zero, $_R3h

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc    # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}

# Normalization routine: propagates carry bits and brings the bignum qwords
# to the normalized 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
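#
# For intuition, a plain scalar model of the same carry propagation
# (illustrative only; the helper is hypothetical and assumes the scalar
# accumulator has already been merged into the low qword, as the code below
# does first, and that each digit carries only a few spill bits above bit 51):
#
#   /* MASK52 = (1ULL << 52) - 1, as in the sketch further above */
#   static void norm52(uint64_t *r, int ndigits)
#   {
#       uint64_t carry = 0;
#
#       for (int i = 0; i < ndigits; i++) {
#           uint64_t t = r[i] + carry;  /* add carry from the previous digit */
#           r[i]  = t & MASK52;         /* keep the low 52 bits              */
#           carry = t >> 52;            /* pass the spill to the next digit  */
#       }
#       /* for amm52x30 the final carry is zero (inputs are zero-padded) */
#   }
#
# The SIMD code below reaches the same result without a serial loop: it
# extracts all spill bits at once, adds them to the next digits, and then
# resolves the remaining +1 carries with the kmask add/adc/xor trick
# (greater-than-mask52 digits generate carries, equal-to-mask52 digits
# propagate them).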
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $T0
    vpblendd \$3, $T0, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of the bignum
    # Save them to LSB of QWs in T0..Tn
    vpsrlq    \$52, $_R0,   $T0
    vpsrlq    \$52, $_R0h,  $T0h
    vpsrlq    \$52, $_R1,   $T1
    vpsrlq    \$52, $_R1h,  $T1h
    vpsrlq    \$52, $_R2,   $T2
    vpsrlq    \$52, $_R2h,  $T2h
    vpsrlq    \$52, $_R3,   $T3
    vpsrlq    \$52, $_R3h,  $T3h

    # "Shift left" T0..Tn by 1 QW
    valignq \$3, $T3,  $T3h,  $T3h
    valignq \$3, $T2h,  $T3,  $T3
    valignq \$3, $T2,  $T2h,  $T2h
    valignq \$3, $T1h,  $T2,  $T2
    valignq \$3, $T1,   $T1h, $T1h
    valignq \$3, $T0h,  $T1,  $T1
    valignq \$3, $T0,   $T0h, $T0h
    valignq \$3, .Lzeros(%rip), $T0,  $T0

    # Drop "carries" from R0..Rn QWs
    vpandq    .Lmask52x4(%rip), $_R0,  $_R0
    vpandq    .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq    .Lmask52x4(%rip), $_R1,  $_R1
    vpandq    .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq    .Lmask52x4(%rip), $_R2,  $_R2
    vpandq    .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq    .Lmask52x4(%rip), $_R3,  $_R3
    vpandq    .Lmask52x4(%rip), $_R3h, $_R3h

    # Sum R0..Rn with corresponding adjusted carries
    vpaddq  $T0,  $_R0,  $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1,  $_R1,  $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2,  $_R2,  $_R2
    vpaddq  $T2h, $_R2h, $_R2h
    vpaddq  $T3,  $_R3,  $_R3
    vpaddq  $T3h, $_R3h, $_R3h

    # Now handle carry bits from this addition
    # Get mask of QWs whose 52-bit parts overflow
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0},%k1    # OP=nle (i.e. gt)
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R0h},%k2
    kmovb      %k1,%r14d
    kmovb      %k2,%r13d
    shl        \$4,%r13b
    or         %r13b,%r14b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R1h},%k2
    kmovb      %k1,%r13d
    kmovb      %k2,%r12d
    shl        \$4,%r12b
    or         %r12b,%r13b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R2h},%k2
    kmovb      %k1,%r12d
    kmovb      %k2,%r11d
    shl        \$4,%r11b
    or         %r11b,%r12b

    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$6,.Lmask52x4(%rip),${_R3h},%k2
    kmovb      %k1,%r11d
    kmovb      %k2,%r10d
    shl        \$4,%r10b
    or         %r10b,%r11b

    addb       %r14b,%r14b
    adcb       %r13b,%r13b
    adcb       %r12b,%r12b
    adcb       %r11b,%r11b

    # Get mask of QWs whose 52-bit parts saturated
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0},%k1    # OP=eq
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R0h},%k2
    kmovb      %k1,%r9d
    kmovb      %k2,%r8d
    shl        \$4,%r8b
    or         %r8b,%r9b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R1h},%k2
    kmovb      %k1,%r8d
    kmovb      %k2,%edx
    shl        \$4,%dl
    or         %dl,%r8b

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R2h},%k2
    kmovb      %k1,%edx
    kmovb      %k2,%ecx
    shl        \$4,%cl
    or         %cl,%dl

    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq    \$0,.Lmask52x4(%rip),${_R3h},%k2
    kmovb      %k1,%ecx
    kmovb      %k2,%ebx
    shl        \$4,%bl
    or         %bl,%cl

    addb     %r9b,%r14b
    adcb     %r8b,%r13b
    adcb     %dl,%r12b
    adcb     %cl,%r11b

    xor      %r9b,%r14b
    xor      %r8b,%r13b
    xor      %dl,%r12b
    xor      %cl,%r11b

    kmovb    %r14d,%k1
    shr      \$4,%r14b
    kmovb    %r14d,%k2
    kmovb    %r13d,%k3
    shr      \$4,%r13b
    kmovb    %r13d,%k4
    kmovb    %r12d,%k5
    shr      \$4,%r12b
    kmovb    %r12d,%k6
    kmovb    %r11d,%k7

    vpsubq  .Lmask52x4(%rip), $_R0,  ${_R0}{%k1}
    vpsubq  .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
    vpsubq  .Lmask52x4(%rip), $_R1,  ${_R1}{%k3}
    vpsubq  .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
    vpsubq  .Lmask52x4(%rip), $_R2,  ${_R2}{%k5}
    vpsubq  .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
    vpsubq  .Lmask52x4(%rip), $_R3,  ${_R3}{%k7}

    vpandq  .Lmask52x4(%rip), $_R0,  $_R0
    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq  .Lmask52x4(%rip), $_R1,  $_R1
    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq  .Lmask52x4(%rip), $_R2,  $_R2
    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq  .Lmask52x4(%rip), $_R3,  $_R3

    shr    \$4,%r11b
    kmovb   %r11d,%k1

    vpsubq  .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}

    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
___
}

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.type   ossl_rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x1_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp                 # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
    vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    # Loop over 30 digits: 7 iterations below process 4 digits each (28 total),
    # the remaining 2 digits are handled straight after the loop
    mov     \$7, $iter

.align 32
.Lloop7:
___
    foreach my $idx (0..3) {
        &amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
    }
$code.=<<___;
    lea    `4*8`($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop7
___
    &amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
    &amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);

    &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea  168(%rsp),%rax
___
$code.=<<___;
    mov  0(%rax),%r15
.cfi_restore    %r15
    mov  8(%rax),%r14
.cfi_restore    %r14
    mov  16(%rax),%r13
.cfi_restore    %r13
    mov  24(%rax),%r12
.cfi_restore    %r12
    mov  32(%rax),%rbp
.cfi_restore    %rbp
    mov  40(%rax),%rbx
.cfi_restore    %rbx
    lea  48(%rax),%rsp       # restore rsp
.cfi_def_cfa %rsp,8
.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___

$code.=<<___;
.data
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___

###############################################################################
# Dual Almost Montgomery Multiplication for a 30-digit number in radix 2^52
#
# See the description of ossl_rsaz_amm52x30_x1_ifma256() above for details of
# the Almost Montgomery Multiplication algorithm and the function input
# parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
#                                    const BN_ULONG a[2][32],
#                                    const BN_ULONG b[2][32],
#                                    const BN_ULONG m[2][32],
#                                    const BN_ULONG k0[2]);
###############################################################################
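#
# In effect (hypothetical C-level model, names as in the x1 sketch above):
#
#   amm52x30_ref(out[0], a[0], b[0], m[0], k0[0]);   /* first input set  */
#   amm52x30_ref(out[1], a[1], b[1], m[1], k0[1]);   /* second input set */
#
# except that the generated code interleaves the two multiplications digit by
# digit to keep both sets of IFMA accumulators busy in the same loop.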

$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x2_ifma256
.type   ossl_rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x2_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp
    vmovdqa64   %xmm6, `0*16`(%rsp)        # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord   $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h

    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1
    vmovdqa64   $zero, $R2_1h
    vmovdqa64   $zero, $R3_1
    vmovdqa64   $zero, $R3_1h


    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                       # backup address of b
    movq    \$0xfffffffffffff, $mask52       # 52-bit mask

    mov    \$30, $iter

.align 32
.Lloop30:
___
    &amm52x30_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
    # 32*8 = offset of the next dimension in the two-dimensional array
    &amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
    lea    8($b_ptr), $b_ptr
    dec    $iter
    jne    .Lloop30
___
    &amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
    &amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)

    vmovdqu64   $R0_1,  `8*32`($res)
    vmovdqu64   $R0_1h, `9*32`($res)
    vmovdqu64   $R1_1,  `10*32`($res)
    vmovdqu64   $R1_1h, `11*32`($res)
    vmovdqu64   $R2_1,  `12*32`($res)
    vmovdqu64   $R2_1h, `13*32`($res)
    vmovdqu64   $R3_1,  `14*32`($res)
    vmovdqu64   $R3_1h, `15*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea     168(%rsp),%rax
___
$code.=<<___;
    mov  0(%rax),%r15
.cfi_restore    %r15
    mov  8(%rax),%r14
.cfi_restore    %r14
    mov  16(%rax),%r13
.cfi_restore    %r13
    mov  24(%rax),%r12
.cfi_restore    %r12
    mov  32(%rax),%rbp
.cfi_restore    %rbp
    mov  40(%rax),%rbx
.cfi_restore    %rbx
    lea  48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
___
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are the corresponding power indexes.
#
# The extracted value (output) is two (30 + 2)-digit numbers in radix 2^52
# (the 2 high QWs are zero padding).
#
# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
#                                        int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
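#
# A scalar model of the constant-time scan (illustrative only; the helper name
# is hypothetical and, for brevity, it extracts one value rather than the two
# the routine below produces).  Every table entry is read and combined under a
# mask, so the memory access pattern does not depend on the secret index:
#
#   /* out[32] = red_table[idx][0..31], touching all (1 << 5) entries */
#   static void extract_win5_ref(uint64_t out[32],
#                                const uint64_t table[1 << 5][32],
#                                uint64_t idx)
#   {
#       for (int j = 0; j < 32; j++)
#           out[j] = 0;
#
#       for (uint64_t i = 0; i < (1 << 5); i++) {
#           /* all-ones when i == idx, all-zeros otherwise */
#           uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
#
#           for (int j = 0; j < 32; j++)
#               out[j] |= table[i][j] & mask;
#       }
#   }
#
# The code below does the same with vpcmpq/vpblendmq, walking both halves of
# each [2][32] entry and serving both requested indexes in a single pass.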
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                                        ("%rdi","%rsi","%rdx","%rcx");  # Unix order

my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));

my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;

$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_extract_multiplier_2x30_win5,\@abi-omnipotent
ossl_extract_multiplier_2x30_win5:
.cfi_startproc
    endbranch
    vmovdqa64   .Lones(%rip), $ones         # broadcast ones
    vpbroadcastq    $red_tbl_idx1, $idx1
    vpbroadcastq    $red_tbl_idx2, $idx2
    leaq   `(1<<5)*2*32*8`($red_tbl), %rax  # holds end of the tbl

    # zeroing t0..n, cur_idx
    vpxor   $t0xmm, $t0xmm, $t0xmm
    vmovdqa64   $t0, $cur_idx
___
foreach (1..15) {
    $code.="vmovdqa64   $t0, $t[$_] \n";
}
$code.=<<___;

.align 32
.Lloop:
    vpcmpq  \$0, $cur_idx, $idx1, %k1      # mask of (idx1 == cur_idx)
    vpcmpq  \$0, $cur_idx, $idx2, %k2      # mask of (idx2 == cur_idx)
___
foreach (0..15) {
    my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
    vmovdqu64  `${_}*32`($red_tbl), $tmp     # load data from red_tbl
    vpblendmq  $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
    vpaddq  $ones, $cur_idx, $cur_idx      # increment cur_idx
    addq    \$`2*32*8`, $red_tbl
    cmpq    $red_tbl, %rax
    jne .Lloop
___
# store t0..n
foreach (0..15) {
    $code.="vmovdqu64   $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;

    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
___
$code.=<<___;
.data
.align 32
.Lones:
    .quad   1,1,1,1
.Lzeros:
    .quad   0,0,0,0
___
}

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern     __imp_RtlVirtualUnwind
.type   rsaz_avx_handler,\@abi-omnipotent
.align  16
rsaz_avx_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax # pull context->Rax
    mov     248($context),%rbx # pull context->Rip

    mov     8($disp),%rsi      # disp->ImageBase
    mov     56($disp),%r11     # disp->HandlerData

    mov     0(%r11),%r10d      # HandlerData[0]
    lea     (%rsi,%r10),%r10   # prologue label
    cmp     %r10,%rbx          # context->Rip<.Lprologue
    jb  .Lcommon_seh_tail

    mov     4(%r11),%r10d      # HandlerData[1]
    lea     (%rsi,%r10),%r10   # epilogue label
    cmp     %r10,%rbx          # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    mov     152($context),%rax # pull context->Rsp

    lea     (%rax),%rsi         # %xmm save area
    lea     512($context),%rdi  # & context.Xmm6
    mov     \$20,%ecx           # 10*sizeof(%xmm0)/sizeof(%rax)
    .long   0xa548f3fc          # cld; rep movsq

    lea     `48+168`(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context) # restore context->Rbx
    mov     %rbp,160($context) # restore context->Rbp
    mov     %r12,216($context) # restore context->R12
    mov     %r13,224($context) # restore context->R13
    mov     %r14,232($context) # restore context->R14
    mov     %r15,240($context) # restore context->R15

.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context) # restore context->Rsp
    mov     %rsi,168($context) # restore context->Rsi
    mov     %rdi,176($context) # restore context->Rdi

    mov     40($disp),%rdi     # disp->ContextRecord
    mov     $context,%rsi      # context
    mov     \$154,%ecx         # sizeof(CONTEXT)
    .long   0xa548f3fc         # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx          # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx       # arg2, disp->ImageBase
    mov     0(%rsi),%r8        # arg3, disp->ControlPc
    mov     16(%rsi),%r9       # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10      # disp->ContextRecord
    lea     56(%rsi),%r11      # &disp->HandlerData
    lea     24(%rsi),%r12      # &disp->EstablisherFrame
    mov     %r10,32(%rsp)      # arg5
    mov     %r11,40(%rsp)      # arg6
    mov     %r12,48(%rsp)      # arg7
    mov     %rcx,56(%rsp)      # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax           # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_avx_handler,.-rsaz_avx_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256

    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256

.section    .xdata
.align  8
.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue
.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue
___
}
}}} else {{{                # fallback for old assembler
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.globl  ossl_rsaz_amm52x30_x2_ifma256
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x30_x1_ifma256:
ossl_rsaz_amm52x30_x2_ifma256:
ossl_extract_multiplier_2x30_win5:
    .byte   0x0f,0x0b    # ud2
    ret
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";