xref: /openssl/crypto/modes/asm/aes-gcm-avx512.pl (revision 7ed6de99)
1# Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved.
2# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9#
10# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
11# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
12# (https://github.com/intel/intel-ipsec-mb).
13# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
14#
15# References:
16#  [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
17#      Intel Architecture Processors. August, 2010.
18#  [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
19#      Intel Architecture Processors. October, 2012.
20#  [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its
21#      Usage for Computing the GCM Mode. May, 2010.
22#
23#
24# December 2021
25#
26# Initial release.
27#
28# GCM128_CONTEXT structure has storage for 16 hkeys only, but this
29# implementation can use up to 48.  To avoid extending the context size,
30# precompute and store in the context first 16 hkeys only, and compute the rest
31# on demand keeping them in the local frame.
32#
33#======================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

# Windows targets are recognized by the perlasm flavour (nasm/masm/mingw64)
# or by an explicit .asm output file name.
$win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Set non-zero below only when the assembler is judged new enough to accept
# the AVX512 VAES + VPCLMULQDQ instructions this module emits.
$avx512vaes = 0;

# Locate the perlasm translator relative to this script's own path.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  or die "can't locate x86_64-xlate.pl";

# Probe the GNU assembler via $CC; 2.30 is used as the minimum version
# (NOTE(review): assumed to be the first binutils with the needed AVX512
# extensions — confirm against binutils release notes).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  $avx512vaes = ($1 >= 2.30);
}

# Same probe for nasm on Windows: accepts 2.13.3+ or any 2.14+.
if (!$avx512vaes
  && $win64
  && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
{
  $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
}

# Same probe for clang/LLVM-based compilers (integrated assembler).
if (!$avx512vaes && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512vaes = ($ver>=10.0001)
    } else {
        $avx512vaes = ($ver>=7.0);
    }
}

# All generated code is piped through the perlasm translator to stdout.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  or die "can't call $xlate: $!";
*STDOUT = *OUT;

#======================================================================
# Emit the optimized code path only when toolchain support was detected above.
if ($avx512vaes>0) { #<<<
81
# ; Runtime capability check used by C callers: ANDs the third word of
# ; OPENSSL_ia32cap_P against the required feature mask and returns the mask
# ; (non-zero) iff every listed feature bit is present, zero otherwise.
$code .= <<___;
.extern OPENSSL_ia32cap_P
.globl  ossl_vaes_vpclmulqdq_capable
.type   ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
.align 32
ossl_vaes_vpclmulqdq_capable:
    mov OPENSSL_ia32cap_P+8(%rip), %rcx
    # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
    mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
    xor %eax,%eax
    and %rdx,%rcx
    cmp %rdx,%rcx
    cmove %rcx,%rax
    ret
.size   ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
___
98
# ; Mapping key length -> AES rounds count
# ; (count of middle vaesenc rounds; the round-0 key xor and the final
# ; vaesenclast are handled separately — see ZMM_AESENC_ROUND_BLOCKS_0_16)
my %aes_rounds = (
  128 => 9,
  192 => 11,
  256 => 13);

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Code generation control switches
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# ; ABI-aware zeroing of volatile registers in EPILOG().
# ; Disabled due to performance reasons.
my $CLEAR_SCRATCH_REGISTERS = 0;

# ; Zero HKeys storage from the stack if they are stored there
my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;

# ; Enable / disable check of function arguments for null pointer
# ; Currently disabled, as this check is handled outside.
my $CHECK_FUNCTION_ARGUMENTS = 0;

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Global constants
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# AES block size in bytes
my $AES_BLOCK_SIZE = 16;

# Storage capacity in elements
my $HKEYS_STORAGE_CAPACITY = 48;    # ; max hashkey powers kept in the stack frame
my $LOCAL_STORAGE_CAPACITY = 48;    # ; max AES blocks kept in the stack frame
my $HKEYS_CONTEXT_CAPACITY = 16;    # ; hashkey powers stored in GCM128_CONTEXT (Htable)

# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Stack frame definition
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
# (2) -> +8-byte space for 16-byte alignment of XMM storage
# (3) -> Frame pointer (%RBP)
# (4) -> +160-byte XMM storage (Windows only, zero on Linux)
# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
# (7) -> +768-byte HKEYS storage
# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary

my $GP_STORAGE  = $win64 ? 8 * 8     : 8 * 6;    # ; space for saved non-volatile GP registers (pushed on stack)
my $XMM_STORAGE = $win64 ? (10 * 16) : 0;        # ; space for saved XMM registers
my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for HKeys^i, i=1..48
my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE);    # ; space for up to 48 AES blocks

# ; Offsets of the two dynamic areas relative to the 64-byte-aligned %rsp
my $STACK_HKEYS_OFFSET = 0;
my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
152
153# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
154# ;;; Function arguments abstraction
155# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
156my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);
157
158# ; Counter used for assembly label generation
159my $label_count = 0;
160
161# ; This implementation follows the convention: for non-leaf functions (they
162# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
163# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)].  This
164# ; helps to facilitate SEH handlers writing.
165#
166# ; Leaf functions here do not use more than 4 input arguments.
167if ($win64) {
168  $arg1  = "%rcx";
169  $arg2  = "%rdx";
170  $arg3  = "%r8";
171  $arg4  = "%r9";
172  $arg5  = "`$GP_STORAGE + 8 + 8*5`(%rbp)";    # +8 - alignment bytes
173  $arg6  = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
174  $arg7  = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
175  $arg8  = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
176  $arg9  = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
177  $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
178  $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
179} else {
180  $arg1  = "%rdi";
181  $arg2  = "%rsi";
182  $arg3  = "%rdx";
183  $arg4  = "%rcx";
184  $arg5  = "%r8";
185  $arg6  = "%r9";
186  $arg7  = "`$GP_STORAGE + 8*1`(%rbp)";
187  $arg8  = "`$GP_STORAGE + 8*2`(%rbp)";
188  $arg9  = "`$GP_STORAGE + 8*3`(%rbp)";
189  $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
190  $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
191}
192
193# ; Offsets in gcm128_context structure (see include/crypto/modes.h)
194my $CTX_OFFSET_CurCount  = (16 * 0);          #  ; (Yi) Current counter for generation of encryption key
195my $CTX_OFFSET_PEncBlock = (16 * 1);          #  ; (repurposed EKi field) Partial block buffer
196my $CTX_OFFSET_EK0       = (16 * 2);          #  ; (EK0) Encrypted Y0 counter (see gcm spec notation)
197my $CTX_OFFSET_AadLen    = (16 * 3);          #  ; (len.u[0]) Length of Hash which has been input
198my $CTX_OFFSET_InLen     = ((16 * 3) + 8);    #  ; (len.u[1]) Length of input data which will be encrypted or decrypted
199my $CTX_OFFSET_AadHash   = (16 * 4);          #  ; (Xi) Current hash
200my $CTX_OFFSET_HTable    = (16 * 6);          #  ; (Htable) Precomputed table (allows 16 values)
201
202# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
203# ;;; Helper functions
204# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
205
# ; Returns the 8-bit (low-byte) alias of a 64-bit GP register name,
# ; e.g. %rax -> %al, %rsi -> %sil, %r10 -> %r10b. Dies on anything else.
sub BYTE {
  my ($reg) = @_;
  return $reg if $reg =~ s/%r([abcd])x/%${1}l/i;       # legacy a/b/c/d registers
  return $reg if $reg =~ s/%r([sdb][ip])/%${1}l/i;     # rsi/rdi/rbp/rsp/rip family
  return $reg if $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;    # numbered r8..r15
  die "BYTE: unknown register: $reg\n";
}
219
# ; Returns the 16-bit alias of a 64-bit GP register name,
# ; e.g. %rax -> %ax, %rsi -> %si, %r10 -> %r10w. Dies on anything else.
sub WORD {
  my ($reg) = @_;
  if ($reg =~ /%r[abcdsdb][xip]/i) {
    $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
    return $reg;
  }
  # ; NOTE: this match is intentionally case-sensitive (as in the original)
  if ($reg =~ /%r[0-9]{1,2}/) {
    $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
    return $reg;
  }
  die "WORD: unknown register: $reg\n";
}
231
# ; Returns the 32-bit alias of a 64-bit GP register name,
# ; e.g. %rax -> %eax, %rsp -> %esp, %r9 -> %r9d. Dies on anything else.
sub DWORD {
  my ($reg) = @_;
  return $reg if $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;    # legacy registers get an 'e' prefix
  return $reg if $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;              # numbered r8..r15 get a 'd' suffix
  die "DWORD: unknown register: $reg\n";
}
243
# ; Returns the XMM (128-bit) alias of an AVX-512 vector register name,
# ; e.g. %zmm3 -> %xmm3. Dies if the argument is not an x/y/zmm register.
sub XWORD {
  my ($reg) = @_;
  die "XWORD: unknown register: $reg\n" if $reg !~ /%[xyz]mm/i;
  $reg =~ s/%[xyz]mm/%xmm/i;
  return $reg;
}
253
# ; Returns the YMM (256-bit) alias of an AVX-512 vector register name,
# ; e.g. %zmm3 -> %ymm3. Dies if the argument is not an x/y/zmm register.
sub YWORD {
  my ($reg) = @_;
  die "YWORD: unknown register: $reg\n" if $reg !~ /%[xyz]mm/i;
  $reg =~ s/%[xyz]mm/%ymm/i;
  return $reg;
}
263
# ; Returns the ZMM (512-bit) alias of an AVX-512 vector register name,
# ; e.g. %xmm3 -> %zmm3. Dies if the argument is not an x/y/zmm register.
sub ZWORD {
  my ($reg) = @_;
  die "ZWORD: unknown register: $reg\n" if $reg !~ /%[xyz]mm/i;
  $reg =~ s/%[xyz]mm/%zmm/i;
  return $reg;
}
273
274# ; Helper function to construct effective address based on two kinds of
275# ; offsets: numerical or located in the register
# ; Builds an effective-address string from a base register plus either a
# ; numerical offset (folded into the displacement at translation time via
# ; perlasm backticks) or a register offset (index register, scale 1).
sub EffectiveAddress {
  my ($base, $offset, $displacement) = @_;
  $displacement = 0 unless $displacement;

  # ; purely numeric offset -> let perlasm compute "offset + displacement"
  return "`$offset + $displacement`($base)" if $offset =~ /^\d+\z/;

  # ; offset resides in a register -> displacement(base,index,1) form
  return "$displacement($base,$offset,1)";
}
286
287# ; Provides memory location of corresponding HashKey power
# ; Provides memory location of corresponding HashKey power.
# ; A base of %rsp selects the stack-frame storage; any other base is assumed
# ; to be a pointer to the GCM128 context (Htable).
sub HashKeyByIdx {
  my ($idx, $base) = @_;
  my $storage_kind = ($base eq "%rsp") ? "frame" : "context";
  return &HashKeyOffsetByIdx($idx, $storage_kind) . "($base)";
}
295
296# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
# ; Provides byte offset of HashKey^idx, counted from the highest power kept
# ; in the selected storage (keys are laid out highest power first).
sub HashKeyOffsetByIdx {
  my ($idx, $base) = @_;
  die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
    if (($base ne "frame") && ($base ne "context"));

  if ($base eq "frame") {
    # ; stack frame holds up to $HKEYS_STORAGE_CAPACITY (48) hashkey powers
    die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n"
      if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
    return $STACK_HKEYS_OFFSET + $AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx);
  }

  # ; context (Htable) holds up to $HKEYS_CONTEXT_CAPACITY (16) hashkey powers
  die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n"
    if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
  return $CTX_OFFSET_HTable + $AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx);
}
315
316# ; Creates local frame and does back up of non-volatile registers.
317# ; Holds stack unwinding directives.
# ; Emits the function prolog: pushes non-volatile GPRs, establishes %rbp as
# ; the SEH frame pointer, saves xmm6..xmm15 (Windows only), and optionally
# ; allocates the 64-byte-aligned dynamic storage for hashkeys/AES blocks.
sub PROLOG {
  my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;
  # $need_hkeys_stack_storage - [in] allocate $HKEYS_STORAGE bytes on the frame
  # $need_aes_stack_storage   - [in] additionally allocate $LOCAL_STORAGE bytes
  #                             (only valid together with hkeys storage)
  # $func_name                - [in] label prefix for SEH annotations

  my $DYNAMIC_STACK_ALLOC_SIZE            = 0;
  # ; extra slack so the subsequent 64-byte alignment of %rsp stays inside
  # ; the allocation (differs per ABI due to different GPR push counts)
  my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;

  if ($need_hkeys_stack_storage) {
    $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  }

  if ($need_aes_stack_storage) {
    if (!$need_hkeys_stack_storage) {
      die "PROLOG: unsupported case - aes storage without hkeys one";
    }
    $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  }

  $code .= <<___;
    push    %rbx
.cfi_push   %rbx
.L${func_name}_seh_push_rbx:
    push    %rbp
.cfi_push   %rbp
.L${func_name}_seh_push_rbp:
    push    %r12
.cfi_push   %r12
.L${func_name}_seh_push_r12:
    push    %r13
.cfi_push   %r13
.L${func_name}_seh_push_r13:
    push    %r14
.cfi_push   %r14
.L${func_name}_seh_push_r14:
    push    %r15
.cfi_push   %r15
.L${func_name}_seh_push_r15:
___

  if ($win64) {
    # ; %rdi/%rsi are non-volatile on Windows and must be preserved too
    $code .= <<___;
    push    %rdi
.L${func_name}_seh_push_rdi:
    push    %rsi
.L${func_name}_seh_push_rsi:

    sub     \$`$XMM_STORAGE+8`,%rsp   # +8 alignment
.L${func_name}_seh_allocstack_xmm:
___
  }
  $code .= <<___;
    # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
    # ; bytes of alignment (Windows only)].  It serves as a frame pointer in SEH
    # ; handlers. The requirement for a frame pointer is that its offset from
    # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
    # ; itself seems to be reasonable to use here, because later we do 64-byte stack
    # ; alignment which gives us non-determinate offsets and complicates writing
    # ; SEH handlers.
    #
    # ; It also serves as an anchor for retrieving stack arguments on both Linux
    # ; and Windows.
    lea     `$XMM_STORAGE`(%rsp),%rbp
.cfi_def_cfa_register %rbp
.L${func_name}_seh_setfp:
___
  if ($win64) {

    # ; xmm6:xmm15 need to be preserved on Windows
    foreach my $reg_idx (6 .. 15) {
      my $xmm_reg_offset = ($reg_idx - 6) * 16;
      $code .= <<___;
        vmovdqu           %xmm${reg_idx},$xmm_reg_offset(%rsp)
.L${func_name}_seh_save_xmm${reg_idx}:
___
    }
  }

  $code .= <<___;
# Prolog ends here. Next stack allocation is treated as "dynamic".
.L${func_name}_seh_prolog_end:
___

  if ($DYNAMIC_STACK_ALLOC_SIZE) {
    # ; allocate dynamic storage and align %rsp down to a 64-byte boundary
    $code .= <<___;
        sub               \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
        and               \$(-64),%rsp
___
  }
}
406
407# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
408# ;;; Restore register content for the caller.
409# ;;; And cleanup stack.
410# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub EPILOG {
  my ($hkeys_storage_on_stack, $payload_len) = @_;
  # $hkeys_storage_on_stack - [in] non-zero if hashkey powers were kept in the
  #                           stack frame (then wiped here before returning)
  # $payload_len            - [in] operand holding processed payload length

  my $label_suffix = $label_count++;

  if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {

    # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
    # ; were stored in the local frame storage
    $code .= <<___;
        cmpq              \$`16*16`,$payload_len
        jbe               .Lskip_hkeys_cleanup_${label_suffix}
        vpxor             %xmm0,%xmm0,%xmm0
___
    # ; zero the whole hashkey area in 64-byte (ZMM-sized) chunks
    for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
      $code .= "vmovdqa64         %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
    }
    $code .= ".Lskip_hkeys_cleanup_${label_suffix}:\n";
  }

  if ($CLEAR_SCRATCH_REGISTERS) {
    &clear_scratch_gps_asm();
    &clear_scratch_zmms_asm();
  } else {
    $code .= "vzeroupper\n";
  }

  if ($win64) {

    # ; restore xmm15:xmm6
    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
      my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;

      # ; FIX: removed a stray trailing comma after the destination register;
      # ; it produced a malformed instruction the assembler rejects
      $code .= <<___;
        vmovdqu           $xmm_reg_offset(%rbp),%xmm${reg_idx}
___
    }
  }

  if ($win64) {

    # Forming valid epilog for SEH with use of frame pointer.
    # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
    $code .= "lea      8(%rbp),%rsp\n";
  } else {
    $code .= "lea      (%rbp),%rsp\n";
    $code .= ".cfi_def_cfa_register %rsp\n";
  }

  if ($win64) {
    # ; %rdi/%rsi were pushed last in PROLOG (Windows only), popped first here
    $code .= <<___;
     pop     %rsi
.cfi_pop     %rsi
     pop     %rdi
.cfi_pop     %rdi
___
  }
  # ; pop in reverse order of PROLOG pushes
  $code .= <<___;
     pop     %r15
.cfi_pop     %r15
     pop     %r14
.cfi_pop     %r14
     pop     %r13
.cfi_pop     %r13
     pop     %r12
.cfi_pop     %r12
     pop     %rbp
.cfi_pop     %rbp
     pop     %rbx
.cfi_pop     %rbx
___
}
482
483# ; Clears all scratch ZMM registers
484# ;
485# ; It should be called before restoring the XMM registers
486# ; for Windows (XMM6-XMM15).
487# ;
sub clear_scratch_zmms_asm {

  # ; On Windows only xmm0..xmm5 are volatile; on Linux every vector register
  # ; is, so a single vzeroall covers zmm0..zmm15 there.
  if ($win64) {
    $code .= "vpxorq  %xmm${_},%xmm${_},%xmm${_}\n" foreach (0 .. 5);
  } else {
    $code .= "vzeroall\n";
  }

  # ; zmm16..zmm31 are volatile on both ABIs and are not touched by vzeroall
  $code .= "vpxorq  %xmm${_},%xmm${_},%xmm${_}\n" foreach (16 .. 31);
}
502
503# Clears all scratch GP registers
# Clears all scratch (caller-saved) GP registers for the current ABI
sub clear_scratch_gps_asm {
  my @gps = ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");

  # ; %rsi/%rdi are additionally volatile on Linux only
  push @gps, ("%rsi", "%rdi") if (!$win64);

  $code .= "xor $_,$_\n" foreach @gps;
}
514
# ; Precomputes hashkey powers into the stack-frame storage (skipped entirely
# ; at runtime when $HKEYS_READY is non-zero). Depending on $HKEYS_RANGE it
# ; copies the first 16 keys from the context and/or extends them up to 48
# ; powers via GHASH_MUL (defined later in this file).
sub precompute_hkeys_on_stack {
  my $GCM128_CTX  = $_[0];    # ; [in] register pointing to the GCM128 context
  my $HKEYS_READY = $_[1];    # ; [in] register flag: non-zero -> skip all work
  my $ZTMP0       = $_[2];    # ; [clobbered] temporary ZMM
  my $ZTMP1       = $_[3];    # ; [clobbered] temporary ZMM
  my $ZTMP2       = $_[4];    # ; [clobbered] temporary ZMM
  my $ZTMP3       = $_[5];    # ; [clobbered] temporary ZMM
  my $ZTMP4       = $_[6];    # ; [clobbered] temporary ZMM
  my $ZTMP5       = $_[7];    # ; [clobbered] temporary ZMM
  my $ZTMP6       = $_[8];    # ; [clobbered] temporary ZMM
  my $HKEYS_RANGE = $_[9];    # ; "first16", "mid16", "all", "first32", "last32"

  die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
    if ($HKEYS_RANGE ne "first16"
    && $HKEYS_RANGE ne "mid16"
    && $HKEYS_RANGE ne "all"
    && $HKEYS_RANGE ne "first32"
    && $HKEYS_RANGE ne "last32");

  my $label_suffix = $label_count++;

  $code .= <<___;
        test              $HKEYS_READY,$HKEYS_READY
        jnz               .L_skip_hkeys_precomputation_${label_suffix}
___

  if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {

    # ; Fill the stack with the first 16 hkeys from the context
    $code .= <<___;
        # ; Move 16 hkeys from the context to stack
        vmovdqu64         @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
        vmovdqu64         $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}

        vmovdqu64         @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
        vmovdqu64         $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}

        # ; broadcast HashKey^8
        vshufi64x2        \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64         @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
        vmovdqu64         $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}

        vmovdqu64         @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
        vmovdqu64         $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
___
  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
    # ; First 16 keys are already on the stack from an earlier call;
    # ; reload the multipliers needed for extending them
    $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1

        # ; broadcast HashKey^8
        vshufi64x2        \$0x00,$ZTMP1,$ZTMP1,$ZTMP1

        vmovdqu64         @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
        vmovdqu64         @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
___

  }

  if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=17..32
    my $i = 20;
    foreach (1 .. int((32 - 16) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64         $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64         $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {

    # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
    my $i = 36;
    foreach (1 .. int((48 - 32) / 8)) {

      # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
      &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64         $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;

      # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
      &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
      $code .= "vmovdqu64         $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
      $i += 4;
    }
  }

  $code .= ".L_skip_hkeys_precomputation_${label_suffix}:\n";
}
614
615# ;; =============================================================================
616# ;; Generic macro to produce code that executes $OPCODE instruction
617# ;; on selected number of AES blocks (16 bytes long ) between 0 and 16.
618# ;; All three operands of the instruction come from registers.
619# ;; Note: if 3 blocks are left at the end instruction is produced to operate all
620# ;;       4 blocks (full width of ZMM)
sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  my ($NUM_BLOCKS,    # [in] numerical value, number of AES blocks (0 to 16)
      $OPCODE,        # [in] instruction name
      @regs) = @_;
  my @DST  = @regs[0 .. 3];     # [out] destination ZMM registers
  my @SRC1 = @regs[4 .. 7];     # [in] source 1 ZMM registers
  my @SRC2 = @regs[8 .. 11];    # [in] source 2 ZMM registers

  die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  # ; emit full-width (4-block) operations first
  my $idx  = 0;
  my $left = $NUM_BLOCKS;
  while ($left >= 4) {
    $code .= "$OPCODE        $SRC2[$idx],$SRC1[$idx],$DST[$idx]\n";
    $idx++;
    $left -= 4;
  }

  # ; 1..3 trailing blocks use a narrower register where possible;
  # ; 3 blocks still operate on the full ZMM width (see header comment)
  my $d  = $DST[$idx];
  my $s1 = $SRC1[$idx];
  my $s2 = $SRC2[$idx];

  if ($left == 1) {
    $code .= "$OPCODE         @{[XWORD($s2)]},@{[XWORD($s1)]},@{[XWORD($d)]}\n";
  } elsif ($left == 2) {
    $code .= "$OPCODE         @{[YWORD($s2)]},@{[YWORD($s1)]},@{[YWORD($d)]}\n";
  } elsif ($left == 3) {
    $code .= "$OPCODE         $s2,$s1,$d\n";
  }
}
664
665# ;; =============================================================================
666# ;; Loads specified number of AES blocks into ZMM registers using mask register
667# ;; for the last loaded register (xmm, ymm or zmm).
668# ;; Loads take place at 1 byte granularity.
sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  my ($NUM_BLOCKS,     # [in] numerical value, number of AES blocks (0 to 16)
      $INP,            # [in] input data pointer to read from
      $DATA_OFFSET,    # [in] offset to the output pointer (GP or numerical)
      @rest) = @_;
  my @DST  = @rest[0 .. 3];    # [out] ZMM registers with loaded data
  my $MASK = $rest[4];         # [in] mask register for the final partial load

  die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $src_offset  = 0;
  my $dst_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  # ; unmasked full 64-byte loads while more than one register's worth remains
  while ($blocks_left > 4) {
    $code .= "vmovdqu8          @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
    $src_offset += 64;
    $dst_idx++;
    $blocks_left -= 4;
  }

  # ; final (possibly partial) load through the mask register, zeroing
  # ; the unselected bytes, with the register width sized to fit
  my $DSTREG = $DST[$dst_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8          @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8          @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8          @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  }
}
706
707# ;; =============================================================================
708# ;; Stores specified number of AES blocks from ZMM registers with mask register
709# ;; for the last loaded register (xmm, ymm or zmm).
710# ;; Stores take place at 1 byte granularity.
sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  my ($NUM_BLOCKS,     # [in] numerical value, number of AES blocks (0 to 16)
      $OUTP,           # [in] output data pointer to write to
      $DATA_OFFSET,    # [in] offset to the output pointer (GP or numerical)
      @rest) = @_;
  my @SRC  = @rest[0 .. 3];    # [in] ZMM registers with data to store
  my $MASK = $rest[4];         # [in] mask register for the final partial store

  die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $dst_offset  = 0;
  my $src_idx     = 0;
  my $blocks_left = $NUM_BLOCKS;

  # ; unmasked full 64-byte stores while more than one register's worth remains
  while ($blocks_left > 4) {
    $code .= "vmovdqu8          $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
    $dst_offset += 64;
    $src_idx++;
    $blocks_left -= 4;
  }

  # ; final (possibly partial) masked store, register width sized to fit
  my $SRCREG = $SRC[$src_idx];

  if ($blocks_left == 1) {
    $code .= "vmovdqu8          @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 2) {
    $code .= "vmovdqu8          @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  } elsif ($blocks_left == 3 || $blocks_left == 4) {
    $code .= "vmovdqu8          $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  }
}
748
749# ;;; ===========================================================================
750# ;;; Handles AES encryption rounds
751# ;;; It handles special cases: the last and first rounds
752# ;;; Optionally, it performs XOR with data after the last AES round.
753# ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
754# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  my ($B0_3,       # [in/out] zmm; blocks 0 to 3
      $B4_7,       # [in/out] zmm; blocks 4 to 7
      $B8_11,      # [in/out] zmm; blocks 8 to 11
      $B12_15,     # [in/out] zmm; blocks 12 to 15
      $KEY,        # [in] zmm containing round key
      $ROUND,      # [in] round number
      $D0_3,       # [in] zmm or no_data; plain/cipher text blocks 0-3
      $D4_7,       # [in] zmm or no_data; plain/cipher text blocks 4-7
      $D8_11,      # [in] zmm or no_data; plain/cipher text blocks 8-11
      $D12_15,     # [in] zmm or no_data; plain/cipher text blocks 12-15
      $NUMBL,      # [in] number of blocks; numerical value
      $NROUNDS     # [in] number of rounds; numerical value
  ) = @_;

  # ; the three round ranges are mutually exclusive, so an if/elsif chain
  # ; produces exactly the same instruction stream as the original three ifs
  if ($ROUND < 1) {

    # ;; round 0: xor state with the whitening key
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vpxorq", $B0_3, $B4_7, $B8_11, $B12_15,
      $B0_3, $B4_7, $B8_11, $B12_15, $KEY, $KEY, $KEY, $KEY);
  } elsif ($ROUND <= $NROUNDS) {

    # ;; middle rounds 1 to 9/11/13
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenc", $B0_3, $B4_7, $B8_11, $B12_15,
      $B0_3, $B4_7, $B8_11, $B12_15, $KEY, $KEY, $KEY, $KEY);
  } else {

    # ;; the last round - mix enclast with text xor's
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUMBL, "vaesenclast", $B0_3, $B4_7, $B8_11, $B12_15,
      $B0_3, $B4_7, $B8_11, $B12_15, $KEY, $KEY, $KEY, $KEY);

    # ;; optional XOR with plain/cipher text after the last round
    if ( ($D0_3 ne "no_data")
      && ($D4_7 ne "no_data")
      && ($D8_11 ne "no_data")
      && ($D12_15 ne "no_data"))
    {
      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUMBL, "vpxorq", $B0_3, $B4_7, $B8_11, $B12_15,
        $B0_3, $B4_7, $B8_11, $B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
    }
  }
}
807
808# ;;; Horizontal XOR - 4 x 128bits xored together
sub VHPXORI4x128 {
  my ($acc,        # [in/out] ZMM with 4x128bits to xor; 128bit result in low lane
      $scratch     # [clobbered] ZMM temporary register
  ) = @_;

  # ; fold 512 -> 256 -> 128 bits by successive extract + xor
  $code .= <<___;
        vextracti64x4     \$1,$acc,@{[YWORD($scratch)]}
        vpxorq            @{[YWORD($scratch)]},@{[YWORD($acc)]},@{[YWORD($acc)]}
        vextracti32x4     \$1,@{[YWORD($acc)]},@{[XWORD($scratch)]}
        vpxorq            @{[XWORD($scratch)]},@{[XWORD($acc)]},@{[XWORD($acc)]}
___
}
819
820# ;;; AVX512 reduction macro
sub VCLMUL_REDUCE {
  my ($OUT,      # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
      $POLY,     # [in] zmm/ymm/xmm: polynomial
      $HI128,    # [in] zmm/ymm/xmm: high 128b of hash to reduce
      $LO128,    # [in] zmm/ymm/xmm: low 128b of hash to reduce
      $TMP0,     # [in] zmm/ymm/xmm: temporary register
      $TMP1      # [in] zmm/ymm/xmm: temporary register
  ) = @_;

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; first phase of the reduction
        vpclmulqdq        \$0x01,$LO128,$POLY,$TMP0
        vpslldq           \$8,$TMP0,$TMP0         # ; shift-L 2 DWs
        vpxorq            $TMP0,$LO128,$TMP0      # ; first phase of the reduction complete
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; second phase of the reduction
        vpclmulqdq        \$0x00,$TMP0,$POLY,$TMP1
        vpsrldq           \$4,$TMP1,$TMP1          # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq        \$0x10,$TMP0,$POLY,$OUT
        vpslldq           \$4,$OUT,$OUT            # ; shift-L 1-DW to obtain result with no shifts
        vpternlogq        \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}
845
846# ;; ===========================================================================
847# ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
848# ;; - it is assumed that data read from $INPTR is already shuffled and
849# ;;   $INPTR address is 64 byte aligned
850# ;; - there is an option to pass ready blocks through ZMM registers too.
851# ;;   4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty
sub GHASH_16 {
  my $TYPE  = $_[0];     # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
                         # end_reduce (end with reduction), start_reduce
  my $GH    = $_[1];     # [in/out] ZMM ghash sum: high 128-bits
  my $GM    = $_[2];     # [in/out] ZMM ghash sum: middle 128-bits
  my $GL    = $_[3];     # [in/out] ZMM ghash sum: low 128-bits
  my $INPTR = $_[4];     # [in] data input pointer
  my $INOFF = $_[5];     # [in] data input offset
  my $INDIS = $_[6];     # [in] data input displacement
  my $HKPTR = $_[7];     # [in] hash key pointer
  my $HKOFF = $_[8];     # [in] hash key offset (can be either numerical offset, or register containing offset)
  my $HKDIS = $_[9];     # [in] hash key displacement
  my $HASH  = $_[10];    # [in/out] ZMM hash value in/out
  my $ZTMP0 = $_[11];    # [clobbered] temporary ZMM
  my $ZTMP1 = $_[12];    # [clobbered] temporary ZMM
  my $ZTMP2 = $_[13];    # [clobbered] temporary ZMM
  my $ZTMP3 = $_[14];    # [clobbered] temporary ZMM
  my $ZTMP4 = $_[15];    # [clobbered] temporary ZMM
  my $ZTMP5 = $_[16];    # [clobbered] temporary ZMM
  my $ZTMP6 = $_[17];    # [clobbered] temporary ZMM
  my $ZTMP7 = $_[18];    # [clobbered] temporary ZMM
  my $ZTMP8 = $_[19];    # [clobbered] temporary ZMM
  my $ZTMP9 = $_[20];    # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
  my $DAT0  = $_[21];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT1  = $_[22];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT2  = $_[23];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  my $DAT3  = $_[24];    # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)

  # Decode $TYPE into two flags:
  #   $start_ghash  - XOR the running $HASH into the first data block and
  #                   overwrite (rather than accumulate into) GH/GM/GL
  #   $do_reduction - fold GM into GH/GL and reduce to a 128-bit hash at the end
  my $start_ghash  = 0;
  my $do_reduction = 0;
  if ($TYPE eq "start") {
    $start_ghash = 1;
  }

  if ($TYPE eq "start_reduce") {
    $start_ghash  = 1;
    $do_reduction = 1;
  }

  if ($TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

  # ;; ghash blocks 0-3
  # With 21 arguments the data is loaded from memory into $ZTMP9;
  # with 25 arguments it is already present in $DAT0..$DAT3.
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64         @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT0;
  }

  if ($start_ghash != 0) {
    $code .= "vpxorq            $HASH,$ZTMP9,$ZTMP9\n";
  }
  $code .= <<___;
        vmovdqu64         @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
        vpclmulqdq        \$0x11,$ZTMP8,$ZTMP9,$ZTMP0      # ; T0H = a1*b1
        vpclmulqdq        \$0x00,$ZTMP8,$ZTMP9,$ZTMP1      # ; T0L = a0*b0
        vpclmulqdq        \$0x01,$ZTMP8,$ZTMP9,$ZTMP2      # ; T0M1 = a1*b0
        vpclmulqdq        \$0x10,$ZTMP8,$ZTMP9,$ZTMP3      # ; T0M2 = a0*b1
___

  # ;; ghash blocks 4-7
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64         @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT1;
  }
  $code .= <<___;
        vmovdqu64         @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
        vpclmulqdq        \$0x11,$ZTMP8,$ZTMP9,$ZTMP4      # ; T1H = a1*b1
        vpclmulqdq        \$0x00,$ZTMP8,$ZTMP9,$ZTMP5      # ; T1L = a0*b0
        vpclmulqdq        \$0x01,$ZTMP8,$ZTMP9,$ZTMP6      # ; T1M1 = a1*b0
        vpclmulqdq        \$0x10,$ZTMP8,$ZTMP9,$ZTMP7      # ; T1M2 = a0*b1
___

  # ;; update sums
  # "start"/"start_reduce" overwrite GH/GM/GL with the fresh partial
  # products; all other types XOR-accumulate into the existing sums.
  if ($start_ghash != 0) {
    $code .= <<___;
        vpxorq            $ZTMP6,$ZTMP2,$GM             # ; GM = T0M1 + T1M1
        vpxorq            $ZTMP4,$ZTMP0,$GH             # ; GH = T0H + T1H
        vpxorq            $ZTMP5,$ZTMP1,$GL             # ; GL = T0L + T1L
        vpternlogq        \$0x96,$ZTMP7,$ZTMP3,$GM      # ; GM = T0M2 + T1M1
___
  } else {    # ;; mid, end, end_reduce
    $code .= <<___;
        vpternlogq        \$0x96,$ZTMP6,$ZTMP2,$GM      # ; GM += T0M1 + T1M1
        vpternlogq        \$0x96,$ZTMP4,$ZTMP0,$GH      # ; GH += T0H + T1H
        vpternlogq        \$0x96,$ZTMP5,$ZTMP1,$GL      # ; GL += T0L + T1L
        vpternlogq        \$0x96,$ZTMP7,$ZTMP3,$GM      # ; GM += T0M2 + T1M1
___
  }

  # ;; ghash blocks 8-11
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64         @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT2;
  }
  $code .= <<___;
        vmovdqu64         @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
        vpclmulqdq        \$0x11,$ZTMP8,$ZTMP9,$ZTMP0      # ; T0H = a1*b1
        vpclmulqdq        \$0x00,$ZTMP8,$ZTMP9,$ZTMP1      # ; T0L = a0*b0
        vpclmulqdq        \$0x01,$ZTMP8,$ZTMP9,$ZTMP2      # ; T0M1 = a1*b0
        vpclmulqdq        \$0x10,$ZTMP8,$ZTMP9,$ZTMP3      # ; T0M2 = a0*b1
___

  # ;; ghash blocks 12-15
  if (scalar(@_) == 21) {
    $code .= "vmovdqa64         @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
  } else {
    $ZTMP9 = $DAT3;
  }
  $code .= <<___;
        vmovdqu64         @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
        vpclmulqdq        \$0x11,$ZTMP8,$ZTMP9,$ZTMP4      # ; T1H = a1*b1
        vpclmulqdq        \$0x00,$ZTMP8,$ZTMP9,$ZTMP5      # ; T1L = a0*b0
        vpclmulqdq        \$0x01,$ZTMP8,$ZTMP9,$ZTMP6      # ; T1M1 = a1*b0
        vpclmulqdq        \$0x10,$ZTMP8,$ZTMP9,$ZTMP7      # ; T1M2 = a0*b1
        # ;; update sums
        vpternlogq        \$0x96,$ZTMP6,$ZTMP2,$GM         # ; GM += T0M1 + T1M1
        vpternlogq        \$0x96,$ZTMP4,$ZTMP0,$GH         # ; GH += T0H + T1H
        vpternlogq        \$0x96,$ZTMP5,$ZTMP1,$GL         # ; GL += T0L + T1L
        vpternlogq        \$0x96,$ZTMP7,$ZTMP3,$GM         # ; GM += T0M2 + T1M1
___
  # For "start_reduce"/"end_reduce": fold the middle sum GM into GH/GL,
  # collapse the four 128-bit lanes, and reduce to the final hash in $HASH.
  if ($do_reduction != 0) {
    $code .= <<___;
        # ;; integrate GM into GH and GL
        vpsrldq           \$8,$GM,$ZTMP0
        vpslldq           \$8,$GM,$ZTMP1
        vpxorq            $ZTMP0,$GH,$GH
        vpxorq            $ZTMP1,$GL,$GL
___

    # ;; add GH and GL 128-bit words horizontally
    &VHPXORI4x128($GH, $ZTMP0);
    &VHPXORI4x128($GL, $ZTMP1);

    # ;; reduction
    $code .= "vmovdqa64         POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
    &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
  }
}
994
995# ;; ===========================================================================
996# ;; GHASH 1 to 16 blocks of cipher text
997# ;; - performs reduction at the end
998# ;; - it doesn't load the data and it assumed it is already loaded and shuffled
sub GHASH_1_TO_16 {
  my $GCM128_CTX  = $_[0];     # [in] pointer to expanded keys
  my $GHASH       = $_[1];     # [out] ghash output
  my $T0H         = $_[2];     # [clobbered] temporary ZMM
  my $T0L         = $_[3];     # [clobbered] temporary ZMM
  my $T0M1        = $_[4];     # [clobbered] temporary ZMM
  my $T0M2        = $_[5];     # [clobbered] temporary ZMM
  my $T1H         = $_[6];     # [clobbered] temporary ZMM
  my $T1L         = $_[7];     # [clobbered] temporary ZMM
  my $T1M1        = $_[8];     # [clobbered] temporary ZMM
  my $T1M2        = $_[9];     # [clobbered] temporary ZMM
  my $HK          = $_[10];    # [clobbered] temporary ZMM
  my $AAD_HASH_IN = $_[11];    # [in] input hash value
  my @CIPHER_IN;
  $CIPHER_IN[0] = $_[12];      # [in] ZMM with cipher text blocks 0-3
  $CIPHER_IN[1] = $_[13];      # [in] ZMM with cipher text blocks 4-7
  $CIPHER_IN[2] = $_[14];      # [in] ZMM with cipher text blocks 8-11
  $CIPHER_IN[3] = $_[15];      # [in] ZMM with cipher text blocks 12-15
  my $NUM_BLOCKS = $_[16];     # [in] numerical value, number of blocks
  my $GH         = $_[17];     # [in] ZMM with hi product part
  my $GM         = $_[18];     # [in] ZMM with mid product part
  my $GL         = $_[19];     # [in] ZMM with lo product part

  die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  # Without GH/GM/GL (17-arg form) the running hash is folded in by XORing
  # it into the first block of cipher text; with them (20-arg form) the
  # hash is already part of the passed-in product sums.
  if (scalar(@_) == 17) {
    $code .= "vpxorq            $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
  }

  # Schoolbook-multiply the full 4-block groups against hash keys
  # HashKey^NUM_BLOCKS downwards, accumulating H/L/M1/M2 partial products.
  if ($NUM_BLOCKS == 16) {
    $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[0],$T0H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[0],$T0L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[0],$T0M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[0],$T0M2       # ; M2 = a0*b1
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[1],$T1H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[1],$T1L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[1],$T1M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[1],$T1M2       # ; M2 = a0*b1
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
        vpternlogq        \$0x96,$T1H,$CIPHER_IN[0],$T0H
        vpternlogq        \$0x96,$T1L,$CIPHER_IN[1],$T0L
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
        vpternlogq        \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
        vpternlogq        \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[3],$T1H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[3],$T1L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[3],$T1M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[3],$T1M2       # ; M2 = a0*b1
        vpxorq            $T1H,$T0H,$T1H
        vpxorq            $T1L,$T0L,$T1L
        vpxorq            $T1M1,$T0M1,$T1M1
        vpxorq            $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 12) {
    $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[0],$T0H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[0],$T0L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[0],$T0M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[0],$T0M2       # ; M2 = a0*b1
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[1],$T1H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[1],$T1L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[1],$T1M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[1],$T1M2       # ; M2 = a0*b1
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
        vpternlogq        \$0x96,$T0H,$CIPHER_IN[0],$T1H
        vpternlogq        \$0x96,$T0L,$CIPHER_IN[1],$T1L
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
        vpternlogq        \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
        vpternlogq        \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
___
  } elsif ($NUM_BLOCKS >= 8) {
    $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[0],$T0H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[0],$T0L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[0],$T0M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[0],$T0M2       # ; M2 = a0*b1
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[1],$T1H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[1],$T1L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[1],$T1M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[1],$T1M2       # ; M2 = a0*b1
        vpxorq            $T1H,$T0H,$T1H
        vpxorq            $T1L,$T0L,$T1L
        vpxorq            $T1M1,$T0M1,$T1M1
        vpxorq            $T1M2,$T0M2,$T1M2
___
  } elsif ($NUM_BLOCKS >= 4) {
    $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
        vpclmulqdq        \$0x11,$HK,$CIPHER_IN[0],$T1H        # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$CIPHER_IN[0],$T1L        # ; L = a0*b0
        vpclmulqdq        \$0x01,$HK,$CIPHER_IN[0],$T1M1       # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$CIPHER_IN[0],$T1M2       # ; M2 = a0*b1
___
  }

  # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
  my $blocks_left = ($NUM_BLOCKS % 4);
  if ($blocks_left > 0) {

    # ;; =====================================================
    # ;; There are 1, 2 or 3 blocks left to process.
    # ;; It may also be that they are the only blocks to process.

    # ;; Set hash key and register index position for the remaining 1 to 3 blocks
    # int() makes the truncation explicit instead of relying on Perl's
    # implicit float-to-int conversion of a fractional array index
    # (e.g. $NUM_BLOCKS == 7 would otherwise give index 1.75).
    my $reg_idx = int($NUM_BLOCKS / 4);
    my $REG_IN  = $CIPHER_IN[$reg_idx];

    # Multiply the tail blocks at xmm (1 block), ymm (2 blocks) or
    # zmm-with-3-lanes (3 blocks) width against HashKey^blocks_left down.
    if ($blocks_left == 1) {
      $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
        vpclmulqdq        \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]} # ; M1 = a1*b0
        vpclmulqdq        \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]} # ; M2 = a0*b1
        vpclmulqdq        \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]}  # ; H = a1*b1
        vpclmulqdq        \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]}  # ; L = a0*b0
___
    } elsif ($blocks_left == 2) {
      $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vpclmulqdq        \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]} # ; M1 = a1*b0
        vpclmulqdq        \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]} # ; M2 = a0*b1
        vpclmulqdq        \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]}  # ; H = a1*b1
        vpclmulqdq        \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]}  # ; L = a0*b0
___
    } else {    # ; blocks_left == 3
      $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
        vinserti64x2      \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
        vpclmulqdq        \$0x01,$HK,$REG_IN,$T0M1                                     # ; M1 = a1*b0
        vpclmulqdq        \$0x10,$HK,$REG_IN,$T0M2                                     # ; M2 = a0*b1
        vpclmulqdq        \$0x11,$HK,$REG_IN,$T0H                                      # ; H = a1*b1
        vpclmulqdq        \$0x00,$HK,$REG_IN,$T0L                                      # ; L = a0*b0
___
    }

    if (scalar(@_) == 20) {

      # ;; *** GH/GM/GL passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq            $T1M1,$T0M1,$T0M1
        vpternlogq        \$0x96,$T1M2,$GM,$T0M2
        vpternlogq        \$0x96,$T1H,$GH,$T0H
        vpternlogq        \$0x96,$T1L,$GL,$T0L
___
      } else {
        $code .= <<___;
        vpxorq            $GM,$T0M1,$T0M1
        vpxorq            $GH,$T0H,$T0H
        vpxorq            $GL,$T0L,$T0L
___
      }
    } else {

      # ;; *** GH/GM/GL NOT passed as arguments
      if ($NUM_BLOCKS >= 4) {
        $code .= <<___;
        # ;; add ghash product sums from the first 4, 8 or 12 blocks
        vpxorq            $T1M1,$T0M1,$T0M1
        vpxorq            $T1M2,$T0M2,$T0M2
        vpxorq            $T1H,$T0H,$T0H
        vpxorq            $T1L,$T0L,$T0L
___
      }
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq            $T0M2,$T0M1,$T0M1
        vpsrldq           \$8,$T0M1,$T1M1
        vpslldq           \$8,$T0M1,$T1M2
        vpxorq            $T1M1,$T0H,$T0H
        vpxorq            $T1M2,$T0L,$T0L
___
  } else {

    # ;; =====================================================
    # ;; number of blocks is 4, 8, 12 or 16
    # ;; T1H/L/M1/M2 include product sums not T0H/L/M1/M2
    if (scalar(@_) == 20) {
      $code .= <<___;
        # ;; *** GH/GM/GL passed as arguments
        vpxorq            $GM,$T1M1,$T1M1
        vpxorq            $GH,$T1H,$T1H
        vpxorq            $GL,$T1L,$T1L
___
    }
    $code .= <<___;
        # ;; integrate TM into TH and TL
        vpxorq            $T1M2,$T1M1,$T1M1
        vpsrldq           \$8,$T1M1,$T0M1
        vpslldq           \$8,$T1M1,$T0M2
        vpxorq            $T0M1,$T1H,$T0H
        vpxorq            $T0M2,$T1L,$T0L
___
  }

  # ;; add TH and TL 128-bit words horizontally
  &VHPXORI4x128($T0H, $T1M1);
  &VHPXORI4x128($T0L, $T1M2);

  # ;; reduction
  $code .= "vmovdqa64         POLY2(%rip),@{[XWORD($HK)]}\n";
  &VCLMUL_REDUCE(
    @{[XWORD($GHASH)]},
    @{[XWORD($HK)]},
    @{[XWORD($T0H)]},
    @{[XWORD($T0L)]},
    @{[XWORD($T0M1)]},
    @{[XWORD($T0M2)]});
}
1223
1224# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1225# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 +x^121 + 1)
1226# ;; Input: A and B (128-bits each, bit-reflected)
1227# ;; Output: C = A*B*x mod poly, (i.e. >>1 )
1228# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1229# ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1230# ;;
1231# ;; Refer to [3] for more details.
1232# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GHASH_MUL {
  my $GH = $_[0];    #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
  my $HK = $_[1];    #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
  my $T1 = $_[2];    #; [clobbered] xmm/ymm/zmm
  my $T2 = $_[3];    #; [clobbered] xmm/ymm/zmm
  my $T3 = $_[4];    #; [clobbered] xmm/ymm/zmm

  # Emits a full carry-less multiply of $GH by $HK followed by the
  # two-phase reduction against POLY2; works lane-wise at any vector
  # width, so one call can compute 1, 2 or 4 independent products.
  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpclmulqdq        \$0x11,$HK,$GH,$T1 # ; $T1 = a1*b1
        vpclmulqdq        \$0x00,$HK,$GH,$T2 # ; $T2 = a0*b0
        vpclmulqdq        \$0x01,$HK,$GH,$T3 # ; $T3 = a1*b0
        vpclmulqdq        \$0x10,$HK,$GH,$GH # ; $GH = a0*b1
        vpxorq            $T3,$GH,$GH

        vpsrldq           \$8,$GH,$T3        # ; shift-R $GH 2 DWs
        vpslldq           \$8,$GH,$GH        # ; shift-L $GH 2 DWs
        vpxorq            $T3,$T1,$T1
        vpxorq            $T2,$GH,$GH

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;first phase of the reduction
        vmovdqu64         POLY2(%rip),$T3

        vpclmulqdq        \$0x01,$GH,$T3,$T2
        vpslldq           \$8,$T2,$T2        # ; shift-L $T2 2 DWs
        vpxorq            $T2,$GH,$GH        # ; first phase of the reduction complete

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;second phase of the reduction
        vpclmulqdq        \$0x00,$GH,$T3,$T2
        vpsrldq           \$4,$T2,$T2        # ; shift-R only 1-DW to obtain 2-DWs shift-R
        vpclmulqdq        \$0x10,$GH,$T3,$GH
        vpslldq           \$4,$GH,$GH        # ; Shift-L 1-DW to obtain result with no shifts
                                             # ; second phase of the reduction complete, the result is in $GH
        vpternlogq        \$0x96,$T2,$T1,$GH # ; GH = GH xor T1 xor T2
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
___
}
1272
1273# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1274# ;;; PRECOMPUTE computes HashKey_i
sub PRECOMPUTE {
  my $GCM128_CTX = $_[0];    #; [in/out] context pointer, hkeys content updated
  my $HK         = $_[1];    #; [in] xmm, hash key
  my $T1         = $_[2];    #; [clobbered] xmm
  my $T2         = $_[3];    #; [clobbered] xmm
  my $T3         = $_[4];    #; [clobbered] xmm
  my $T4         = $_[5];    #; [clobbered] xmm
  my $T5         = $_[6];    #; [clobbered] xmm
  my $T6         = $_[7];    #; [clobbered] xmm

  # ZMM / YMM views of the temporary registers: the computation widens
  # from 1x128-bit to 2x128-bit to 4x128-bit as more hkey powers exist.
  my $ZT1 = &ZWORD($T1);
  my $ZT2 = &ZWORD($T2);
  my $ZT3 = &ZWORD($T3);
  my $ZT4 = &ZWORD($T4);
  my $ZT5 = &ZWORD($T5);
  my $ZT6 = &ZWORD($T6);

  my $YT1 = &YWORD($T1);
  my $YT2 = &YWORD($T2);
  my $YT3 = &YWORD($T3);
  my $YT4 = &YWORD($T4);
  my $YT5 = &YWORD($T5);
  my $YT6 = &YWORD($T6);

  # Broadcast HashKey across both 128-bit lanes of YT5, copy to YT4.
  $code .= <<___;
        vshufi32x4   \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
        vmovdqa      $YT5,$YT4
___

  # ;; calculate HashKey^2<<1 mod poly
  &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);

  $code .= <<___;
        vmovdqu64         $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
        vinserti64x2      \$1,$HK,$YT4,$YT5
        vmovdqa64         $YT5,$YT6                             # ;; YT6 = HashKey | HashKey^2
___

  # ;; use 2x128-bit computation
  # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
  &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3);    # ;; YT5 = HashKey^3 | HashKey^4

  $code .= <<___;
        vmovdqu64         $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}

        vinserti64x4      \$1,$YT6,$ZT5,$ZT5                    # ;; ZT5 = YT6 | YT5

        # ;; switch to 4x128-bit computations now
        vshufi64x2        \$0x00,$ZT5,$ZT5,$ZT4                 # ;; broadcast HashKey^4 across all ZT4
        vmovdqa64         $ZT5,$ZT6                             # ;; save HashKey^4 to HashKey^1 in ZT6
___

  # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= <<___;
        vmovdqu64         $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]} # ;; HashKey^8 to HashKey^5 in ZT5 now
        vshufi64x2        \$0x00,$ZT5,$ZT5,$ZT4                 # ;; broadcast HashKey^8 across all ZT4
___

  # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
  # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution

  # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
  &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64         $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";

  # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
  &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  $code .= "vmovdqu64         $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";

  # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys
}
1347
1348# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1349# ;; READ_SMALL_DATA_INPUT
1350# ;; Packs xmm register with data when data input is less or equal to 16 bytes
1351# ;; Returns 0 if data has length 0
1352# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub READ_SMALL_DATA_INPUT {
  my $OUTPUT = $_[0];    # [out] xmm register
  my $INPUT  = $_[1];    # [in] buffer pointer to read from
  my $LENGTH = $_[2];    # [in] number of bytes to read
  my $TMP1   = $_[3];    # [clobbered]
  my $TMP2   = $_[4];    # [clobbered]
  my $MASK   = $_[5];    # [out] k1 to k7 register to store the partial block mask

  # Emits code computing TMP2 = min(LENGTH, 16): cmp sets CF when
  # LENGTH < 16 (AT&T operand order), so cmovc replaces the 16 in TMP2.
  # The mask table is indexed by 2-byte entries, hence the scale of 2.
  $code .= <<___;
        mov               \$16,@{[DWORD($TMP2)]}
        lea               byte_len_to_mask_table(%rip),$TMP1
        cmp               $TMP2,$LENGTH
        cmovc             $LENGTH,$TMP2
___
  if ($win64) {
    # NOTE(review): on win64 the scaled-index load is replaced by two adds
    # (TMP1 += 2*TMP2) — presumably to sidestep a translator/assembler
    # addressing limitation; confirm against x86_64-xlate.pl.
    $code .= <<___;
        add               $TMP2,$TMP1
        add               $TMP2,$TMP1
        kmovw             ($TMP1),$MASK
___
  } else {
    $code .= "kmovw           ($TMP1,$TMP2,2),$MASK\n";
  }
  # Masked, zeroing load: bytes beyond LENGTH come out as zero in $OUTPUT.
  $code .= "vmovdqu8          ($INPUT),${OUTPUT}{$MASK}{z}\n";
}
1378
1379# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1380#  CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
1381#  Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
1382#  Output: The hash of the data (AAD_HASH).
1383# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1384sub CALC_AAD_HASH {
1385  my $A_IN       = $_[0];     # [in] AAD text pointer
1386  my $A_LEN      = $_[1];     # [in] AAD length
1387  my $AAD_HASH   = $_[2];     # [in/out] xmm ghash value
1388  my $GCM128_CTX = $_[3];     # [in] pointer to context
1389  my $ZT0        = $_[4];     # [clobbered] ZMM register
1390  my $ZT1        = $_[5];     # [clobbered] ZMM register
1391  my $ZT2        = $_[6];     # [clobbered] ZMM register
1392  my $ZT3        = $_[7];     # [clobbered] ZMM register
1393  my $ZT4        = $_[8];     # [clobbered] ZMM register
1394  my $ZT5        = $_[9];     # [clobbered] ZMM register
1395  my $ZT6        = $_[10];    # [clobbered] ZMM register
1396  my $ZT7        = $_[11];    # [clobbered] ZMM register
1397  my $ZT8        = $_[12];    # [clobbered] ZMM register
1398  my $ZT9        = $_[13];    # [clobbered] ZMM register
1399  my $ZT10       = $_[14];    # [clobbered] ZMM register
1400  my $ZT11       = $_[15];    # [clobbered] ZMM register
1401  my $ZT12       = $_[16];    # [clobbered] ZMM register
1402  my $ZT13       = $_[17];    # [clobbered] ZMM register
1403  my $ZT14       = $_[18];    # [clobbered] ZMM register
1404  my $ZT15       = $_[19];    # [clobbered] ZMM register
1405  my $ZT16       = $_[20];    # [clobbered] ZMM register
1406  my $T1         = $_[21];    # [clobbered] GP register
1407  my $T2         = $_[22];    # [clobbered] GP register
1408  my $T3         = $_[23];    # [clobbered] GP register
1409  my $MASKREG    = $_[24];    # [clobbered] mask register
1410
1411  my $HKEYS_READY = "%rbx";
1412
1413  my $SHFMSK = $ZT13;
1414
1415  my $label_suffix = $label_count++;
1416
1417  $code .= <<___;
1418        mov               $A_IN,$T1      # ; T1 = AAD
1419        mov               $A_LEN,$T2     # ; T2 = aadLen
1420        or                $T2,$T2
1421        jz                .L_CALC_AAD_done_${label_suffix}
1422
1423        xor               $HKEYS_READY,$HKEYS_READY
1424        vmovdqa64         SHUF_MASK(%rip),$SHFMSK
1425
1426.L_get_AAD_loop48x16_${label_suffix}:
1427        cmp               \$`(48*16)`,$T2
1428        jl                .L_exit_AAD_loop48x16_${label_suffix}
1429___
1430
1431  $code .= <<___;
1432        vmovdqu64         `64*0`($T1),$ZT1      # ; Blocks 0-3
1433        vmovdqu64         `64*1`($T1),$ZT2      # ; Blocks 4-7
1434        vmovdqu64         `64*2`($T1),$ZT3      # ; Blocks 8-11
1435        vmovdqu64         `64*3`($T1),$ZT4      # ; Blocks 12-15
1436        vpshufb           $SHFMSK,$ZT1,$ZT1
1437        vpshufb           $SHFMSK,$ZT2,$ZT2
1438        vpshufb           $SHFMSK,$ZT3,$ZT3
1439        vpshufb           $SHFMSK,$ZT4,$ZT4
1440___
1441
1442  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
1443  $code .= "mov     \$1,$HKEYS_READY\n";
1444
1445  &GHASH_16(
1446    "start",        $ZT5,           $ZT6,           $ZT7,
1447    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1448    &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
1449    $ZT8,     $ZT9,  $ZT10, $ZT11,
1450    $ZT12,    $ZT14, $ZT15, $ZT16,
1451    "NO_ZMM", $ZT1,  $ZT2,  $ZT3,
1452    $ZT4);
1453
1454  $code .= <<___;
1455        vmovdqu64         `16*16 + 64*0`($T1),$ZT1      # ; Blocks 16-19
1456        vmovdqu64         `16*16 + 64*1`($T1),$ZT2      # ; Blocks 20-23
1457        vmovdqu64         `16*16 + 64*2`($T1),$ZT3      # ; Blocks 24-27
1458        vmovdqu64         `16*16 + 64*3`($T1),$ZT4      # ; Blocks 28-31
1459        vpshufb           $SHFMSK,$ZT1,$ZT1
1460        vpshufb           $SHFMSK,$ZT2,$ZT2
1461        vpshufb           $SHFMSK,$ZT3,$ZT3
1462        vpshufb           $SHFMSK,$ZT4,$ZT4
1463___
1464
1465  &GHASH_16(
1466    "mid",          $ZT5,           $ZT6,           $ZT7,
1467    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1468    &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
1469    $ZT8,     $ZT9,  $ZT10, $ZT11,
1470    $ZT12,    $ZT14, $ZT15, $ZT16,
1471    "NO_ZMM", $ZT1,  $ZT2,  $ZT3,
1472    $ZT4);
1473
1474  $code .= <<___;
1475        vmovdqu64         `32*16 + 64*0`($T1),$ZT1      # ; Blocks 32-35
1476        vmovdqu64         `32*16 + 64*1`($T1),$ZT2      # ; Blocks 36-39
1477        vmovdqu64         `32*16 + 64*2`($T1),$ZT3      # ; Blocks 40-43
1478        vmovdqu64         `32*16 + 64*3`($T1),$ZT4      # ; Blocks 44-47
1479        vpshufb           $SHFMSK,$ZT1,$ZT1
1480        vpshufb           $SHFMSK,$ZT2,$ZT2
1481        vpshufb           $SHFMSK,$ZT3,$ZT3
1482        vpshufb           $SHFMSK,$ZT4,$ZT4
1483___
1484
1485  &GHASH_16(
1486    "end_reduce",   $ZT5,           $ZT6,           $ZT7,
1487    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1488    &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1489    $ZT8,     $ZT9,  $ZT10, $ZT11,
1490    $ZT12,    $ZT14, $ZT15, $ZT16,
1491    "NO_ZMM", $ZT1,  $ZT2,  $ZT3,
1492    $ZT4);
1493
1494  $code .= <<___;
1495        sub               \$`(48*16)`,$T2
1496        je                .L_CALC_AAD_done_${label_suffix}
1497
1498        add               \$`(48*16)`,$T1
1499        jmp               .L_get_AAD_loop48x16_${label_suffix}
1500
1501.L_exit_AAD_loop48x16_${label_suffix}:
1502        # ; Less than 48x16 bytes remaining
1503        cmp               \$`(32*16)`,$T2
1504        jl                .L_less_than_32x16_${label_suffix}
1505___
1506
1507  $code .= <<___;
1508        # ; Get next 16 blocks
1509        vmovdqu64         `64*0`($T1),$ZT1
1510        vmovdqu64         `64*1`($T1),$ZT2
1511        vmovdqu64         `64*2`($T1),$ZT3
1512        vmovdqu64         `64*3`($T1),$ZT4
1513        vpshufb           $SHFMSK,$ZT1,$ZT1
1514        vpshufb           $SHFMSK,$ZT2,$ZT2
1515        vpshufb           $SHFMSK,$ZT3,$ZT3
1516        vpshufb           $SHFMSK,$ZT4,$ZT4
1517___
1518
1519  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
1520  $code .= "mov     \$1,$HKEYS_READY\n";
1521
1522  &GHASH_16(
1523    "start",        $ZT5,           $ZT6,           $ZT7,
1524    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1525    &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1526    $ZT8,     $ZT9,  $ZT10, $ZT11,
1527    $ZT12,    $ZT14, $ZT15, $ZT16,
1528    "NO_ZMM", $ZT1,  $ZT2,  $ZT3,
1529    $ZT4);
1530
1531  $code .= <<___;
1532        vmovdqu64         `16*16 + 64*0`($T1),$ZT1
1533        vmovdqu64         `16*16 + 64*1`($T1),$ZT2
1534        vmovdqu64         `16*16 + 64*2`($T1),$ZT3
1535        vmovdqu64         `16*16 + 64*3`($T1),$ZT4
1536        vpshufb           $SHFMSK,$ZT1,$ZT1
1537        vpshufb           $SHFMSK,$ZT2,$ZT2
1538        vpshufb           $SHFMSK,$ZT3,$ZT3
1539        vpshufb           $SHFMSK,$ZT4,$ZT4
1540___
1541
1542  &GHASH_16(
1543    "end_reduce",   $ZT5,           $ZT6,           $ZT7,
1544    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1545    &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1546    $ZT8,     $ZT9,  $ZT10, $ZT11,
1547    $ZT12,    $ZT14, $ZT15, $ZT16,
1548    "NO_ZMM", $ZT1,  $ZT2,  $ZT3,
1549    $ZT4);
1550
1551  $code .= <<___;
1552        sub               \$`(32*16)`,$T2
1553        je                .L_CALC_AAD_done_${label_suffix}
1554
1555        add               \$`(32*16)`,$T1
1556        jmp               .L_less_than_16x16_${label_suffix}
1557
1558.L_less_than_32x16_${label_suffix}:
1559        cmp               \$`(16*16)`,$T2
1560        jl                .L_less_than_16x16_${label_suffix}
1561        # ; Get next 16 blocks
1562        vmovdqu64         `64*0`($T1),$ZT1
1563        vmovdqu64         `64*1`($T1),$ZT2
1564        vmovdqu64         `64*2`($T1),$ZT3
1565        vmovdqu64         `64*3`($T1),$ZT4
1566        vpshufb           $SHFMSK,$ZT1,$ZT1
1567        vpshufb           $SHFMSK,$ZT2,$ZT2
1568        vpshufb           $SHFMSK,$ZT3,$ZT3
1569        vpshufb           $SHFMSK,$ZT4,$ZT4
1570___
1571
1572  # ; This code path does not use more than 16 hkeys, so they can be taken from the context
1573  # ; (not from the stack storage)
1574  &GHASH_16(
1575    "start_reduce", $ZT5,           $ZT6,           $ZT7,
1576    "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
1577    &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
1578    $ZT8,     $ZT9,  $ZT10, $ZT11,
1579    $ZT12,    $ZT14, $ZT15, $ZT16,
1580    "NO_ZMM", $ZT1,  $ZT2,  $ZT3,
1581    $ZT4);
1582
1583  $code .= <<___;
1584        sub               \$`(16*16)`,$T2
1585        je                .L_CALC_AAD_done_${label_suffix}
1586
1587        add               \$`(16*16)`,$T1
1588        # ; Less than 16x16 bytes remaining
1589.L_less_than_16x16_${label_suffix}:
1590        # ;; prep mask source address
1591        lea               byte64_len_to_mask_table(%rip),$T3
1592        lea               ($T3,$T2,8),$T3
1593
1594        # ;; calculate number of blocks to ghash (including partial bytes)
1595        add               \$15,@{[DWORD($T2)]}
1596        shr               \$4,@{[DWORD($T2)]}
1597        cmp               \$2,@{[DWORD($T2)]}
1598        jb                .L_AAD_blocks_1_${label_suffix}
1599        je                .L_AAD_blocks_2_${label_suffix}
1600        cmp               \$4,@{[DWORD($T2)]}
1601        jb                .L_AAD_blocks_3_${label_suffix}
1602        je                .L_AAD_blocks_4_${label_suffix}
1603        cmp               \$6,@{[DWORD($T2)]}
1604        jb                .L_AAD_blocks_5_${label_suffix}
1605        je                .L_AAD_blocks_6_${label_suffix}
1606        cmp               \$8,@{[DWORD($T2)]}
1607        jb                .L_AAD_blocks_7_${label_suffix}
1608        je                .L_AAD_blocks_8_${label_suffix}
1609        cmp               \$10,@{[DWORD($T2)]}
1610        jb                .L_AAD_blocks_9_${label_suffix}
1611        je                .L_AAD_blocks_10_${label_suffix}
1612        cmp               \$12,@{[DWORD($T2)]}
1613        jb                .L_AAD_blocks_11_${label_suffix}
1614        je                .L_AAD_blocks_12_${label_suffix}
1615        cmp               \$14,@{[DWORD($T2)]}
1616        jb                .L_AAD_blocks_13_${label_suffix}
1617        je                .L_AAD_blocks_14_${label_suffix}
1618        cmp               \$15,@{[DWORD($T2)]}
1619        je                .L_AAD_blocks_15_${label_suffix}
1620___
1621
1622  # ;; fall through for 16 blocks
1623
1624  # ;; The flow of each of these cases is identical:
1625  # ;; - load blocks plain text
1626  # ;; - shuffle loaded blocks
1627  # ;; - xor in current hash value into block 0
  # ;; - perform multiplications with the ghash keys
1629  # ;; - jump to reduction code
1630
1631  for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
1632    $code .= ".L_AAD_blocks_${aad_blocks}_${label_suffix}:\n";
1633    if ($aad_blocks > 12) {
1634      $code .= "sub               \$`12*16*8`, $T3\n";
1635    } elsif ($aad_blocks > 8) {
1636      $code .= "sub               \$`8*16*8`, $T3\n";
1637    } elsif ($aad_blocks > 4) {
1638      $code .= "sub               \$`4*16*8`, $T3\n";
1639    }
1640    $code .= "kmovq             ($T3),$MASKREG\n";
1641
1642    &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);
1643
1644    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
1645      $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
1646
1647    &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
1648      $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);
1649
1650    if ($aad_blocks > 1) {
1651
1652      # ;; fall through to CALC_AAD_done in 1 block case
1653      $code .= "jmp           .L_CALC_AAD_done_${label_suffix}\n";
1654    }
1655
1656  }
1657  $code .= ".L_CALC_AAD_done_${label_suffix}:\n";
1658
1659  # ;; result in AAD_HASH
1660}
1661
1662# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1663# ;; PARTIAL_BLOCK
1664# ;; Handles encryption/decryption and the tag partial blocks between
1665# ;; update calls.
# ;; Requires the input data to be at least 1 byte long.
1667# ;; Output:
1668# ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
1669# ;; AAD_HASH and updated GCM128_CTX
1670# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub PARTIAL_BLOCK {
  my $GCM128_CTX     = $_[0];     # [in] key pointer
  my $PBLOCK_LEN     = $_[1];     # [in] partial block length
  my $CIPH_PLAIN_OUT = $_[2];     # [in] output buffer
  my $PLAIN_CIPH_IN  = $_[3];     # [in] input buffer
  my $PLAIN_CIPH_LEN = $_[4];     # [in] buffer length
  my $DATA_OFFSET    = $_[5];     # [out] data offset (gets set)
  my $AAD_HASH       = $_[6];     # [out] updated GHASH value
  my $ENC_DEC        = $_[7];     # [in] cipher direction
  my $GPTMP0         = $_[8];     # [clobbered] GP temporary register
  my $GPTMP1         = $_[9];     # [clobbered] GP temporary register
  my $GPTMP2         = $_[10];    # [clobbered] GP temporary register
  my $ZTMP0          = $_[11];    # [clobbered] ZMM temporary register
  my $ZTMP1          = $_[12];    # [clobbered] ZMM temporary register
  my $ZTMP2          = $_[13];    # [clobbered] ZMM temporary register
  my $ZTMP3          = $_[14];    # [clobbered] ZMM temporary register
  my $ZTMP4          = $_[15];    # [clobbered] ZMM temporary register
  my $ZTMP5          = $_[16];    # [clobbered] ZMM temporary register
  my $ZTMP6          = $_[17];    # [clobbered] ZMM temporary register
  my $ZTMP7          = $_[18];    # [clobbered] ZMM temporary register
  my $MASKREG        = $_[19];    # [clobbered] mask temporary register

  # ;; 128-bit (XMM) views of the ZMM temporaries - partial block work is
  # ;; single-block, so only the low lanes are used
  my $XTMP0 = &XWORD($ZTMP0);
  my $XTMP1 = &XWORD($ZTMP1);
  my $XTMP2 = &XWORD($ZTMP2);
  my $XTMP3 = &XWORD($ZTMP3);
  my $XTMP4 = &XWORD($ZTMP4);
  my $XTMP5 = &XWORD($ZTMP5);
  my $XTMP6 = &XWORD($ZTMP6);
  my $XTMP7 = &XWORD($ZTMP7);

  # ;; GP register aliases; note that $LENGTH shares a register with
  # ;; $DATA_OFFSET, so the data offset output is produced in that register
  my $LENGTH = $DATA_OFFSET;
  my $IA0    = $GPTMP1;
  my $IA1    = $GPTMP2;
  my $IA2    = $GPTMP0;

  # ;; unique numeric suffix so emitted labels don't collide across expansions
  my $label_suffix = $label_count++;

  $code .= <<___;
        # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero
        mov             ($PBLOCK_LEN),$LENGTH
        or              $LENGTH,$LENGTH
        je              .L_partial_block_done_${label_suffix}         #  ;Leave Macro if no partial blocks
___

  # ;; masked load of the new input bytes into $XTMP0
  # ;; (presumably clipped to $PLAIN_CIPH_LEN - see READ_SMALL_DATA_INPUT)
  &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);

  $code .= <<___;
        # ;; XTMP1 = my_ctx_data.partial_block_enc_key
        vmovdqu64         $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1
        vmovdqu64         @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2

        # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes
        # ;; (16 - $LENGTH) is the number of bytes in plaintext mod 16)
        lea               SHIFT_MASK(%rip),$IA0
        add               $LENGTH,$IA0
        vmovdqu64         ($IA0),$XTMP3         # ; shift right shuffle mask
        vpshufb           $XTMP3,$XTMP1,$XTMP1
___

  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        # ;;  keep copy of cipher text in $XTMP4
        vmovdqa64         $XTMP0,$XTMP4
___
  }
  $code .= <<___;
        vpxorq            $XTMP0,$XTMP1,$XTMP1  # ; Ciphertext XOR E(K, Yn)
        # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block
        # ;; Determine if partial block is not being filled and shift mask accordingly
___
  # ;; $IA1 = $PLAIN_CIPH_LEN + $LENGTH; on win64 $PLAIN_CIPH_LEN appears to
  # ;; be a memory operand here, so lea cannot be used
  # ;; NOTE(review): confirm against the caller's argument setup
  if ($win64) {
    $code .= <<___;
        mov               $PLAIN_CIPH_LEN,$IA1
        add               $LENGTH,$IA1
___
  } else {
    $code .= "lea               ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n";
  }
  $code .= <<___;
        sub               \$16,$IA1
        jge               .L_no_extra_mask_${label_suffix}
        sub               $IA1,$IA0
.L_no_extra_mask_${label_suffix}:
        # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1
        # ;; - mask out bottom $LENGTH bytes of $XTMP1
        # ;; sizeof(SHIFT_MASK) == 16 bytes
        vmovdqu64         16($IA0),$XTMP0
        vpand             $XTMP0,$XTMP1,$XTMP1
___

  # ;; fold the (masked) block into the current GHASH value; for DEC the
  # ;; hash input is the original cipher text kept in $XTMP4, for ENC it is
  # ;; the freshly produced cipher text in $XTMP1
  if ($ENC_DEC eq "DEC") {
    $code .= <<___;
        vpand             $XTMP0,$XTMP4,$XTMP4
        vpshufb           SHUF_MASK(%rip),$XTMP4,$XTMP4
        vpshufb           $XTMP3,$XTMP4,$XTMP4
        vpxorq            $XTMP4,$AAD_HASH,$AAD_HASH
___
  } else {
    $code .= <<___;
        vpshufb           SHUF_MASK(%rip),$XTMP1,$XTMP1
        vpshufb           $XTMP3,$XTMP1,$XTMP1
        vpxorq            $XTMP1,$AAD_HASH,$AAD_HASH
___
  }
  $code .= <<___;
        cmp               \$0,$IA1
        jl                .L_partial_incomplete_${label_suffix}
___

  # ;; GHASH computation for the last <16 Byte block
  &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);

  # ;; the partial block got filled completely: clear [PBlockLen] and set
  # ;; $LENGTH = 16 - old_length, i.e. the number of new bytes to write out
  $code .= <<___;
        movq              \$0, ($PBLOCK_LEN)
        # ;;  Set $LENGTH to be the number of bytes to write out
        mov               $LENGTH,$IA0
        mov               \$16,$LENGTH
        sub               $IA0,$LENGTH
        jmp               .L_enc_dec_done_${label_suffix}

.L_partial_incomplete_${label_suffix}:
___
  # ;; block is still not full: [PBlockLen] += $PLAIN_CIPH_LEN
  # ;; (win64 goes through a scratch register, presumably because
  # ;; $PLAIN_CIPH_LEN is itself a memory operand there - confirm)
  if ($win64) {
    $code .= <<___;
        mov               $PLAIN_CIPH_LEN,$IA0
        add               $IA0,($PBLOCK_LEN)
___
  } else {
    $code .= "add               $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n";
  }
  $code .= <<___;
        mov               $PLAIN_CIPH_LEN,$LENGTH

.L_enc_dec_done_${label_suffix}:
        # ;; output encrypted Bytes

        lea               byte_len_to_mask_table(%rip),$IA0
        kmovw             ($IA0,$LENGTH,2),$MASKREG
        vmovdqu64         $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)
___

  if ($ENC_DEC eq "ENC") {
    $code .= <<___;
        # ;; shuffle XTMP1 back to output as ciphertext
        vpshufb           SHUF_MASK(%rip),$XTMP1,$XTMP1
        vpshufb           $XTMP3,$XTMP1,$XTMP1
___
  }
  # ;; masked store of the produced bytes to the output buffer
  $code .= <<___;
        mov               $CIPH_PLAIN_OUT,$IA0
        vmovdqu8          $XTMP1,($IA0){$MASKREG}
.L_partial_block_done_${label_suffix}:
___
}
1826
1827# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1828# ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation
sub INITIAL_BLOCKS_PARTIAL_CIPHER {
  my $AES_KEYS        = $_[0];     # [in] key pointer
  my $GCM128_CTX      = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT  = $_[2];     # [in] text output pointer
  my $PLAIN_CIPH_IN   = $_[3];     # [in] text input pointer
  my $LENGTH          = $_[4];     # [in/clobbered] length in bytes
  my $DATA_OFFSET     = $_[5];     # [in/out] current data offset (updated)
  my $NUM_BLOCKS      = $_[6];     # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $CTR             = $_[7];     # [in/out] current counter value
  my $ENC_DEC         = $_[8];     # [in] cipher direction (ENC/DEC)
  my $DAT0            = $_[9];     # [out] ZMM with cipher text shuffled for GHASH
  my $DAT1            = $_[10];    # [out] ZMM with cipher text shuffled for GHASH
  my $DAT2            = $_[11];    # [out] ZMM with cipher text shuffled for GHASH
  my $DAT3            = $_[12];    # [out] ZMM with cipher text shuffled for GHASH
  my $LAST_CIPHER_BLK = $_[13];    # [out] XMM to put ciphered counter block partially xor'ed with text
  my $LAST_GHASH_BLK  = $_[14];    # [out] XMM to put last cipher text block shuffled for GHASH
  my $CTR0            = $_[15];    # [clobbered] ZMM temporary
  my $CTR1            = $_[16];    # [clobbered] ZMM temporary
  my $CTR2            = $_[17];    # [clobbered] ZMM temporary
  my $CTR3            = $_[18];    # [clobbered] ZMM temporary
  my $ZT1             = $_[19];    # [clobbered] ZMM temporary
  my $IA0             = $_[20];    # [clobbered] GP temporary
  my $IA1             = $_[21];    # [clobbered] GP temporary
  my $MASKREG         = $_[22];    # [clobbered] mask register
  my $SHUFMASK        = $_[23];    # [out] ZMM loaded with BE/LE shuffle mask

  # ;; load the BE/LE byte-swap mask at the narrowest register width that
  # ;; covers NUM_BLOCKS (XMM for 1 block, YMM for 2, ZMM otherwise)
  if ($NUM_BLOCKS == 1) {
    $code .= "vmovdqa64         SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n";
  } elsif ($NUM_BLOCKS == 2) {
    $code .= "vmovdqa64         SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n";
  } else {
    $code .= "vmovdqa64         SHUF_MASK(%rip),$SHUFMASK\n";
  }

  # ;; prepare AES counter blocks
  # ;; (>2 blocks: broadcast $CTR to all lanes, then add per-lane increments
  # ;; from the ddq_add_* tables to get counter+1 ... counter+NUM_BLOCKS)
  if ($NUM_BLOCKS == 1) {
    $code .= "vpaddd            ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n";
  } elsif ($NUM_BLOCKS == 2) {
    $code .= <<___;
        vshufi64x2        \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]}
        vpaddd            ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]}
___
  } else {
    $code .= <<___;
        vshufi64x2        \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]}
        vpaddd            ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0
___
    if ($NUM_BLOCKS > 4) {
      $code .= "vpaddd            ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n";
    }
    if ($NUM_BLOCKS > 8) {
      $code .= "vpaddd            ddq_add_8888(%rip),$CTR0,$CTR2\n";
    }
    if ($NUM_BLOCKS > 12) {
      $code .= "vpaddd            ddq_add_8888(%rip),$CTR1,$CTR3\n";
    }
  }

  # ;; get load/store mask
  # ;; the mask only needs to cover the trailing (possibly partial) 64-byte
  # ;; chunk, so full leading chunks are subtracted from LENGTH before the
  # ;; byte64_len_to_mask_table lookup (8 bytes per table entry)
  $code .= <<___;
        lea               byte64_len_to_mask_table(%rip),$IA0
        mov               $LENGTH,$IA1
___
  if ($NUM_BLOCKS > 12) {
    $code .= "sub               \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub               \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub               \$`1*64`,$IA1\n";
  }
  $code .= "kmovq             ($IA0,$IA1,8),$MASKREG\n";

  # ;; extract new counter value
  # ;; shuffle the counters for AES rounds
  # ;; (the last generated counter block becomes the new $CTR value)
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n";
  }
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1,     $CTR2,     $CTR3,     $CTR0,
    $CTR1,       $CTR2,     $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);

  # ;; load plain/cipher text
  &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG);

  # ;; AES rounds and XOR with plain/cipher text
  # ;; one iteration per round key (NROUNDS+2 keys incl. the whitening key);
  # ;; round index $j is passed so the helper can treat first/last rounds
  # ;; specially - see ZMM_AESENC_ROUND_BLOCKS_0_16
  foreach my $j (0 .. ($NROUNDS + 1)) {
    $code .= "vbroadcastf64x2    `($j * 16)`($AES_KEYS),$ZT1\n";
    &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j,
      $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS);
  }

  # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  # ;; - this is needed for partial block cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n";
  }

  # ;; write cipher/plain text back to output and
  $code .= "mov       $CIPH_PLAIN_OUT,$IA0\n";
  &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG);

  # ;; zero bytes outside the mask before hashing
  # ;; (only the register holding the final, possibly partial block needs it)
  if ($NUM_BLOCKS <= 4) {
    $code .= "vmovdqu8          $CTR0,${CTR0}{$MASKREG}{z}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vmovdqu8          $CTR1,${CTR1}{$MASKREG}{z}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vmovdqu8          $CTR2,${CTR2}{$MASKREG}{z}\n";
  } else {
    $code .= "vmovdqu8          $CTR3,${CTR3}{$MASKREG}{z}\n";
  }

  # ;; Shuffle the cipher text blocks for hashing part
  # ;; ZT5 and ZT6 are expected outputs with blocks for hashing
  if ($ENC_DEC eq "DEC") {

    # ;; Decrypt case
    # ;; - cipher blocks are in ZT5 & ZT6
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1,     $DAT2,     $DAT3,     $DAT0,
      $DAT1,       $DAT2,     $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  } else {

    # ;; Encrypt case
    # ;; - cipher blocks are in CTR0-CTR3
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1,     $DAT2,     $DAT3,     $CTR0,
      $CTR1,       $CTR2,     $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  }

  # ;; Extract the last block for partials and multi_call cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n";
  }

}
1983
1984# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1985# ;; Computes GHASH on 1 to 16 blocks
sub INITIAL_BLOCKS_PARTIAL_GHASH {
  my $AES_KEYS        = $_[0];     # [in] key pointer
  my $GCM128_CTX      = $_[1];     # [in] context pointer
  my $LENGTH          = $_[2];     # [in/clobbered] length in bytes
  my $NUM_BLOCKS      = $_[3];     # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  my $HASH_IN_OUT     = $_[4];     # [in/out] XMM ghash in/out value
  my $ENC_DEC         = $_[5];     # [in] cipher direction (ENC/DEC)
  my $DAT0            = $_[6];     # [in] ZMM with cipher text shuffled for GHASH
  my $DAT1            = $_[7];     # [in] ZMM with cipher text shuffled for GHASH
  my $DAT2            = $_[8];     # [in] ZMM with cipher text shuffled for GHASH
  my $DAT3            = $_[9];     # [in] ZMM with cipher text shuffled for GHASH
  my $LAST_CIPHER_BLK = $_[10];    # [in] XMM with ciphered counter block partially xor'ed with text
  my $LAST_GHASH_BLK  = $_[11];    # [in] XMM with last cipher text block shuffled for GHASH
  my $ZT0             = $_[12];    # [clobbered] ZMM temporary
  my $ZT1             = $_[13];    # [clobbered] ZMM temporary
  my $ZT2             = $_[14];    # [clobbered] ZMM temporary
  my $ZT3             = $_[15];    # [clobbered] ZMM temporary
  my $ZT4             = $_[16];    # [clobbered] ZMM temporary
  my $ZT5             = $_[17];    # [clobbered] ZMM temporary
  my $ZT6             = $_[18];    # [clobbered] ZMM temporary
  my $ZT7             = $_[19];    # [clobbered] ZMM temporary
  my $ZT8             = $_[20];    # [clobbered] ZMM temporary
  my $PBLOCK_LEN      = $_[21];    # [in] partial block length
  my $GH              = $_[22];    # [in] ZMM with hi product part
  my $GM              = $_[23];    # [in] ZMM with mid product part
  my $GL              = $_[24];    # [in] ZMM with lo product part

  # ;; Calling convention (see scalar(@_) checks below):
  # ;; - 22 args (no GH/GM/GL): start a fresh GHASH computation
  # ;; - 25 args: fold in hi/mid/lo partial products carried over from an
  # ;;   earlier unreduced GHASH pass
  # ;; NOTE(review): $AES_KEYS and $ENC_DEC are not referenced in this body

  my $label_suffix = $label_count++;

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; - Hash all but the last partial block of data
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  # ;; update data offset
  if ($NUM_BLOCKS > 1) {

    # ;; The final block of data may be <16B
    $code .= "sub               \$16 * ($NUM_BLOCKS - 1),$LENGTH\n";
  }

  if ($NUM_BLOCKS < 16) {
    $code .= <<___;
        # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
        # ;;      This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
        cmp               \$16,$LENGTH
        jl                .L_small_initial_partial_block_${label_suffix}

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;;; Handle a full length final block - encrypt and hash all blocks
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        sub               \$16,$LENGTH
        movq              \$0,($PBLOCK_LEN)
___

    # ;; Hash all of the data
    if (scalar(@_) == 22) {

      # ;; start GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS);
    } elsif (scalar(@_) == 25) {

      # ;; continue GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL);
    }
    $code .= "jmp           .L_small_initial_compute_done_${label_suffix}\n";
  }

  $code .= <<___;
.L_small_initial_partial_block_${label_suffix}:

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;;; Handle ghash for a <16B final block
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        # ;; As it's an init / update / finalize series we need to leave the
        # ;; last block if it's less than a full block of data.

        mov               $LENGTH,($PBLOCK_LEN)
        vmovdqu64         $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)
___

  # ;; $k = number of full blocks to hash now; the trailing partial block is
  # ;; deferred (it gets folded in at finalize time)
  my $k                  = ($NUM_BLOCKS - 1);
  my $last_block_to_hash = 1;
  if (($NUM_BLOCKS > $last_block_to_hash)) {

    # ;; ZT12-ZT20 - temporary registers
    if (scalar(@_) == 22) {

      # ;; start GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k);
    } elsif (scalar(@_) == 25) {

      # ;; continue GHASH compute
      &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
        $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL);
    }

    # ;; just fall through no jmp needed
  } else {

    # ;; single-block case: no new blocks are hashed here, but pending
    # ;; partial products (25-arg call) still need to be reduced
    if (scalar(@_) == 25) {
      $code .= <<___;
        # ;; Reduction is required in this case.
        # ;; Integrate GM into GH and GL.
        vpsrldq           \$8,$GM,$ZT0
        vpslldq           \$8,$GM,$ZT1
        vpxorq            $ZT0,$GH,$GH
        vpxorq            $ZT1,$GL,$GL
___

      # ;; Add GH and GL 128-bit words horizontally
      &VHPXORI4x128($GH, $ZT0);
      &VHPXORI4x128($GL, $ZT1);

      # ;; 256-bit to 128-bit reduction
      $code .= "vmovdqa64         POLY2(%rip),@{[XWORD($ZT0)]}\n";
      &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
    }
    $code .= <<___;
        # ;; Record that a reduction is not needed -
        # ;; In this case no hashes are computed because there
        # ;; is only one initial block and it is < 16B in length.
        # ;; We only need to check if a reduction is needed if
        # ;; initial_blocks == 1 and init/update/final is being used.
        # ;; In this case we may just have a partial block, and that
        # ;; gets hashed in finalize.

        # ;; The hash should end up in HASH_IN_OUT.
        # ;; The only way we should get here is if there is
        # ;; a partial block of data, so xor that into the hash.
        vpxorq            $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT
        # ;; The result is in $HASH_IN_OUT
        jmp               .L_after_reduction_${label_suffix}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; After GHASH reduction
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  $code .= ".L_small_initial_compute_done_${label_suffix}:\n";

  # ;; If using init/update/finalize, we need to xor any partial block data
  # ;; into the hash.
  if ($NUM_BLOCKS > 1) {

    # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place
    if ($NUM_BLOCKS != 16) {
      $code .= <<___;
        # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen] is never zero
        or                $LENGTH,$LENGTH
        je                .L_after_reduction_${label_suffix}
___
    }
    $code .= "vpxorq            $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n";
  }

  $code .= ".L_after_reduction_${label_suffix}:\n";

  # ;; Final hash is now in HASH_IN_OUT
}
2151
2152# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2153# ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
2154# ;; It may look similar to INITIAL_BLOCKS but its usage is different:
2155# ;; - first encrypts/decrypts required number of blocks and then
2156# ;;   ghashes these blocks
2157# ;; - Small packets or left over data chunks (<256 bytes)
2158# ;; - Remaining data chunks below 256 bytes (multi buffer code)
2159# ;;
2160# ;; num_initial_blocks is expected to include the partial final block
2161# ;; in the count.
# ;; Cipher-then-hash driver for short data (1 to 16 blocks, with the
# ;; final block possibly partial): first runs the AES-CTR pass over the
# ;; blocks, then GHASH-es the resulting cipher text.
sub INITIAL_BLOCKS_PARTIAL {
  my (
    $AES_KEYS,          # [in] key pointer
    $GCM128_CTX,        # [in] context pointer
    $CIPH_PLAIN_OUT,    # [in] text output pointer
    $PLAIN_CIPH_IN,     # [in] text input pointer
    $LENGTH,            # [in/clobbered] length in bytes
    $DATA_OFFSET,       # [in/out] current data offset (updated)
    $NUM_BLOCKS,        # [in] 1 to 16 blocks only (never 0)
    $CTR,               # [in/out] current counter value
    $HASH_IN_OUT,       # [in/out] XMM ghash in/out value
    $ENC_DEC,           # [in] cipher direction (ENC/DEC)
    $CTR0, $CTR1, $CTR2, $CTR3,    # [clobbered] ZMM temporaries
    $DAT0, $DAT1, $DAT2, $DAT3,    # [clobbered] ZMM temporaries
    $LAST_CIPHER_BLK,   # [clobbered] ZMM temporary
    $LAST_GHASH_BLK,    # [clobbered] ZMM temporary
    $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,  # [clobbered] ZMM temporaries
    $IA0, $IA1,         # [clobbered] GP temporaries
    $MASKREG,           # [clobbered] mask register
    $SHUFMASK,          # [clobbered] ZMM for BE/LE shuffle mask
    $PBLOCK_LEN         # [in] partial block length
  ) = @_;

  # ;; XMM (low 128-bit) views of the two single-block values handed from
  # ;; the cipher phase to the hash phase
  my $LAST_CIPHER_BLK_XMM = &XWORD($LAST_CIPHER_BLK);
  my $LAST_GHASH_BLK_XMM  = &XWORD($LAST_GHASH_BLK);

  # ;; phase 1: AES-CTR over NUM_BLOCKS blocks; leaves cipher text shuffled
  # ;; for GHASH in DAT0-DAT3
  &INITIAL_BLOCKS_PARTIAL_CIPHER(
    $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
    $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR,
    $ENC_DEC, $DAT0, $DAT1, $DAT2, $DAT3,
    $LAST_CIPHER_BLK_XMM, $LAST_GHASH_BLK_XMM,
    $CTR0, $CTR1, $CTR2, $CTR3, $ZT0,
    $IA0, $IA1, $MASKREG, $SHUFMASK);

  # ;; phase 2: GHASH the blocks (22-arg form => fresh GHASH computation)
  &INITIAL_BLOCKS_PARTIAL_GHASH(
    $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
    $HASH_IN_OUT, $ENC_DEC, $DAT0, $DAT1, $DAT2, $DAT3,
    $LAST_CIPHER_BLK_XMM, $LAST_GHASH_BLK_XMM,
    $CTR0, $CTR1, $CTR2, $CTR3,
    $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN);
}
2206
2207# ;; ===========================================================================
2208# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
2209# ;; followed with GHASH of the N blocks.
sub GHASH_16_ENCRYPT_N_GHASH_N {

  # ;; Emits code that GHASHes 16 ciphertext blocks previously stored on the
  # ;; stack frame (at GHASHIN_BLK_OFFSET), stitching those GHASH steps
  # ;; instruction-by-instruction with AES-CTR encryption of the final
  # ;; NUM_BLOCKS (1 to 16) blocks of the message.  GHASH of the final blocks
  # ;; themselves is then completed via INITIAL_BLOCKS_PARTIAL_GHASH.
  # ;; The reduction of the accumulated high/mid/low GHASH sums is performed
  # ;; here only for the "start_reduce" / "end_reduce" GHASH_TYPE variants;
  # ;; otherwise the sums are carried forward in TO_REDUCE_H/M/L.
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $GCM128_CTX         = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT     = $_[2];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[3];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[4];     # [in] data offset
  my $LENGTH             = $_[5];     # [in] data length
  my $CTR_BE             = $_[6];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[7];     # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET     = $_[8];     # [in] numerical offset for the highest hash key
                                      # (can be in form of register or numerical value)
  my $GHASHIN_BLK_OFFSET = $_[9];     # [in] numerical offset for GHASH blocks in
  my $SHFMSK             = $_[10];    # [in] ZMM with byte swap mask for pshufb
  my $B00_03             = $_[11];    # [clobbered] temporary ZMM
  my $B04_07             = $_[12];    # [clobbered] temporary ZMM
  my $B08_11             = $_[13];    # [clobbered] temporary ZMM
  my $B12_15             = $_[14];    # [clobbered] temporary ZMM
  my $GH1H_UNUSED        = $_[15];    # [clobbered] temporary ZMM
  my $GH1L               = $_[16];    # [clobbered] temporary ZMM
  my $GH1M               = $_[17];    # [clobbered] temporary ZMM
  my $GH1T               = $_[18];    # [clobbered] temporary ZMM
  my $GH2H               = $_[19];    # [clobbered] temporary ZMM
  my $GH2L               = $_[20];    # [clobbered] temporary ZMM
  my $GH2M               = $_[21];    # [clobbered] temporary ZMM
  my $GH2T               = $_[22];    # [clobbered] temporary ZMM
  my $GH3H               = $_[23];    # [clobbered] temporary ZMM
  my $GH3L               = $_[24];    # [clobbered] temporary ZMM
  my $GH3M               = $_[25];    # [clobbered] temporary ZMM
  my $GH3T               = $_[26];    # [clobbered] temporary ZMM
  my $AESKEY1            = $_[27];    # [clobbered] temporary ZMM
  my $AESKEY2            = $_[28];    # [clobbered] temporary ZMM
  my $GHKEY1             = $_[29];    # [clobbered] temporary ZMM
  my $GHKEY2             = $_[30];    # [clobbered] temporary ZMM
  my $GHDAT1             = $_[31];    # [clobbered] temporary ZMM
  my $GHDAT2             = $_[32];    # [clobbered] temporary ZMM
  my $ZT01               = $_[33];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[34];    # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234         = $_[35];    # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $GHASH_TYPE         = $_[36];    # [in] "start", "start_reduce", "mid", "end_reduce"
  my $TO_REDUCE_L        = $_[37];    # [in] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[38];    # [in] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[39];    # [in] ZMM for medium 4x128-bit GHASH sum
  my $ENC_DEC            = $_[40];    # [in] cipher direction
  my $HASH_IN_OUT        = $_[41];    # [in/out] XMM ghash in/out value
  my $IA0                = $_[42];    # [clobbered] GP temporary
  my $IA1                = $_[43];    # [clobbered] GP temporary
  my $MASKREG            = $_[44];    # [clobbered] mask register
  my $NUM_BLOCKS         = $_[45];    # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
  my $PBLOCK_LEN         = $_[46];    # [in] partial block length

  # ;; Generation-time sanity check ($NUM_BLOCKS is a Perl constant, not a
  # ;; register); the generator loop in GCM_ENC_DEC_LAST passes 1 to 16.
  die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $label_suffix = $label_count++;

  my $GH1H = $HASH_IN_OUT;

  # ; this is to avoid additional move in do_reduction case

  my $LAST_GHASH_BLK  = $GH1L;
  my $LAST_CIPHER_BLK = $GH1T;

  my $RED_POLY = $GH2T;
  my $RED_P1   = $GH2L;
  my $RED_T1   = $GH2H;
  my $RED_T2   = $GH2M;

  my $DATA1 = $GH3H;
  my $DATA2 = $GH3L;
  my $DATA3 = $GH3M;
  my $DATA4 = $GH3T;

  # ;; do reduction after the 16 blocks ?
  my $do_reduction = 0;

  # ;; is 16 block chunk a start?
  my $is_start = 0;

  if ($GHASH_TYPE eq "start_reduce") {
    $is_start     = 1;
    $do_reduction = 1;
  }

  if ($GHASH_TYPE eq "start") {
    $is_start = 1;
  }

  if ($GHASH_TYPE eq "end_reduce") {
    $do_reduction = 1;
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; - get load/store mask
  # ;; - load plain/cipher text
  # ;; get load/store mask
  $code .= <<___;
        lea               byte64_len_to_mask_table(%rip),$IA0
        mov               $LENGTH,$IA1
___

  # ;; reduce $IA1 to the byte count within the last (possibly partial)
  # ;; 64-byte chunk; the table is indexed by that remainder (8 bytes/entry)
  if ($NUM_BLOCKS > 12) {
    $code .= "sub               \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub               \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub               \$`1*64`,$IA1\n";
  }
  $code .= "kmovq             ($IA0,$IA1,8),$MASKREG\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; prepare counter blocks

  # ;; if the low 8-bit counter byte would wrap while producing the next
  # ;; NUM_BLOCKS blocks, take the slow path that byte-swaps and performs a
  # ;; full big-endian 32-bit counter addition
  $code .= <<___;
        cmp               \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
        jae               .L_16_blocks_overflow_${label_suffix}
___

  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07,     $B08_11,    $B12_15,    $CTR_BE,
    $B00_03,     $B04_07,  $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
  $code .= <<___;
        jmp               .L_16_blocks_ok_${label_suffix}

.L_16_blocks_overflow_${label_suffix}:
        vpshufb           $SHFMSK,$CTR_BE,$CTR_BE
        vpaddd            ddq_add_1234(%rip),$CTR_BE,$B00_03
___
  if ($NUM_BLOCKS > 4) {
    $code .= <<___;
        vmovdqa64         ddq_add_4444(%rip),$B12_15
        vpaddd            $B12_15,$B00_03,$B04_07
___
  }
  if ($NUM_BLOCKS > 8) {
    $code .= "vpaddd            $B12_15,$B04_07,$B08_11\n";
  }
  if ($NUM_BLOCKS > 12) {
    $code .= "vpaddd            $B12_15,$B08_11,$B12_15\n";
  }
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07,     $B08_11,   $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  $code .= <<___;
.L_16_blocks_ok_${label_suffix}:

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; - pre-load constants
        # ;; - add current hash into the 1st block
        vbroadcastf64x2    `(16 * 0)`($AES_KEYS),$AESKEY1
___
  # ;; at a "start" chunk the running hash value is folded into the first
  # ;; GHASH input block; otherwise the stored block is used as-is
  if ($is_start != 0) {
    $code .= "vpxorq            `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
  } else {
    $code .= "vmovdqa64         `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  }

  $code .= "vmovdqu64         @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; save counter for the next round
  # ;; increment counter overflow check register
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
  }
  # ;; broadcast the just-extracted last counter block (lane 0) into all four
  # ;; 128-bit lanes of $CTR_BE for the next call
  $code .= "vshufi64x2        \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; pre-load constants
        vbroadcastf64x2    `(16 * 1)`($AES_KEYS),$AESKEY2
        vmovdqu64         @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; stitch AES rounds with GHASH

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 0 - ARK

  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,  $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 2)`($AES_KEYS),$AESKEY1\n";

  $code .= <<___;
        # ;;==================================================
        # ;; GHASH 4 blocks (15 to 12)
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH1H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH1L      # ; a0*b0
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH1M      # ; a1*b0
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH1T      # ; a0*b1
        vmovdqu64         @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 1
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 3)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (11 to 8)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
        vmovdqu64         @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 2
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 4)`($AES_KEYS),$AESKEY1\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (7 to 4)
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH3M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH3T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH3H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH3L      # ; a0*b0
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds 3
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 5)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; Gather (XOR) GHASH for 12 blocks
        vpternlogq        \$0x96,$GH3H,$GH2H,$GH1H
        vpternlogq        \$0x96,$GH3L,$GH2L,$GH1L
        vpternlogq        \$0x96,$GH3T,$GH2T,$GH1T
        vpternlogq        \$0x96,$GH3M,$GH2M,$GH1M
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds 4
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 6)`($AES_KEYS),$AESKEY1\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; load plain/cipher text
  &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds 5
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 7)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (3 to 0)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 6
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 8)`($AES_KEYS),$AESKEY1\n";

  # ;; =================================================
  # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
  # ;; - add GH2[MTLH] to GH1[MTLH]
  $code .= "vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M\n";
  if ($do_reduction != 0) {

    if ($is_start != 0) {
      $code .= "vpxorq            $GH2M,$GH1M,$GH1M\n";
    } else {
      # ;; reducing at the end of a sequence: fold in the sums carried over
      # ;; from earlier "start"/"mid" chunks as well
      $code .= <<___;
        vpternlogq        \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
        vpternlogq        \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
        vpternlogq        \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
___
    }

  } else {

    # ;; Update H/M/L hash sums if not carrying reduction
    if ($is_start != 0) {
      $code .= <<___;
        vpxorq            $GH2H,$GH1H,$TO_REDUCE_H
        vpxorq            $GH2L,$GH1L,$TO_REDUCE_L
        vpxorq            $GH2M,$GH1M,$TO_REDUCE_M
___
    } else {
      $code .= <<___;
        vpternlogq        \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
        vpternlogq        \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
        vpternlogq        \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
___
    }

  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 7
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 9)`($AES_KEYS),$AESKEY2\n";

  # ;; =================================================
  # ;; prepare mid sum for adding to high & low
  # ;; load polynomial constant for reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpsrldq           \$8,$GH1M,$GH2M
        vpslldq           \$8,$GH1M,$GH1M

        vmovdqa64         POLY2(%rip),@{[XWORD($RED_POLY)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 8
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 10)`($AES_KEYS),$AESKEY1\n";

  # ;; =================================================
  # ;; Add mid product to high and low
  if ($do_reduction != 0) {
    if ($is_start != 0) {
      $code .= <<___;
        vpternlogq        \$0x96,$GH2M,$GH2H,$GH1H      # ; TH = TH1 + TH2 + TM>>64
        vpternlogq        \$0x96,$GH1M,$GH2L,$GH1L      # ; TL = TL1 + TL2 + TM<<64
___
    } else {
      $code .= <<___;
        vpxorq            $GH2M,$GH1H,$GH1H      # ; TH = TH1 + TM>>64
        vpxorq            $GH1M,$GH1L,$GH1L      # ; TL = TL1 + TM<<64
___
    }
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 9
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);

  # ;; =================================================
  # ;; horizontal xor of low and high 4x128
  if ($do_reduction != 0) {
    &VHPXORI4x128($GH1H, $GH2H);
    &VHPXORI4x128($GH1L, $GH2L);
  }

  # ;; $NROUNDS is a file-scope value selected by the key size; >= 11 means
  # ;; AES-192 or AES-256 and thus extra round keys beyond index 10
  if (($NROUNDS >= 11)) {
    $code .= "vbroadcastf64x2    `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  }

  # ;; =================================================
  # ;; first phase of reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpclmulqdq        \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
        vpslldq           \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]}                    # ; shift-L 2 DWs
        vpxorq            @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]}      # ; first phase of the reduct
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  # ;; AES128 is done
  if (($NROUNDS >= 11)) {
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
      $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
    $code .= "vbroadcastf64x2    `(16 * 12)`($AES_KEYS),$AESKEY1\n";

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
      $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
    if (($NROUNDS == 13)) {
      $code .= "vbroadcastf64x2    `(16 * 13)`($AES_KEYS),$AESKEY2\n";

      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
        $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
      $code .= "vbroadcastf64x2    `(16 * 14)`($AES_KEYS),$AESKEY1\n";

      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
        $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
    }
  }

  # ;; =================================================
  # ;; second phase of the reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpclmulqdq        \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
        vpsrldq           \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]}      # ; shift-R 1-DW to obtain 2-DWs shift-R
        vpclmulqdq        \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
        vpslldq           \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]}      # ; shift-L 1-DW for result without shifts
        # ;; GH1H = GH1H + RED_T1 + RED_T2
        vpternlogq        \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; the last AES round
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,       $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; XOR against plain/cipher text
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07,     $B08_11,  $B12_15, $DATA1,  $DATA2,  $DATA3,  $DATA4);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  # ;; - this is needed for partial block cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; store cipher/plain text
  $code .= "mov       $CIPH_PLAIN_OUT,$IA0\n";
  &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);

  # ;; =================================================
  # ;; shuffle cipher text blocks for GHASH computation
  if ($ENC_DEC eq "ENC") {

    # ;; zero bytes outside the mask before hashing
    if ($NUM_BLOCKS <= 4) {
      $code .= "vmovdqu8           $B00_03,${B00_03}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 8) {
      $code .= "vmovdqu8          $B04_07,${B04_07}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 12) {
      $code .= "vmovdqu8          $B08_11,${B08_11}{$MASKREG}{z}\n";
    } else {
      $code .= "vmovdqu8          $B12_15,${B12_15}{$MASKREG}{z}\n";
    }

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DATA1,  $DATA2,  $DATA3,  $DATA4,  $B00_03,
      $B04_07,     $B08_11,   $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  } else {

    # ;; zero bytes outside the mask before hashing
    if ($NUM_BLOCKS <= 4) {
      $code .= "vmovdqu8          $DATA1,${DATA1}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 8) {
      $code .= "vmovdqu8          $DATA2,${DATA2}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 12) {
      $code .= "vmovdqu8          $DATA3,${DATA3}{$MASKREG}{z}\n";
    } else {
      $code .= "vmovdqu8          $DATA4,${DATA4}{$MASKREG}{z}\n";
    }

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2,  $DATA3,  $DATA4,  $DATA1,
      $DATA2,      $DATA3,    $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  }

  # ;; =================================================
  # ;; Extract the last block for partial / multi_call cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
  }

  if ($do_reduction != 0) {

    # ;; GH1H holds reduced hash value
    # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
    # ;; - register rename trick obsoletes the above move
  }

  # ;; =================================================
  # ;; GHASH last N blocks
  # ;; - current hash value in HASH_IN_OUT or
  # ;;   product parts in TO_REDUCE_H/M/L
  # ;; - DATA1-DATA4 include blocks for GHASH

  # ;; when no reduction was done here the pending TO_REDUCE_H/M/L sums are
  # ;; handed over as extra arguments so the callee can finish the reduction
  if ($do_reduction == 0) {
    &INITIAL_BLOCKS_PARTIAL_GHASH(
      $AES_KEYS,            $GCM128_CTX, $LENGTH,                  $NUM_BLOCKS,
      &XWORD($HASH_IN_OUT), $ENC_DEC,    $DATA1,                   $DATA2,
      $DATA3,               $DATA4,      &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
      $B00_03,              $B04_07,     $B08_11,                  $B12_15,
      $GHDAT1,              $GHDAT2,     $AESKEY1,                 $AESKEY2,
      $GHKEY1,              $PBLOCK_LEN, $TO_REDUCE_H,             $TO_REDUCE_M,
      $TO_REDUCE_L);
  } else {
    &INITIAL_BLOCKS_PARTIAL_GHASH(
      $AES_KEYS,            $GCM128_CTX, $LENGTH,                  $NUM_BLOCKS,
      &XWORD($HASH_IN_OUT), $ENC_DEC,    $DATA1,                   $DATA2,
      $DATA3,               $DATA4,      &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
      $B00_03,              $B04_07,     $B08_11,                  $B12_15,
      $GHDAT1,              $GHDAT2,     $AESKEY1,                 $AESKEY2,
      $GHKEY1,              $PBLOCK_LEN);
  }
}
2748
2749# ;; ===========================================================================
2750# ;; ===========================================================================
# ;; Encryption and GHASH of the last data chunk: computes the number of
# ;; remaining blocks (ceil of length/16) and dispatches to the matching
# ;; GHASH_16_ENCRYPT_N_GHASH_N variant (N = 1 to 16); with 0 blocks left,
# ;; only GHASH of the 16 stored blocks (with reduction) is done via GHASH_16.
sub GCM_ENC_DEC_LAST {

  # ;; Emits the tail handling of a GCM pass: computes the number of remaining
  # ;; blocks as ceil(LENGTH / 16) and dispatches (via a branch tree) to one of
  # ;; sixteen generated GHASH_16_ENCRYPT_N_GHASH_N variants, one per block
  # ;; count 1..16.  When 0 blocks remain, only the GHASH of the 16 stored
  # ;; blocks (with reduction) is performed through GHASH_16.
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $GCM128_CTX         = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT     = $_[2];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[3];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[4];     # [in] data offset
  my $LENGTH             = $_[5];     # [in/clobbered] data length
  my $CTR_BE             = $_[6];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[7];     # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET     = $_[8];     # [in] numerical offset for the highest hash key
                                      # (can be register or numerical offset)
  my $GHASHIN_BLK_OFFSET = $_[9];     # [in] numerical offset for GHASH blocks in
  my $SHFMSK             = $_[10];    # [in] ZMM with byte swap mask for pshufb
  my $ZT00               = $_[11];    # [clobbered] temporary ZMM
  my $ZT01               = $_[12];    # [clobbered] temporary ZMM
  my $ZT02               = $_[13];    # [clobbered] temporary ZMM
  my $ZT03               = $_[14];    # [clobbered] temporary ZMM
  my $ZT04               = $_[15];    # [clobbered] temporary ZMM
  my $ZT05               = $_[16];    # [clobbered] temporary ZMM
  my $ZT06               = $_[17];    # [clobbered] temporary ZMM
  my $ZT07               = $_[18];    # [clobbered] temporary ZMM
  my $ZT08               = $_[19];    # [clobbered] temporary ZMM
  my $ZT09               = $_[20];    # [clobbered] temporary ZMM
  my $ZT10               = $_[21];    # [clobbered] temporary ZMM
  my $ZT11               = $_[22];    # [clobbered] temporary ZMM
  my $ZT12               = $_[23];    # [clobbered] temporary ZMM
  my $ZT13               = $_[24];    # [clobbered] temporary ZMM
  my $ZT14               = $_[25];    # [clobbered] temporary ZMM
  my $ZT15               = $_[26];    # [clobbered] temporary ZMM
  my $ZT16               = $_[27];    # [clobbered] temporary ZMM
  my $ZT17               = $_[28];    # [clobbered] temporary ZMM
  my $ZT18               = $_[29];    # [clobbered] temporary ZMM
  my $ZT19               = $_[30];    # [clobbered] temporary ZMM
  my $ZT20               = $_[31];    # [clobbered] temporary ZMM
  my $ZT21               = $_[32];    # [clobbered] temporary ZMM
  my $ZT22               = $_[33];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[34];    # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234         = $_[35];    # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $GHASH_TYPE         = $_[36];    # [in] "start", "start_reduce", "mid", "end_reduce"
  my $TO_REDUCE_L        = $_[37];    # [in] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[38];    # [in] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[39];    # [in] ZMM for medium 4x128-bit GHASH sum
  my $ENC_DEC            = $_[40];    # [in] cipher direction
  my $HASH_IN_OUT        = $_[41];    # [in/out] XMM ghash in/out value
  my $IA0                = $_[42];    # [clobbered] GP temporary
  my $IA1                = $_[43];    # [clobbered] GP temporary
  my $MASKREG            = $_[44];    # [clobbered] mask register
  my $PBLOCK_LEN         = $_[45];    # [in] partial block length

  my $label_suffix = $label_count++;

  # ;; IA0 = ceil(LENGTH / 16) = number of remaining blocks (0 to 16);
  # ;; the branch tree below is a 3-level binary dispatch on that count
  $code .= <<___;
        mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
        add               \$15,@{[DWORD($IA0)]}
        shr               \$4,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_0_${label_suffix}

        cmp               \$8,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_8_${label_suffix}
        jb                .L_last_num_blocks_is_7_1_${label_suffix}


        cmp               \$12,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_12_${label_suffix}
        jb                .L_last_num_blocks_is_11_9_${label_suffix}

        # ;; 16, 15, 14 or 13
        cmp               \$15,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_15_${label_suffix}
        ja                .L_last_num_blocks_is_16_${label_suffix}
        cmp               \$14,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_14_${label_suffix}
        jmp               .L_last_num_blocks_is_13_${label_suffix}

.L_last_num_blocks_is_11_9_${label_suffix}:
        # ;; 11, 10 or 9
        cmp               \$10,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_10_${label_suffix}
        ja                .L_last_num_blocks_is_11_${label_suffix}
        jmp               .L_last_num_blocks_is_9_${label_suffix}

.L_last_num_blocks_is_7_1_${label_suffix}:
        cmp               \$4,@{[DWORD($IA0)]}
        je                .L_last_num_blocks_is_4_${label_suffix}
        jb                .L_last_num_blocks_is_3_1_${label_suffix}
        # ;; 7, 6 or 5
        cmp               \$6,@{[DWORD($IA0)]}
        ja                .L_last_num_blocks_is_7_${label_suffix}
        je                .L_last_num_blocks_is_6_${label_suffix}
        jmp               .L_last_num_blocks_is_5_${label_suffix}

.L_last_num_blocks_is_3_1_${label_suffix}:
        # ;; 3, 2 or 1
        cmp               \$2,@{[DWORD($IA0)]}
        ja                .L_last_num_blocks_is_3_${label_suffix}
        je                .L_last_num_blocks_is_2_${label_suffix}
___

  # ;; fall through for `jmp .L_last_num_blocks_is_1`

  # ;; Generate the 16 block-count variants with a Perl-level loop
  # ;; ("%rep" in the original NASM source).
  # ;; - the 1-block variant must be generated first, as it is reached by
  # ;;   fall-through from the dispatch tree above
  for my $num_blocks (1 .. 16) {
    $code .= ".L_last_num_blocks_is_${num_blocks}_${label_suffix}:\n";
    &GHASH_16_ENCRYPT_N_GHASH_N(
      $AES_KEYS,   $GCM128_CTX,  $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET,
      $LENGTH,     $CTR_BE,      $CTR_CHECK,      $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET,
      $SHFMSK,     $ZT00,        $ZT01,           $ZT02,           $ZT03,
      $ZT04,       $ZT05,        $ZT06,           $ZT07,           $ZT08,
      $ZT09,       $ZT10,        $ZT11,           $ZT12,           $ZT13,
      $ZT14,       $ZT15,        $ZT16,           $ZT17,           $ZT18,
      $ZT19,       $ZT20,        $ZT21,           $ZT22,           $ADDBE_4x4,
      $ADDBE_1234, $GHASH_TYPE,  $TO_REDUCE_L,    $TO_REDUCE_H,    $TO_REDUCE_M,
      $ENC_DEC,    $HASH_IN_OUT, $IA0,            $IA1,            $MASKREG,
      $num_blocks, $PBLOCK_LEN);

    $code .= "jmp           .L_last_blocks_done_${label_suffix}\n";
  }

  $code .= ".L_last_num_blocks_is_0_${label_suffix}:\n";

  # ;; if there is 0 blocks to cipher then there are only 16 blocks for ghash and reduction
  # ;; - convert mid into end_reduce
  # ;; - convert start into start_reduce
  # ;; ($GHASH_TYPE is a local copy of $_[36], so this does not affect callers)
  if ($GHASH_TYPE eq "mid") {
    $GHASH_TYPE = "end_reduce";
  }
  if ($GHASH_TYPE eq "start") {
    $GHASH_TYPE = "start_reduce";
  }

  &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp",
    $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01,
    $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09);

  $code .= ".L_last_blocks_done_${label_suffix}:\n";
}
2890
2891# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2892# ;; Main GCM macro stitching cipher with GHASH
2893# ;; - operates on single stream
2894# ;; - encrypts 16 blocks at a time
2895# ;; - ghash the 16 previously encrypted ciphertext blocks
2896# ;; - no partial block or multi_call handling here
sub GHASH_16_ENCRYPT_16_PARALLEL {
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $CIPH_PLAIN_OUT     = $_[1];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[2];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[3];     # [in] data offset
  my $CTR_BE             = $_[4];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[5];     # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET     = $_[6];     # [in] numerical offset for the highest hash key (hash key index value)
  my $AESOUT_BLK_OFFSET  = $_[7];     # [in] numerical offset for AES-CTR out
  my $GHASHIN_BLK_OFFSET = $_[8];     # [in] numerical offset for GHASH blocks in
  my $SHFMSK             = $_[9];     # [in] ZMM with byte swap mask for pshufb
  my $ZT1                = $_[10];    # [clobbered] temporary ZMM (cipher)
  my $ZT2                = $_[11];    # [clobbered] temporary ZMM (cipher)
  my $ZT3                = $_[12];    # [clobbered] temporary ZMM (cipher)
  my $ZT4                = $_[13];    # [clobbered] temporary ZMM (cipher)
  my $ZT5                = $_[14];    # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
  my $ZT6                = $_[15];    # [clobbered] temporary ZMM (cipher)
  my $ZT7                = $_[16];    # [clobbered] temporary ZMM (cipher)
  my $ZT8                = $_[17];    # [clobbered] temporary ZMM (cipher)
  my $ZT9                = $_[18];    # [clobbered] temporary ZMM (cipher)
  my $ZT10               = $_[19];    # [clobbered] temporary ZMM (ghash)
  my $ZT11               = $_[20];    # [clobbered] temporary ZMM (ghash)
  my $ZT12               = $_[21];    # [clobbered] temporary ZMM (ghash)
  my $ZT13               = $_[22];    # [clobbered] temporary ZMM (ghash)
  my $ZT14               = $_[23];    # [clobbered] temporary ZMM (ghash)
  my $ZT15               = $_[24];    # [clobbered] temporary ZMM (ghash)
  my $ZT16               = $_[25];    # [clobbered] temporary ZMM (ghash)
  my $ZT17               = $_[26];    # [clobbered] temporary ZMM (ghash)
  my $ZT18               = $_[27];    # [clobbered] temporary ZMM (ghash)
  my $ZT19               = $_[28];    # [clobbered] temporary ZMM
  my $ZT20               = $_[29];    # [clobbered] temporary ZMM
  my $ZT21               = $_[30];    # [clobbered] temporary ZMM
  my $ZT22               = $_[31];    # [clobbered] temporary ZMM
  my $ZT23               = $_[32];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[33];    # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234         = $_[34];    # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $TO_REDUCE_L        = $_[35];    # [in/out] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[36];    # [in/out] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[37];    # [in/out] ZMM for medium 4x128-bit GHASH sum
  my $DO_REDUCTION       = $_[38];    # [in] "no_reduction", "final_reduction", "first_time"
  my $ENC_DEC            = $_[39];    # [in] cipher direction
  my $DATA_DISPL         = $_[40];    # [in] fixed numerical data displacement/offset
  my $GHASH_IN           = $_[41];    # [in] current GHASH value or "no_ghash_in"
  my $IA0                = $_[42];    # [clobbered] temporary GPR

  # ;; B00_03..B12_15 hold the 16 counter (later cipher-text) blocks, 4 per ZMM
  my $B00_03 = $ZT1;
  my $B04_07 = $ZT2;
  my $B08_11 = $ZT3;
  my $B12_15 = $ZT4;

  my $GH1H = $ZT5;

  # ; @note: do not change this mapping
  my $GH1L = $ZT6;
  my $GH1M = $ZT7;
  my $GH1T = $ZT8;

  my $GH2H = $ZT9;
  my $GH2L = $ZT10;
  my $GH2M = $ZT11;
  my $GH2T = $ZT12;

  # ;; GH2x are free after being folded into GH1x, so the reduction
  # ;; temporaries reuse them
  my $RED_POLY = $GH2T;
  my $RED_P1   = $GH2L;
  my $RED_T1   = $GH2H;
  my $RED_T2   = $GH2M;

  my $GH3H = $ZT13;
  my $GH3L = $ZT14;
  my $GH3M = $ZT15;
  my $GH3T = $ZT16;

  # ;; DATA1..DATA4 deliberately alias GH3H..GH3T: the GH3x partial products
  # ;; are folded into GH1x (vpternlogq below) before the plain/cipher text
  # ;; loads recycle these registers
  my $DATA1 = $ZT13;
  my $DATA2 = $ZT14;
  my $DATA3 = $ZT15;
  my $DATA4 = $ZT16;

  # ;; AESKEY1/AESKEY2 are used in ping-pong fashion: while one feeds the
  # ;; current vaesenc round, the other is refilled with the next broadcast
  # ;; round key
  my $AESKEY1 = $ZT17;
  my $AESKEY2 = $ZT18;

  my $GHKEY1 = $ZT19;
  my $GHKEY2 = $ZT20;
  my $GHDAT1 = $ZT21;
  my $GHDAT2 = $ZT22;

  my $label_suffix = $label_count++;

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; prepare counter blocks
  # ;; If the low byte of the counter can be incremented 16 times without
  # ;; wrapping, add the big-endian increments directly; otherwise go through
  # ;; little-endian form (vpshufb) so carries propagate correctly.

  $code .= <<___;
        cmpb              \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
        jae               .L_16_blocks_overflow_${label_suffix}
        vpaddd            $ADDBE_1234,$CTR_BE,$B00_03
        vpaddd            $ADDBE_4x4,$B00_03,$B04_07
        vpaddd            $ADDBE_4x4,$B04_07,$B08_11
        vpaddd            $ADDBE_4x4,$B08_11,$B12_15
        jmp               .L_16_blocks_ok_${label_suffix}
.L_16_blocks_overflow_${label_suffix}:
        vpshufb           $SHFMSK,$CTR_BE,$CTR_BE
        vmovdqa64         ddq_add_4444(%rip),$B12_15
        vpaddd            ddq_add_1234(%rip),$CTR_BE,$B00_03
        vpaddd            $B12_15,$B00_03,$B04_07
        vpaddd            $B12_15,$B04_07,$B08_11
        vpaddd            $B12_15,$B08_11,$B12_15
        vpshufb           $SHFMSK,$B00_03,$B00_03
        vpshufb           $SHFMSK,$B04_07,$B04_07
        vpshufb           $SHFMSK,$B08_11,$B08_11
        vpshufb           $SHFMSK,$B12_15,$B12_15
.L_16_blocks_ok_${label_suffix}:
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; pre-load constants
  # ;; First GHASH input block is XOR-ed with the running hash (if any)
  $code .= "vbroadcastf64x2    `(16 * 0)`($AES_KEYS),$AESKEY1\n";
  if ($GHASH_IN ne "no_ghash_in") {
    $code .= "vpxorq            `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
  } else {
    $code .= "vmovdqa64         `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  }

  $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; save counter for the next round
        # ;; increment counter overflow check register
        vshufi64x2        \$0b11111111,$B12_15,$B12_15,$CTR_BE
        addb              \$16,@{[BYTE($CTR_CHECK)]}
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; pre-load constants
        vbroadcastf64x2    `(16 * 1)`($AES_KEYS),$AESKEY2
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; stitch AES rounds with GHASH

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 0 - ARK

        vpxorq            $AESKEY1,$B00_03,$B00_03
        vpxorq            $AESKEY1,$B04_07,$B04_07
        vpxorq            $AESKEY1,$B08_11,$B08_11
        vpxorq            $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 2)`($AES_KEYS),$AESKEY1

        # ;;==================================================
        # ;; GHASH 4 blocks (15 to 12)
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH1H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH1L      # ; a0*b0
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH1M      # ; a1*b0
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH1T      # ; a0*b1
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 1
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 3)`($AES_KEYS),$AESKEY2

        # ;; =================================================
        # ;; GHASH 4 blocks (11 to 8)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 2
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 4)`($AES_KEYS),$AESKEY1

        # ;; =================================================
        # ;; GHASH 4 blocks (7 to 4)
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH3M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH3T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH3H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH3L      # ; a0*b0
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES rounds 3
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 5)`($AES_KEYS),$AESKEY2

        # ;; =================================================
        # ;; Gather (XOR) GHASH for 12 blocks
        # ;; (frees GH2x/GH3x; DATA1..4 may now reuse the GH3x registers)
        vpternlogq        \$0x96,$GH3H,$GH2H,$GH1H
        vpternlogq        \$0x96,$GH3L,$GH2L,$GH1L
        vpternlogq        \$0x96,$GH3T,$GH2T,$GH1T
        vpternlogq        \$0x96,$GH3M,$GH2M,$GH1M

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES rounds 4
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 6)`($AES_KEYS),$AESKEY1

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; load plain/cipher text (recycle GH3xx registers)
        vmovdqu8          `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
        vmovdqu8          `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
        vmovdqu8          `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
        vmovdqu8          `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES rounds 5
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 7)`($AES_KEYS),$AESKEY2

        # ;; =================================================
        # ;; GHASH 4 blocks (3 to 0)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 6
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 8)`($AES_KEYS),$AESKEY1
___

  # ;; =================================================
  # ;; gather GHASH in GH1L (low) and GH1H (high)
  # ;; Three accumulation modes:
  # ;;  - "first_time":  initialize TO_REDUCE_{L,M,H} from this iteration
  # ;;  - "no_reduction": fold this iteration into TO_REDUCE_{L,M,H}
  # ;;  - "final_reduction": combine everything and reduce modulo the field
  # ;;    polynomial (result left in GH1H / $ZT5)
  if ($DO_REDUCTION eq "first_time") {
    $code .= <<___;
        vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M      # ; TM
        vpxorq            $GH2M,$GH1M,$TO_REDUCE_M      # ; TM
        vpxorq            $GH2H,$GH1H,$TO_REDUCE_H      # ; TH
        vpxorq            $GH2L,$GH1L,$TO_REDUCE_L      # ; TL
___
  }
  if ($DO_REDUCTION eq "no_reduction") {
    $code .= <<___;
        vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M             # ; TM
        vpternlogq        \$0x96,$GH2M,$GH1M,$TO_REDUCE_M      # ; TM
        vpternlogq        \$0x96,$GH2H,$GH1H,$TO_REDUCE_H      # ; TH
        vpternlogq        \$0x96,$GH2L,$GH1L,$TO_REDUCE_L      # ; TL
___
  }
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        # ;; phase 1: add mid products together
        # ;; also load polynomial constant for reduction
        vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M      # ; TM
        vpternlogq        \$0x96,$GH2M,$TO_REDUCE_M,$GH1M

        vpsrldq           \$8,$GH1M,$GH2M
        vpslldq           \$8,$GH1M,$GH1M

        vmovdqa64         POLY2(%rip),@{[XWORD($RED_POLY)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 7
  $code .= <<___;
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 9)`($AES_KEYS),$AESKEY2
___

  # ;; =================================================
  # ;; Add mid product to high and low
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        vpternlogq        \$0x96,$GH2M,$GH2H,$GH1H      # ; TH = TH1 + TH2 + TM>>64
        vpxorq            $TO_REDUCE_H,$GH1H,$GH1H
        vpternlogq        \$0x96,$GH1M,$GH2L,$GH1L      # ; TL = TL1 + TL2 + TM<<64
        vpxorq            $TO_REDUCE_L,$GH1L,$GH1L
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 8
  $code .= <<___;
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 10)`($AES_KEYS),$AESKEY1
___

  # ;; =================================================
  # ;; horizontal xor of low and high 4x128
  # ;; (folds the 4 lanes of each ZMM into a single 128-bit value)
  if ($DO_REDUCTION eq "final_reduction") {
    &VHPXORI4x128($GH1H, $GH2H);
    &VHPXORI4x128($GH1L, $GH2L);
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 9
  $code .= <<___;
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
___

  # ;; $NROUNDS is a file-scope variable (9/11/13 for AES-128/192/256;
  # ;; set elsewhere in this file)
  if (($NROUNDS >= 11)) {
    $code .= "vbroadcastf64x2    `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  }

  # ;; =================================================
  # ;; first phase of reduction
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        vpclmulqdq        \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
        vpslldq           \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]}                    # ; shift-L 2 DWs
        vpxorq            @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]}      # ; first phase of the reduct
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  # ;; AES128 is done
  if (($NROUNDS >= 11)) {
    $code .= <<___;
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 12)`($AES_KEYS),$AESKEY1

        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
___
    if (($NROUNDS == 13)) {
      $code .= <<___;
        vbroadcastf64x2    `(16 * 13)`($AES_KEYS),$AESKEY2

        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 14)`($AES_KEYS),$AESKEY1

        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
___
    }
  }

  # ;; =================================================
  # ;; second phase of the reduction
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        vpclmulqdq        \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
        vpsrldq           \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]}      # ; shift-R 1-DW to obtain 2-DWs shift-R
        vpclmulqdq        \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
        vpslldq           \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]}      # ; shift-L 1-DW for result without shifts
        # ;; GH1H = GH1H x RED_T1 x RED_T2
        vpternlogq        \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; the last AES round
  $code .= <<___;
        vaesenclast       $AESKEY1,$B00_03,$B00_03
        vaesenclast       $AESKEY1,$B04_07,$B04_07
        vaesenclast       $AESKEY1,$B08_11,$B08_11
        vaesenclast       $AESKEY1,$B12_15,$B12_15

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; XOR against plain/cipher text
        vpxorq            $DATA1,$B00_03,$B00_03
        vpxorq            $DATA2,$B04_07,$B04_07
        vpxorq            $DATA3,$B08_11,$B08_11
        vpxorq            $DATA4,$B12_15,$B12_15

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; store cipher/plain text
        mov               $CIPH_PLAIN_OUT,$IA0
        vmovdqu8          $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
        vmovdqu8          $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
        vmovdqu8          $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
        vmovdqu8          $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
___

  # ;; =================================================
  # ;; shuffle cipher text blocks for GHASH computation
  # ;; For ENC the ciphertext is the AES output (B00_03..B12_15); for DEC the
  # ;; ciphertext is the input data (DATA1..DATA4).
  if ($ENC_DEC eq "ENC") {
    $code .= <<___;
        vpshufb           $SHFMSK,$B00_03,$B00_03
        vpshufb           $SHFMSK,$B04_07,$B04_07
        vpshufb           $SHFMSK,$B08_11,$B08_11
        vpshufb           $SHFMSK,$B12_15,$B12_15
___
  } else {
    $code .= <<___;
        vpshufb           $SHFMSK,$DATA1,$B00_03
        vpshufb           $SHFMSK,$DATA2,$B04_07
        vpshufb           $SHFMSK,$DATA3,$B08_11
        vpshufb           $SHFMSK,$DATA4,$B12_15
___
  }

  # ;; =================================================
  # ;; store shuffled cipher text for ghashing
  # ;; (consumed as GHASH input by the next 16-block iteration)
  $code .= <<___;
        vmovdqa64         $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
        vmovdqa64         $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
        vmovdqa64         $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
        vmovdqa64         $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
___
}
3327
3328# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3329# ;;; Encryption of a single block
sub ENCRYPT_SINGLE_BLOCK {
  my ($AES_KEY, $XMM0, $GPR1) = @_;    # ; [in] key schedule ptr, [in/out] data block, [clobbered] scratch GP

  my $label_suffix = $label_count++;

  # ; Dispatch on the round count held in the AES_KEY structure and jump to
  # ; the matching fully-unrolled variant below.
  $code .= <<___;
        # ; load number of rounds from AES_KEY structure (offset in bytes is
        # ; size of the |rd_key| buffer)
        mov             `4*15*4`($AES_KEY),@{[DWORD($GPR1)]}
        cmp             \$9,@{[DWORD($GPR1)]}
        je              .Laes_128_${label_suffix}
        cmp             \$11,@{[DWORD($GPR1)]}
        je              .Laes_192_${label_suffix}
        cmp             \$13,@{[DWORD($GPR1)]}
        je              .Laes_256_${label_suffix}
        jmp             .Lexit_aes_${label_suffix}
___

  # ; Emit one unrolled encryption sequence per key size (128, 192, 256).
  for my $keylen (sort keys %aes_rounds) {
    my $nr = $aes_rounds{$keylen};

    $code .= ".align 32\n";
    $code .= ".Laes_${keylen}_${label_suffix}:\n";

    # ; round 0 (ARK), then $nr middle rounds, then the final round
    $code .= "vpxorq          `16*0`($AES_KEY),$XMM0, $XMM0\n\n";
    for my $round (1 .. $nr) {
      $code .= "vaesenc         `16*$round`($AES_KEY),$XMM0,$XMM0\n\n";
    }
    $code .= "        vaesenclast     `16*($nr+1)`($AES_KEY),$XMM0,$XMM0\n";
    $code .= "        jmp .Lexit_aes_${label_suffix}\n";
  }

  $code .= ".Lexit_aes_${label_suffix}:\n\n";
}
3366
sub CALC_J0 {
  my $GCM128_CTX = $_[0];     #; [in] Pointer to GCM context
  my $IV         = $_[1];     #; [in] Pointer to IV
  my $IV_LEN     = $_[2];     #; [in] IV length
  my $J0         = $_[3];     #; [out] XMM reg to contain J0
  my $ZT0        = $_[4];     #; [clobbered] ZMM register
  my $ZT1        = $_[5];     #; [clobbered] ZMM register
  my $ZT2        = $_[6];     #; [clobbered] ZMM register
  my $ZT3        = $_[7];     #; [clobbered] ZMM register
  my $ZT4        = $_[8];     #; [clobbered] ZMM register
  my $ZT5        = $_[9];     #; [clobbered] ZMM register
  my $ZT6        = $_[10];    #; [clobbered] ZMM register
  my $ZT7        = $_[11];    #; [clobbered] ZMM register
  my $ZT8        = $_[12];    #; [clobbered] ZMM register
  my $ZT9        = $_[13];    #; [clobbered] ZMM register
  my $ZT10       = $_[14];    #; [clobbered] ZMM register
  my $ZT11       = $_[15];    #; [clobbered] ZMM register
  my $ZT12       = $_[16];    #; [clobbered] ZMM register
  my $ZT13       = $_[17];    #; [clobbered] ZMM register
  my $ZT14       = $_[18];    #; [clobbered] ZMM register
  my $ZT15       = $_[19];    #; [clobbered] ZMM register
  my $ZT16       = $_[20];    #; [clobbered] ZMM register
  my $T1         = $_[21];    #; [clobbered] GP register
  my $T2         = $_[22];    #; [clobbered] GP register
  my $T3         = $_[23];    #; [clobbered] GP register
  my $MASKREG    = $_[24];    #; [clobbered] mask register

  # ;; Computes the pre-counter block J0 for an IV that is not 96 bits long:
  # ;; J0 = GHASH(IV || 0^s+64 || len(IV)_64)
  # ;; where s = 16 * RoundUp(len(IV)/16) - len(IV)

  # ;; Calculate GHASH of (IV || 0s)
  $code .= "vpxor             $J0,$J0,$J0\n";
  &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
    $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG);

  # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
  # ;; vmovq places the 64-bit length in the low quadword and zeroes the rest
  $code .= <<___;
        mov               $IV_LEN,$T1
        shl               \$3,$T1      # ; IV length in bits
        vmovq             $T1,@{[XWORD($ZT2)]}

        # ;; Might need shuffle of ZT2 (byte order of the length block -
        # ;; pre-existing note; verify against the GHASH byte ordering)
        vpxorq            $J0,@{[XWORD($ZT2)]},$J0

        vmovdqu64         @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]}
___
  # ;; Final GHASH multiply by H^1 folds the length block into J0
  &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]});

  $code .= "vpshufb           SHUF_MASK(%rip),$J0,$J0      # ; perform a 16Byte swap\n";
}
3417
3418# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3419# ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for
3420# ;;; encoding/decoding.
3421# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GCM_INIT_IV {
  my $AES_KEYS   = $_[0];     # [in] AES key schedule
  my $GCM128_CTX = $_[1];     # [in/out] GCM context
  my $IV         = $_[2];     # [in] IV pointer
  my $IV_LEN     = $_[3];     # [in] IV length
  my $GPR1       = $_[4];     # [clobbered] GP register
  my $GPR2       = $_[5];     # [clobbered] GP register
  my $GPR3       = $_[6];     # [clobbered] GP register
  my $MASKREG    = $_[7];     # [clobbered] mask register
  my $CUR_COUNT  = $_[8];     # [out] XMM with current counter
  my $ZT0        = $_[9];     # [clobbered] ZMM register
  my $ZT1        = $_[10];    # [clobbered] ZMM register
  my $ZT2        = $_[11];    # [clobbered] ZMM register
  my $ZT3        = $_[12];    # [clobbered] ZMM register
  my $ZT4        = $_[13];    # [clobbered] ZMM register
  my $ZT5        = $_[14];    # [clobbered] ZMM register
  my $ZT6        = $_[15];    # [clobbered] ZMM register
  my $ZT7        = $_[16];    # [clobbered] ZMM register
  my $ZT8        = $_[17];    # [clobbered] ZMM register
  my $ZT9        = $_[18];    # [clobbered] ZMM register
  my $ZT10       = $_[19];    # [clobbered] ZMM register
  my $ZT11       = $_[20];    # [clobbered] ZMM register
  my $ZT12       = $_[21];    # [clobbered] ZMM register
  my $ZT13       = $_[22];    # [clobbered] ZMM register
  my $ZT14       = $_[23];    # [clobbered] ZMM register
  my $ZT15       = $_[24];    # [clobbered] ZMM register
  my $ZT16       = $_[25];    # [clobbered] ZMM register

  # ;; xmm alias of $ZT0, used for the 128-bit EK0 / counter handling
  my $ZT0x = $ZT0;
  $ZT0x =~ s/zmm/xmm/;

  # ;; 96-bit IV takes the fast path (J0 = IV || 0^31 || 1);
  # ;; any other length requires the full GHASH-based J0 computation.
  # ;; NOTE(review): the labels below are emitted without a per-invocation
  # ;; ${label_suffix}; this is only safe if GCM_INIT_IV is expanded once per
  # ;; generated module - confirm at the call sites.
  $code .= <<___;
        cmp     \$12,$IV_LEN
        je      iv_len_12_init_IV
___

  # ;; IV is different than 12 bytes
  &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7,
    $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
  $code .= <<___;
       jmp      skip_iv_len_12_init_IV
iv_len_12_init_IV:   # ;; IV is 12 bytes
        # ;; read 12 IV bytes and pad with 0x00000001
        # ;; (mask 0xfff selects the low 12 byte lanes for the masked load;
        # ;; ONEf pre-fills the last 4 bytes with the big-endian value 1)
        vmovdqu8          ONEf(%rip),$CUR_COUNT
        mov               $IV,$GPR2
        mov               \$0x0000000000000fff,@{[DWORD($GPR1)]}
        kmovq             $GPR1,$MASKREG
        vmovdqu8          ($GPR2),${CUR_COUNT}{$MASKREG}         # ; ctr = IV | 0x1
skip_iv_len_12_init_IV:
        vmovdqu           $CUR_COUNT,$ZT0x
___
  &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1");    # ; E(K, Y0)
  $code .= <<___;
        vmovdqu           $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX)   # ; save EK0 for finalization stage

        # ;; store IV as counter in LE format
        vpshufb           SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT
        vmovdqu           $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX)   # ; save current counter Yi
___
}
3482
sub GCM_UPDATE_AAD {
  my $GCM128_CTX = $_[0];  # [in] GCM context pointer
  my $A_IN       = $_[1];  # [in] AAD pointer
  my $A_LEN      = $_[2];  # [in] AAD length in bytes
  my $GPR1       = $_[3];  # [clobbered] GP register
  my $GPR2       = $_[4];  # [clobbered] GP register
  my $GPR3       = $_[5];  # [clobbered] GP register
  my $MASKREG    = $_[6];  # [clobbered] mask register
  my $AAD_HASH   = $_[7];  # [out] XMM for AAD_HASH value
  my $ZT0        = $_[8];  # [clobbered] ZMM register
  my $ZT1        = $_[9];  # [clobbered] ZMM register
  my $ZT2        = $_[10]; # [clobbered] ZMM register
  my $ZT3        = $_[11]; # [clobbered] ZMM register
  my $ZT4        = $_[12]; # [clobbered] ZMM register
  my $ZT5        = $_[13]; # [clobbered] ZMM register
  my $ZT6        = $_[14]; # [clobbered] ZMM register
  my $ZT7        = $_[15]; # [clobbered] ZMM register
  my $ZT8        = $_[16]; # [clobbered] ZMM register
  my $ZT9        = $_[17]; # [clobbered] ZMM register
  my $ZT10       = $_[18]; # [clobbered] ZMM register
  my $ZT11       = $_[19]; # [clobbered] ZMM register
  my $ZT12       = $_[20]; # [clobbered] ZMM register
  my $ZT13       = $_[21]; # [clobbered] ZMM register
  my $ZT14       = $_[22]; # [clobbered] ZMM register
  my $ZT15       = $_[23]; # [clobbered] ZMM register
  my $ZT16       = $_[24]; # [clobbered] ZMM register

  # ;; Folds $A_LEN bytes of AAD into the running hash kept in the context.

  # ; load current hash
  $code .= "vmovdqu64         $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n";

  &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2,
    $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13,
    $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);

  # ; store updated hash back into the context
  $code .= "vmovdqu64         $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n";
}
3520
3521# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3522# ;;; Cipher and ghash of payloads shorter than 256 bytes
3523# ;;; - number of blocks in the message comes as argument
3524# ;;; - depending on the number of blocks an optimized variant of
3525# ;;;   INITIAL_BLOCKS_PARTIAL is invoked
sub GCM_ENC_DEC_SMALL {
  my $AES_KEYS       = $_[0];     # [in] key pointer
  my $GCM128_CTX     = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT = $_[2];     # [in] output buffer
  my $PLAIN_CIPH_IN  = $_[3];     # [in] input buffer
  my $PLAIN_CIPH_LEN = $_[4];     # [in] buffer length
  my $ENC_DEC        = $_[5];     # [in] cipher direction
  my $DATA_OFFSET    = $_[6];     # [in] data offset
  my $LENGTH         = $_[7];     # [in] data length
  my $NUM_BLOCKS     = $_[8];     # [in] number of blocks to process 1 to 16
  my $CTR            = $_[9];     # [in/out] XMM counter block
  my $HASH_IN_OUT    = $_[10];    # [in/out] XMM GHASH value
  my $ZTMP0          = $_[11];    # [clobbered] ZMM register
  my $ZTMP1          = $_[12];    # [clobbered] ZMM register
  my $ZTMP2          = $_[13];    # [clobbered] ZMM register
  my $ZTMP3          = $_[14];    # [clobbered] ZMM register
  my $ZTMP4          = $_[15];    # [clobbered] ZMM register
  my $ZTMP5          = $_[16];    # [clobbered] ZMM register
  my $ZTMP6          = $_[17];    # [clobbered] ZMM register
  my $ZTMP7          = $_[18];    # [clobbered] ZMM register
  my $ZTMP8          = $_[19];    # [clobbered] ZMM register
  my $ZTMP9          = $_[20];    # [clobbered] ZMM register
  my $ZTMP10         = $_[21];    # [clobbered] ZMM register
  my $ZTMP11         = $_[22];    # [clobbered] ZMM register
  my $ZTMP12         = $_[23];    # [clobbered] ZMM register
  my $ZTMP13         = $_[24];    # [clobbered] ZMM register
  my $ZTMP14         = $_[25];    # [clobbered] ZMM register
  my $IA0            = $_[26];    # [clobbered] GP register
  my $IA1            = $_[27];    # [clobbered] GP register
  my $MASKREG        = $_[28];    # [clobbered] mask register
  my $SHUFMASK       = $_[29];    # [in] ZMM with BE/LE shuffle mask
  my $PBLOCK_LEN     = $_[30];    # [in] partial block length

  my $label_suffix = $label_count++;

  # ;; Runtime dispatch on $NUM_BLOCKS (1..16): a small decision tree that
  # ;; tests 8 first, then 12/4, then exact values, so each path executes
  # ;; at most a few compares before reaching its specialized variant below.
  $code .= <<___;
        cmp               \$8,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_8_${label_suffix}
        jl                .L_small_initial_num_blocks_is_7_1_${label_suffix}


        cmp               \$12,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_12_${label_suffix}
        jl                .L_small_initial_num_blocks_is_11_9_${label_suffix}

        # ;; 16, 15, 14 or 13
        cmp               \$16,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_16_${label_suffix}
        cmp               \$15,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_15_${label_suffix}
        cmp               \$14,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_14_${label_suffix}
        jmp               .L_small_initial_num_blocks_is_13_${label_suffix}

.L_small_initial_num_blocks_is_11_9_${label_suffix}:
        # ;; 11, 10 or 9
        cmp               \$11,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_11_${label_suffix}
        cmp               \$10,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_10_${label_suffix}
        jmp               .L_small_initial_num_blocks_is_9_${label_suffix}

.L_small_initial_num_blocks_is_7_1_${label_suffix}:
        cmp               \$4,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_4_${label_suffix}
        jl                .L_small_initial_num_blocks_is_3_1_${label_suffix}
        # ;; 7, 6 or 5
        cmp               \$7,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_7_${label_suffix}
        cmp               \$6,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_6_${label_suffix}
        jmp               .L_small_initial_num_blocks_is_5_${label_suffix}

.L_small_initial_num_blocks_is_3_1_${label_suffix}:
        # ;; 3, 2 or 1
        cmp               \$3,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_3_${label_suffix}
        cmp               \$2,$NUM_BLOCKS
        je                .L_small_initial_num_blocks_is_2_${label_suffix}

        # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed

        # ;; Generation of different block size variants
        # ;; - one block size has to be the first one
___

  # ;; Emit the 16 specialized INITIAL_BLOCKS_PARTIAL variants; every variant
  # ;; except the last (16 blocks, which falls through) jumps to the common
  # ;; exit label below.
  for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) {
    $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${label_suffix}:\n";
    &INITIAL_BLOCKS_PARTIAL(
      $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH,   $DATA_OFFSET,
      $num_blocks, $CTR,        $HASH_IN_OUT,    $ENC_DEC,       $ZTMP0,    $ZTMP1,
      $ZTMP2,      $ZTMP3,      $ZTMP4,          $ZTMP5,         $ZTMP6,    $ZTMP7,
      $ZTMP8,      $ZTMP9,      $ZTMP10,         $ZTMP11,        $ZTMP12,   $ZTMP13,
      $ZTMP14,     $IA0,        $IA1,            $MASKREG,       $SHUFMASK, $PBLOCK_LEN);

    if ($num_blocks != 16) {
      $code .= "jmp           .L_small_initial_blocks_encrypted_${label_suffix}\n";
    }
  }

  $code .= ".L_small_initial_blocks_encrypted_${label_suffix}:\n";
}
3628
3629# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3630# ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context
3631# ; struct has been initialized by GCM_INIT_IV
3632# ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
3633# ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
3634# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
sub GCM_ENC_DEC {
  my $AES_KEYS       = $_[0];    # [in] AES Key schedule
  my $GCM128_CTX     = $_[1];    # [in] context pointer
  my $PBLOCK_LEN     = $_[2];    # [in] length of partial block at the moment of previous update
  my $PLAIN_CIPH_IN  = $_[3];    # [in] input buffer pointer
  my $PLAIN_CIPH_LEN = $_[4];    # [in] buffer length
  my $CIPH_PLAIN_OUT = $_[5];    # [in] output buffer pointer
  my $ENC_DEC        = $_[6];    # [in] cipher direction

  my $IA0 = "%r10";
  my $IA1 = "%r12";
  my $IA2 = "%r13";
  my $IA3 = "%r15";
  my $IA4 = "%r11";
  my $IA5 = "%rax";
  my $IA6 = "%rbx";
  my $IA7 = "%r14";

  # ;; on win64 the remaining-length counter lives in a scratch GPR copy
  # ;; ($IA2, filled in below); on Linux the incoming length register is
  # ;; decremented directly
  my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;

  my $CTR_CHECK   = $IA3;
  my $DATA_OFFSET = $IA4;
  my $HASHK_PTR   = $IA6;

  my $HKEYS_READY = $IA7;

  my $CTR_BLOCKz = "%zmm2";
  my $CTR_BLOCKx = "%xmm2";

  # ; hardcoded in GCM_INIT

  my $AAD_HASHz = "%zmm14";
  my $AAD_HASHx = "%xmm14";

  # ; hardcoded in GCM_COMPLETE

  my $ZTMP0  = "%zmm0";
  my $ZTMP1  = "%zmm3";
  my $ZTMP2  = "%zmm4";
  my $ZTMP3  = "%zmm5";
  my $ZTMP4  = "%zmm6";
  my $ZTMP5  = "%zmm7";
  my $ZTMP6  = "%zmm10";
  my $ZTMP7  = "%zmm11";
  my $ZTMP8  = "%zmm12";
  my $ZTMP9  = "%zmm13";
  my $ZTMP10 = "%zmm15";
  my $ZTMP11 = "%zmm16";
  my $ZTMP12 = "%zmm17";

  my $ZTMP13 = "%zmm19";
  my $ZTMP14 = "%zmm20";
  my $ZTMP15 = "%zmm21";
  my $ZTMP16 = "%zmm30";
  my $ZTMP17 = "%zmm31";
  my $ZTMP18 = "%zmm1";
  my $ZTMP19 = "%zmm18";
  my $ZTMP20 = "%zmm8";
  my $ZTMP21 = "%zmm22";
  my $ZTMP22 = "%zmm23";

  my $GH        = "%zmm24";
  my $GL        = "%zmm25";
  my $GM        = "%zmm26";
  my $SHUF_MASK = "%zmm29";

  # ; Unused in the small packet path
  my $ADDBE_4x4  = "%zmm27";
  my $ADDBE_1234 = "%zmm28";

  my $MASKREG = "%k1";

  my $label_suffix = $label_count++;

  # ;; reduction every 48 blocks, depth 32 blocks
  # ;; @note 48 blocks is the maximum capacity of the stack frame
  my $big_loop_nblocks = 48;
  my $big_loop_depth   = 32;

  # ;;; Macro flow depending on packet size
  # ;;; - LENGTH <= 16 blocks
  # ;;;   - cipher followed by hashing (reduction)
  # ;;; - 16 blocks < LENGTH < 32 blocks
  # ;;;   - cipher 16 blocks
  # ;;;   - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;; - 32 blocks < LENGTH < 48 blocks
  # ;;;   - cipher 2 x 16 blocks
  # ;;;   - hash 16 blocks
  # ;;;   - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;; - LENGTH >= 48 blocks
  # ;;;   - cipher 2 x 16 blocks
  # ;;;   - while (data_to_cipher >= 48 blocks):
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - cipher 16 blocks & hash 16 blocks (reduction)
  # ;;;   - if (data_to_cipher >= 32 blocks):
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - hash 16 blocks (reduction)
  # ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;;   - elif (data_to_cipher >= 16 blocks):
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - hash 16 blocks
  # ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;;   - else:
  # ;;;     - hash 16 blocks
  # ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)

  # ;; bail out early on a zero-length buffer
  if ($win64) {
    $code .= "cmpq              \$0,$PLAIN_CIPH_LEN\n";
  } else {
    $code .= "or                $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
  }
  $code .= "je            .L_enc_dec_done_${label_suffix}\n";

  # Length value from context $CTX_OFFSET_InLen`($GCM128_CTX) is updated in
  # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc'

  # ;; $HKEYS_READY = 0: hash keys beyond the first 16 not yet computed on stack
  $code .= "xor                $HKEYS_READY, $HKEYS_READY\n";
  $code .= "vmovdqu64         `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";

  # ;; Used for the update flow - if there was a previous partial
  # ;; block fill the remaining bytes here.
  &PARTIAL_BLOCK(
    $GCM128_CTX,  $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
    $DATA_OFFSET, $AAD_HASHx,  $ENC_DEC,        $IA0,           $IA1,
    $IA2,         $ZTMP0,      $ZTMP1,          $ZTMP2,         $ZTMP3,
    $ZTMP4,       $ZTMP5,      $ZTMP6,          $ZTMP7,         $MASKREG);

  $code .= "vmovdqu64         `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";

  # ;; Save the amount of data left to process in $LENGTH
  # ;; NOTE: PLAIN_CIPH_LEN is a register on linux;
  if ($win64) {
    $code .= "mov               $PLAIN_CIPH_LEN,$LENGTH\n";
  }

  # ;; There may be no more data if it was consumed in the partial block.
  $code .= <<___;
        sub               $DATA_OFFSET,$LENGTH
        je                .L_enc_dec_done_${label_suffix}
___

  $code .= <<___;
        cmp               \$`(16 * 16)`,$LENGTH
        jbe              .L_message_below_equal_16_blocks_${label_suffix}

        vmovdqa64         SHUF_MASK(%rip),$SHUF_MASK
        vmovdqa64         ddq_addbe_4444(%rip),$ADDBE_4x4
        vmovdqa64         ddq_addbe_1234(%rip),$ADDBE_1234

        # ;; start the pipeline
        # ;; - 32 blocks aes-ctr
        # ;; - 16 blocks ghash + aes-ctr

        # ;; set up CTR_CHECK
        vmovd             $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
        and               \$255,@{[DWORD($CTR_CHECK)]}
        # ;; in LE format after init, convert to BE
        vshufi64x2        \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
        vpshufb           $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
___

  # ;; ==== AES-CTR - first 16 blocks
  my $aesout_offset      = ($STACK_LOCAL_OFFSET + (0 * 16));
  my $data_in_out_offset = 0;
  &INITIAL_BLOCKS_16(
    $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS,      $DATA_OFFSET,        "no_ghash", $CTR_BLOCKz,
    $CTR_CHECK,     $ADDBE_4x4,      $ADDBE_1234,    $ZTMP0,              $ZTMP1,     $ZTMP2,
    $ZTMP3,         $ZTMP4,          $ZTMP5,         $ZTMP6,              $ZTMP7,     $ZTMP8,
    $SHUF_MASK,     $ENC_DEC,        $aesout_offset, $data_in_out_offset, $IA0);

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    "first16");

  $code .= <<___;
        cmp               \$`(32 * 16)`,$LENGTH
        jb                .L_message_below_32_blocks_${label_suffix}
___

  # ;; ==== AES-CTR - next 16 blocks
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (16 * 16);
  &INITIAL_BLOCKS_16(
    $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS,      $DATA_OFFSET,        "no_ghash", $CTR_BLOCKz,
    $CTR_CHECK,     $ADDBE_4x4,      $ADDBE_1234,    $ZTMP0,              $ZTMP1,     $ZTMP2,
    $ZTMP3,         $ZTMP4,          $ZTMP5,         $ZTMP6,              $ZTMP7,     $ZTMP8,
    $SHUF_MASK,     $ENC_DEC,        $aesout_offset, $data_in_out_offset, $IA0);

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    "last32");
  # ;; all 48 hash keys are now available in the stack frame
  $code .= "mov     \$1,$HKEYS_READY\n";

  $code .= <<___;
        add               \$`(32 * 16)`,$DATA_OFFSET
        sub               \$`(32 * 16)`,$LENGTH

        cmp               \$`($big_loop_nblocks * 16)`,$LENGTH
        jb                .L_no_more_big_nblocks_${label_suffix}
___

  # ;; ====
  # ;; ==== AES-CTR + GHASH - 48 blocks loop
  # ;; ====
  $code .= ".L_encrypt_big_nblocks_${label_suffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (32 * 16));
  $data_in_out_offset = (0 * 16);
  my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    48,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "first_time",    $ENC_DEC,     $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (16 * 16);
  $ghashin_offset     = ($STACK_LOCAL_OFFSET + (16 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    32,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "no_reduction",  $ENC_DEC,     $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (32 * 16);
  $ghashin_offset     = ($STACK_LOCAL_OFFSET + (32 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,    $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    16,        $aesout_offset,  $ghashin_offset,   $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,            $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,           $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,           $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,           $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "final_reduction", $ENC_DEC,     $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; === xor cipher block 0 with GHASH (ZT4)
  $code .= <<___;
        vmovdqa64         $ZTMP4,$AAD_HASHz

        add               \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
        sub               \$`($big_loop_nblocks * 16)`,$LENGTH
        cmp               \$`($big_loop_nblocks * 16)`,$LENGTH
        jae               .L_encrypt_big_nblocks_${label_suffix}

.L_no_more_big_nblocks_${label_suffix}:

        cmp               \$`(32 * 16)`,$LENGTH
        jae               .L_encrypt_32_blocks_${label_suffix}

        cmp               \$`(16 * 16)`,$LENGTH
        jae               .L_encrypt_16_blocks_${label_suffix}
___

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ====      then GHASH N blocks
  $code .= ".L_encrypt_0_blocks_ghash_32_${label_suffix}:\n";

  # ;; calculate offset to the right hash key
  $code .= <<___;
mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
and               \$~15,@{[DWORD($IA0)]}
mov               \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub               @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___

  # ;; ==== GHASH 32 blocks and follow with reduction
  &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
    "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $code .= "add               \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
  &GCM_ENC_DEC_LAST(
    $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK,  $HASHK_PTR,      $ghashin_offset, $SHUF_MASK,   $ZTMP0,
    $ZTMP1,      $ZTMP2,      $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,
    $ZTMP7,      $ZTMP8,      $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,
    $ZTMP13,     $ZTMP14,     $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,
    $ZTMP19,     $ZTMP20,     $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,
    "mid",       $GL,         $GH,             $GM,             $ENC_DEC,     $AAD_HASHz,
    $IA0,        $IA5,        $MASKREG,        $PBLOCK_LEN);

  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= "jmp           .L_ghash_done_${label_suffix}\n";

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH & encrypt 1 x 16 blocks
  # ;; ==== GHASH & encrypt 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction)
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ====      then GHASH N blocks
  $code .= ".L_encrypt_32_blocks_${label_suffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset  = ($STACK_LOCAL_OFFSET + (32 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (0 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    48,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "first_time",    $ENC_DEC,     $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  $aesout_offset  = ($STACK_LOCAL_OFFSET + (0 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (16 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    32,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "no_reduction",  $ENC_DEC,     $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; ==== GHASH 16 blocks with reduction
  &GHASH_16(
    "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
    "%rsp", &HashKeyOffsetByIdx(16, "frame"),
    0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $code .= <<___;
        sub               \$`(32 * 16)`,$LENGTH
        add               \$`(32 * 16)`,$DATA_OFFSET
___

  # ;; calculate offset to the right hash key
  $code .= "mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  $code .= <<___;
        and               \$~15,@{[DWORD($IA0)]}
        mov               \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
        sub               @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
  &GCM_ENC_DEC_LAST(
    $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK,  $HASHK_PTR,      $ghashin_offset, $SHUF_MASK,   $ZTMP0,
    $ZTMP1,      $ZTMP2,      $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,
    $ZTMP7,      $ZTMP8,      $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,
    $ZTMP13,     $ZTMP14,     $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,
    $ZTMP19,     $ZTMP20,     $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,
    "start",     $GL,         $GH,             $GM,             $ENC_DEC,     $AAD_HASHz,
    $IA0,        $IA5,        $MASKREG,        $PBLOCK_LEN);

  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= "jmp           .L_ghash_done_${label_suffix}\n";

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH & encrypt 16 blocks (done before)
  # ;; ==== GHASH 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ====      then GHASH N blocks
  $code .= ".L_encrypt_16_blocks_${label_suffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset  = ($STACK_LOCAL_OFFSET + (32 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (0 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    48,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "first_time",    $ENC_DEC,     $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== GHASH 1 x 16 blocks
  &GHASH_16(
    "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
    "%rsp", &HashKeyOffsetByIdx(32, "frame"),
    0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  $code .= <<___;
        sub               \$`(16 * 16)`,$LENGTH
        add               \$`(16 * 16)`,$DATA_OFFSET
___
  &GCM_ENC_DEC_LAST(
    $AES_KEYS,    $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
    $DATA_OFFSET, $LENGTH,     $CTR_BLOCKz,     $CTR_CHECK,
    &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
    $ZTMP1,       $ZTMP2,     $ZTMP3,     $ZTMP4,
    $ZTMP5,       $ZTMP6,     $ZTMP7,     $ZTMP8,
    $ZTMP9,       $ZTMP10,    $ZTMP11,    $ZTMP12,
    $ZTMP13,      $ZTMP14,    $ZTMP15,    $ZTMP16,
    $ZTMP17,      $ZTMP18,    $ZTMP19,    $ZTMP20,
    $ZTMP21,      $ZTMP22,    $ADDBE_4x4, $ADDBE_1234,
    "end_reduce", $GL,        $GH,        $GM,
    $ENC_DEC,     $AAD_HASHz, $IA0,       $IA5,
    $MASKREG,     $PBLOCK_LEN);

  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= <<___;
        jmp               .L_ghash_done_${label_suffix}

.L_message_below_32_blocks_${label_suffix}:
        # ;; 32 > number of blocks > 16

        sub               \$`(16 * 16)`,$LENGTH
        add               \$`(16 * 16)`,$DATA_OFFSET
___
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));

  # ;; calculate offset to the right hash key
  $code .= "mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";

  &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
    "mid16");
  # ;; hash keys 17..32 are now available in the stack frame
  $code .= "mov     \$1,$HKEYS_READY\n";

  $code .= <<___;
and               \$~15,@{[DWORD($IA0)]}
mov               \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub               @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___

  &GCM_ENC_DEC_LAST(
    $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK,  $HASHK_PTR,      $ghashin_offset, $SHUF_MASK,   $ZTMP0,
    $ZTMP1,      $ZTMP2,      $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,
    $ZTMP7,      $ZTMP8,      $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,
    $ZTMP13,     $ZTMP14,     $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,
    $ZTMP19,     $ZTMP20,     $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,
    "start",     $GL,         $GH,             $GM,             $ENC_DEC,     $AAD_HASHz,
    $IA0,        $IA5,        $MASKREG,        $PBLOCK_LEN);

  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= <<___;
        jmp           .L_ghash_done_${label_suffix}

.L_message_below_equal_16_blocks_${label_suffix}:
        # ;; Determine how many blocks to process
        # ;; - process one additional block if there is a partial block
        mov               @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
        add               \$15,@{[DWORD($IA1)]}
        shr               \$4, @{[DWORD($IA1)]}     # ; $IA1 can be in the range from 0 to 16
___
  &GCM_ENC_DEC_SMALL(
    $AES_KEYS,    $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
    $DATA_OFFSET, $LENGTH,     $IA1,            $CTR_BLOCKx,    $AAD_HASHx,      $ZTMP0,
    $ZTMP1,       $ZTMP2,      $ZTMP3,          $ZTMP4,         $ZTMP5,          $ZTMP6,
    $ZTMP7,       $ZTMP8,      $ZTMP9,          $ZTMP10,        $ZTMP11,         $ZTMP12,
    $ZTMP13,      $ZTMP14,     $IA0,            $IA3,           $MASKREG,        $SHUF_MASK,
    $PBLOCK_LEN);

  # ;; fall through to exit

  $code .= ".L_ghash_done_${label_suffix}:\n";

  # ;; save the last counter block
  $code .= "vmovdqu64         $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
  $code .= <<___;
        vmovdqu64         $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
.L_enc_dec_done_${label_suffix}:
___
}
4119
4120# ;;; ===========================================================================
4121# ;;; Encrypt/decrypt the initial 16 blocks
# ;;; Builds 16 counter blocks (with a dedicated slow path when the 8-bit
# ;;; counter byte would wrap), AES-encrypts them with $AES_KEYS, XORs the
# ;;; result against the input text, stores the output, and parks the
# ;;; byte-reflected cipher text on the stack frame for later GHASH rounds.
4122sub INITIAL_BLOCKS_16 {
4123  my $IN          = $_[0];     # [in] input buffer
4124  my $OUT         = $_[1];     # [in] output buffer
4125  my $AES_KEYS    = $_[2];     # [in] pointer to expanded keys
4126  my $DATA_OFFSET = $_[3];     # [in] data offset
4127  my $GHASH       = $_[4];     # [in] ZMM with AAD (low 128 bits)
4128  my $CTR         = $_[5];     # [in] ZMM with CTR BE blocks 4x128 bits
4129  my $CTR_CHECK   = $_[6];     # [in/out] GPR with counter overflow check
4130  my $ADDBE_4x4   = $_[7];     # [in] ZMM 4x128bits with value 4 (big endian)
4131  my $ADDBE_1234  = $_[8];     # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
4132  my $T0          = $_[9];     # [clobbered] temporary ZMM register
4133  my $T1          = $_[10];    # [clobbered] temporary ZMM register
4134  my $T2          = $_[11];    # [clobbered] temporary ZMM register
4135  my $T3          = $_[12];    # [clobbered] temporary ZMM register
4136  my $T4          = $_[13];    # [clobbered] temporary ZMM register
4137  my $T5          = $_[14];    # [clobbered] temporary ZMM register
4138  my $T6          = $_[15];    # [clobbered] temporary ZMM register
4139  my $T7          = $_[16];    # [clobbered] temporary ZMM register
4140  my $T8          = $_[17];    # [clobbered] temporary ZMM register
4141  my $SHUF_MASK   = $_[18];    # [in] ZMM with BE/LE shuffle mask
4142  my $ENC_DEC     = $_[19];    # [in] ENC (encrypt) or DEC (decrypt) selector
4143  my $BLK_OFFSET  = $_[20];    # [in] stack frame offset to ciphered blocks
4144  my $DATA_DISPL  = $_[21];    # [in] fixed numerical data displacement/offset
4145  my $IA0         = $_[22];    # [clobered] temporary GP register
4146
  # ; Aliases: the last four temporaries carry the four groups of 4 counter
  # ; blocks (16 blocks total) through the AES rounds.
4147  my $B00_03 = $T5;
4148  my $B04_07 = $T6;
4149  my $B08_11 = $T7;
4150  my $B12_15 = $T8;
4151
4152  my $label_suffix = $label_count++;
4153
4154  my $stack_offset = $BLK_OFFSET;
4155  $code .= <<___;
4156        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4157        # ;; prepare counter blocks
4158
4159        cmpb              \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
4160        jae               .L_next_16_overflow_${label_suffix}
4161        vpaddd            $ADDBE_1234,$CTR,$B00_03
4162        vpaddd            $ADDBE_4x4,$B00_03,$B04_07
4163        vpaddd            $ADDBE_4x4,$B04_07,$B08_11
4164        vpaddd            $ADDBE_4x4,$B08_11,$B12_15
4165        jmp               .L_next_16_ok_${label_suffix}
4166.L_next_16_overflow_${label_suffix}:
4167        vpshufb           $SHUF_MASK,$CTR,$CTR
4168        vmovdqa64         ddq_add_4444(%rip),$B12_15
4169        vpaddd            ddq_add_1234(%rip),$CTR,$B00_03
4170        vpaddd            $B12_15,$B00_03,$B04_07
4171        vpaddd            $B12_15,$B04_07,$B08_11
4172        vpaddd            $B12_15,$B08_11,$B12_15
4173        vpshufb           $SHUF_MASK,$B00_03,$B00_03
4174        vpshufb           $SHUF_MASK,$B04_07,$B04_07
4175        vpshufb           $SHUF_MASK,$B08_11,$B08_11
4176        vpshufb           $SHUF_MASK,$B12_15,$B12_15
4177.L_next_16_ok_${label_suffix}:
4178        vshufi64x2        \$0b11111111,$B12_15,$B12_15,$CTR
4179        addb               \$16,@{[BYTE($CTR_CHECK)]}
4180        # ;; === load 16 blocks of data
4181        vmovdqu8          `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0
4182        vmovdqu8          `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1
4183        vmovdqu8          `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2
4184        vmovdqu8          `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3
4185
4186        # ;; move to AES encryption rounds
4187        vbroadcastf64x2    `(16*0)`($AES_KEYS),$T4
4188        vpxorq            $T4,$B00_03,$B00_03
4189        vpxorq            $T4,$B04_07,$B04_07
4190        vpxorq            $T4,$B08_11,$B08_11
4191        vpxorq            $T4,$B12_15,$B12_15
4192___
  # ; Middle AES rounds 1 .. NROUNDS, one broadcast round key per iteration.
4193  foreach (1 .. ($NROUNDS)) {
4194    $code .= <<___;
4195        vbroadcastf64x2    `(16*$_)`($AES_KEYS),$T4
4196        vaesenc            $T4,$B00_03,$B00_03
4197        vaesenc            $T4,$B04_07,$B04_07
4198        vaesenc            $T4,$B08_11,$B08_11
4199        vaesenc            $T4,$B12_15,$B12_15
4200___
4201  }
4202  $code .= <<___;
4203        vbroadcastf64x2    `(16*($NROUNDS+1))`($AES_KEYS),$T4
4204        vaesenclast         $T4,$B00_03,$B00_03
4205        vaesenclast         $T4,$B04_07,$B04_07
4206        vaesenclast         $T4,$B08_11,$B08_11
4207        vaesenclast         $T4,$B12_15,$B12_15

4209        # ;;  xor against text
4210        vpxorq            $T0,$B00_03,$B00_03
4211        vpxorq            $T1,$B04_07,$B04_07
4212        vpxorq            $T2,$B08_11,$B08_11
4213        vpxorq            $T3,$B12_15,$B12_15

4215        # ;; store
4216        mov               $OUT, $IA0
4217        vmovdqu8          $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1)
4218        vmovdqu8          $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1)
4219        vmovdqu8          $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1)
4220        vmovdqu8          $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1)
4221___
  # ; GHASH consumes the CIPHER text: for DEC that is the input ($T0..$T3),
  # ; for ENC it is the freshly produced output ($B00_03..$B12_15).
4222  if ($ENC_DEC eq "DEC") {
4223    $code .= <<___;
4224        # ;; decryption - cipher text needs to go to GHASH phase
4225        vpshufb           $SHUF_MASK,$T0,$B00_03
4226        vpshufb           $SHUF_MASK,$T1,$B04_07
4227        vpshufb           $SHUF_MASK,$T2,$B08_11
4228        vpshufb           $SHUF_MASK,$T3,$B12_15
4229___
4230  } else {
4231    $code .= <<___;
4232        # ;; encryption
4233        vpshufb           $SHUF_MASK,$B00_03,$B00_03
4234        vpshufb           $SHUF_MASK,$B04_07,$B04_07
4235        vpshufb           $SHUF_MASK,$B08_11,$B08_11
4236        vpshufb           $SHUF_MASK,$B12_15,$B12_15
___
4238  }
4239
4240  if ($GHASH ne "no_ghash") {
4241    $code .= <<___;
4242        # ;; === xor cipher block 0 with GHASH for the next GHASH round
4243        vpxorq            $GHASH,$B00_03,$B00_03
4244___
4245  }
  # ; Park the (reflected) cipher text on the local frame for the GHASH phase.
4246  $code .= <<___;
4247        vmovdqa64         $B00_03,`$stack_offset + (0 * 64)`(%rsp)
4248        vmovdqa64         $B04_07,`$stack_offset + (1 * 64)`(%rsp)
4249        vmovdqa64         $B08_11,`$stack_offset + (2 * 64)`(%rsp)
4250        vmovdqa64         $B12_15,`$stack_offset + (3 * 64)`(%rsp)
4251___
4252}
4253
4254# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4255# ; GCM_COMPLETE Finishes ghash calculation
# ; Folds any outstanding partial block into the hash, mixes in the
# ; len(A)||len(C) block, XORs with E(K,Y0) and writes the resulting tag
# ; back into the AadHash slot of the context.
4256# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4257sub GCM_COMPLETE {
4258  my $GCM128_CTX = $_[0];    # [in] pointer to gcm128 context
4259  my $PBLOCK_LEN = $_[1];    # [in] length of the outstanding partial block (0 if none)
4260
4261  my $label_suffix = $label_count++;
4262
4263  $code .= <<___;
4264        vmovdqu           @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2
4265        vmovdqu           $CTX_OFFSET_EK0($GCM128_CTX),%xmm3      # ; xmm3 = E(K,Y0)
4266___
4267
4268  $code .= <<___;
4269        vmovdqu           `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4

4271        # ;; Process the final partial block.
4272        cmp               \$0,$PBLOCK_LEN
4273        je                .L_partial_done_${label_suffix}
___
4274
4275  #  ;GHASH computation for the last <16 Byte block
4276  &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4277
4278  $code .= <<___;
4279.L_partial_done_${label_suffix}:
4280        vmovq           `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5
4281        vpinsrq         \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5    #  ; xmm5 = len(A)||len(C)
4282        vpsllq          \$3, %xmm5, %xmm5                                       #  ; convert bytes into bits

4284        vpxor           %xmm5,%xmm4,%xmm4
___
4286
  # ; Final GHASH round: fold len(A)||len(C) into the hash value.
4287  &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4288
4289  $code .= <<___;
4290        vpshufb         SHUF_MASK(%rip),%xmm4,%xmm4      # ; perform a 16Byte swap
4291        vpxor           %xmm4,%xmm3,%xmm3

4293.L_return_T_${label_suffix}:
4294        vmovdqu           %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX)
___
4296}
4298
4299# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4300# ;;; Functions definitions
4301# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4302
4303$code .= ".text\n";
4304{
4305  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4306  # ;void   ossl_aes_gcm_init_avx512 /
4307  # ;       (const void *aes_keys,
4308  # ;        void *gcm128ctx)
4309  # ;
4310  # ; Precomputes hashkey table for GHASH optimization.
4311  # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4312  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4313  $code .= <<___;
4314.globl ossl_aes_gcm_init_avx512
4315.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4316.align 32
4317ossl_aes_gcm_init_avx512:
4318.cfi_startproc
4319        endbranch
___
4321  if ($CHECK_FUNCTION_ARGUMENTS) {
4322    $code .= <<___;
4323        # ;; Check aes_keys != NULL
4324        test               $arg1,$arg1
4325        jz                .Labort_init

4327        # ;; Check gcm128ctx != NULL
4328        test               $arg2,$arg2
4329        jz                .Labort_init
___
4331  }
  # ; HashKey = E(K, 0^128), computed by encrypting an all-zero block.
4332  $code .= "vpxorq            %xmm16,%xmm16,%xmm16\n";
4333  &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax");    # ; xmm16 = HashKey
4334  $code .= <<___;
4335        vpshufb           SHUF_MASK(%rip),%xmm16,%xmm16
4336        # ;;;  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
4337        vmovdqa64         %xmm16,%xmm2
4338        vpsllq            \$1,%xmm16,%xmm16
4339        vpsrlq            \$63,%xmm2,%xmm2
4340        vmovdqa           %xmm2,%xmm1
4341        vpslldq           \$8,%xmm2,%xmm2
4342        vpsrldq           \$8,%xmm1,%xmm1
4343        vporq             %xmm2,%xmm16,%xmm16
4344        # ;reduction
4345        vpshufd           \$0b00100100,%xmm1,%xmm2
4346        vpcmpeqd          TWOONE(%rip),%xmm2,%xmm2
4347        vpand             POLY(%rip),%xmm2,%xmm2
4348        vpxorq            %xmm2,%xmm16,%xmm16                  # ; xmm16 holds the HashKey<<1 mod poly
4349        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4350        vmovdqu64         %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
___
  # ; Derive and store the remaining hash-key powers in the context.
4352  &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4353  if ($CLEAR_SCRATCH_REGISTERS) {
4354    &clear_scratch_gps_asm();
4355    &clear_scratch_zmms_asm();
4356  } else {
4357    $code .= "vzeroupper\n";
4358  }
4359  $code .= <<___;
4360.Labort_init:
4361ret
4362.cfi_endproc
4363.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
___
4365}
4366
4367# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4368# ;void   ossl_aes_gcm_setiv_avx512
4369# ;       (const void *aes_keys,
4370# ;        void *gcm128ctx,
4371# ;        const unsigned char *iv,
4372# ;        size_t ivlen)
4373# ;
4374# ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
4375# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4376$code .= <<___;
4377.globl ossl_aes_gcm_setiv_avx512
4378.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
4379.align 32
4380ossl_aes_gcm_setiv_avx512:
4381.cfi_startproc
4382.Lsetiv_seh_begin:
4383        endbranch
___
4385if ($CHECK_FUNCTION_ARGUMENTS) {
4386  $code .= <<___;
4387        # ;; Check aes_keys != NULL
4388        test               $arg1,$arg1
4389        jz                 .Labort_setiv

4391        # ;; Check gcm128ctx != NULL
4392        test               $arg2,$arg2
4393        jz                 .Labort_setiv

4395        # ;; Check iv != NULL
4396        test               $arg3,$arg3
4397        jz                 .Labort_setiv

4399        # ;; Check ivlen != 0
4400        test               $arg4,$arg4
4401        jz                 .Labort_setiv
___
4403}
4404
4405# ; NOTE: code before PROLOG() must not modify any registers
4406&PROLOG(
4407  1,    # allocate stack space for hkeys
4408  0,    # do not allocate stack space for AES blocks
4409  "setiv");
4410&GCM_INIT_IV(
4411  "$arg1",  "$arg2",  "$arg3",  "$arg4",  "%r10",   "%r11",  "%r12",  "%k1",   "%xmm2",  "%zmm1",
4412  "%zmm11", "%zmm3",  "%zmm4",  "%zmm5",  "%zmm6",  "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12",
4413  "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
4414&EPILOG(
4415  1,    # hkeys were allocated
4416  $arg4);
4417$code .= <<___;
4418.Labort_setiv:
4419ret
4420.Lsetiv_seh_end:
4421.cfi_endproc
4422.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
___
4424
4425# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4426# ;void ossl_aes_gcm_update_aad_avx512
4427# ;     (unsigned char *gcm128ctx,
4428# ;      const unsigned char *aad,
4429# ;      size_t aadlen)
4430# ;
4431# ; Updates AAD hash in gcm128_context structure.
4432# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4433$code .= <<___;
4434.globl ossl_aes_gcm_update_aad_avx512
4435.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
4436.align 32
4437ossl_aes_gcm_update_aad_avx512:
4438.cfi_startproc
4439.Lghash_seh_begin:
4440        endbranch
___
4442if ($CHECK_FUNCTION_ARGUMENTS) {
4443  $code .= <<___;
4444        # ;; Check gcm128ctx != NULL
4445        test               $arg1,$arg1
4446        jz                 .Lexit_update_aad

4448        # ;; Check aad != NULL
4449        test               $arg2,$arg2
4450        jz                 .Lexit_update_aad

4452        # ;; Check aadlen != 0
4453        test               $arg3,$arg3
4454        jz                 .Lexit_update_aad
___
4456}
4457
4458# ; NOTE: code before PROLOG() must not modify any registers
4459&PROLOG(
4460  1,    # allocate stack space for hkeys,
4461  0,    # do not allocate stack space for AES blocks
4462  "ghash");
4463&GCM_UPDATE_AAD(
4464  "$arg1",  "$arg2",  "$arg3",  "%r10",   "%r11",  "%r12",  "%k1",   "%xmm14", "%zmm1",  "%zmm11",
4465  "%zmm3",  "%zmm4",  "%zmm5",  "%zmm6",  "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13",
4466  "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
4467&EPILOG(
4468  1,    # hkeys were allocated
4469  $arg3);
4470$code .= <<___;
4471.Lexit_update_aad:
4472ret
4473.Lghash_seh_end:
4474.cfi_endproc
4475.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
___
4477
4478# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4479# ;void   ossl_aes_gcm_encrypt_avx512
4480# ;       (const void* aes_keys,
4481# ;        void *gcm128ctx,
4482# ;        unsigned int *pblocklen,
4483# ;        const unsigned char *in,
4484# ;        size_t len,
4485# ;        unsigned char *out);
4486# ;
4487# ; Performs encryption of data |in| of len |len|, and stores the output in |out|.
4488# ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
4489# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4490$code .= <<___;
4491.globl ossl_aes_gcm_encrypt_avx512
4492.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
4493.align 32
4494ossl_aes_gcm_encrypt_avx512:
4495.cfi_startproc
4496.Lencrypt_seh_begin:
4497        endbranch
___
4499
4500# ; NOTE: code before PROLOG() must not modify any registers
4501&PROLOG(
4502  1,    # allocate stack space for hkeys
4503  1,    # allocate stack space for AES blocks
4504  "encrypt");
4505if ($CHECK_FUNCTION_ARGUMENTS) {
4506  $code .= <<___;
4507        # ;; Check aes_keys != NULL
4508        test               $arg1,$arg1
4509        jz                 .Lexit_gcm_encrypt

4511        # ;; Check gcm128ctx != NULL
4512        test               $arg2,$arg2
4513        jz                 .Lexit_gcm_encrypt

4515        # ;; Check pblocklen != NULL
4516        test               $arg3,$arg3
4517        jz                 .Lexit_gcm_encrypt

4519        # ;; Check in != NULL
4520        test               $arg4,$arg4
4521        jz                 .Lexit_gcm_encrypt

4523        # ;; Check if len != 0
4524        cmp                \$0,$arg5
4525        jz                 .Lexit_gcm_encrypt

4527        # ;; Check out != NULL
4528        cmp                \$0,$arg6
4529        jz                 .Lexit_gcm_encrypt
___
4531}
4532$code .= <<___;
4533        # ; load number of rounds from AES_KEY structure (offset in bytes is
4534        # ; size of the |rd_key| buffer)
4535        mov             `4*15*4`($arg1),%eax
4536        cmp             \$9,%eax
4537        je              .Laes_gcm_encrypt_128_avx512
4538        cmp             \$11,%eax
4539        je              .Laes_gcm_encrypt_192_avx512
4540        cmp             \$13,%eax
4541        je              .Laes_gcm_encrypt_256_avx512
4542        xor             %eax,%eax
4543        jmp             .Lexit_gcm_encrypt
___
# ; Emit one specialized body per key length (128/192/256).
4545for my $keylen (sort keys %aes_rounds) {
4546  $NROUNDS = $aes_rounds{$keylen};
4547  $code .= <<___;
4548.align 32
4549.Laes_gcm_encrypt_${keylen}_avx512:
___
4551  &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
4552  $code .= "jmp .Lexit_gcm_encrypt\n";
4553}
4554$code .= ".Lexit_gcm_encrypt:\n";
4555&EPILOG(1, $arg5);
4556$code .= <<___;
4557ret
4558.Lencrypt_seh_end:
4559.cfi_endproc
4560.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
___
4562
4563# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4564# ;void   ossl_aes_gcm_decrypt_avx512
4565# ;       (const void* keys,
4566# ;        void *gcm128ctx,
4567# ;        unsigned int *pblocklen,
4568# ;        const unsigned char *in,
4569# ;        size_t len,
4570# ;        unsigned char *out);
4571# ;
4572# ; Performs decryption of data |in| of len |len|, and stores the output in |out|.
4573# ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
4574# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4575$code .= <<___;
4576.globl ossl_aes_gcm_decrypt_avx512
4577.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
4578.align 32
4579ossl_aes_gcm_decrypt_avx512:
4580.cfi_startproc
4581.Ldecrypt_seh_begin:
4582        endbranch
___
4584
4585# ; NOTE: code before PROLOG() must not modify any registers
4586&PROLOG(
4587  1,    # allocate stack space for hkeys
4588  1,    # allocate stack space for AES blocks
4589  "decrypt");
4590if ($CHECK_FUNCTION_ARGUMENTS) {
4591  $code .= <<___;
4592        # ;; Check keys != NULL
4593        test               $arg1,$arg1
4594        jz                 .Lexit_gcm_decrypt

4596        # ;; Check gcm128ctx != NULL
4597        test               $arg2,$arg2
4598        jz                 .Lexit_gcm_decrypt

4600        # ;; Check pblocklen != NULL
4601        test               $arg3,$arg3
4602        jz                 .Lexit_gcm_decrypt

4604        # ;; Check in != NULL
4605        test               $arg4,$arg4
4606        jz                 .Lexit_gcm_decrypt

4608        # ;; Check if len != 0
4609        cmp                \$0,$arg5
4610        jz                 .Lexit_gcm_decrypt

4612        # ;; Check out != NULL
4613        cmp                \$0,$arg6
4614        jz                 .Lexit_gcm_decrypt
___
4616}
4617$code .= <<___;
4618        # ; load number of rounds from AES_KEY structure (offset in bytes is
4619        # ; size of the |rd_key| buffer)
4620        mov             `4*15*4`($arg1),%eax
4621        cmp             \$9,%eax
4622        je              .Laes_gcm_decrypt_128_avx512
4623        cmp             \$11,%eax
4624        je              .Laes_gcm_decrypt_192_avx512
4625        cmp             \$13,%eax
4626        je              .Laes_gcm_decrypt_256_avx512
4627        xor             %eax,%eax
4628        jmp             .Lexit_gcm_decrypt
___
# ; Emit one specialized body per key length (128/192/256).
4630for my $keylen (sort keys %aes_rounds) {
4631  $NROUNDS = $aes_rounds{$keylen};
4632  $code .= <<___;
4633.align 32
4634.Laes_gcm_decrypt_${keylen}_avx512:
___
4636  &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
4637  $code .= "jmp .Lexit_gcm_decrypt\n";
4638}
4639$code .= ".Lexit_gcm_decrypt:\n";
4640&EPILOG(1, $arg5);
4641$code .= <<___;
4642ret
4643.Ldecrypt_seh_end:
4644.cfi_endproc
4645.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
___
4647
4648# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4649# ;void   ossl_aes_gcm_finalize_avx512
4650# ;       (void *gcm128ctx,
4651# ;       unsigned int pblocklen);
4652# ;
4653# ; Finalizes encryption / decryption
4654# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4655# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4656$code .= <<___;
4657.globl ossl_aes_gcm_finalize_avx512
4658.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
4659.align 32
4660ossl_aes_gcm_finalize_avx512:
4661.cfi_startproc
4662        endbranch
___
4664if ($CHECK_FUNCTION_ARGUMENTS) {
4665  $code .= <<___;
4666        # ;; Check gcm128ctx != NULL
4667        test               $arg1,$arg1
4668        jz                 .Labort_finalize
___
4670}
4671
# ; GCM_COMPLETE computes the tag and stores it in the context's AadHash slot.
4672&GCM_COMPLETE("$arg1", "$arg2");
4673
4674$code .= <<___;
4675.Labort_finalize:
4676ret
4677.cfi_endproc
4678.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
___
4680
4681# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4682# ;void ossl_gcm_gmult_avx512(u64 Xi[2],
4683# ;                           const void* gcm128ctx)
4684# ;
4685# ; Performs a single GHASH multiplication of Xi by the hash key, in place.
4686# ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4687# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4687$code .= <<___;
4688.globl ossl_gcm_gmult_avx512
4689.hidden ossl_gcm_gmult_avx512
4690.type ossl_gcm_gmult_avx512,\@abi-omnipotent
4691.align 32
4692ossl_gcm_gmult_avx512:
4693.cfi_startproc
4694        endbranch
___
4696if ($CHECK_FUNCTION_ARGUMENTS) {
4697  $code .= <<___;
4698        # ;; Check Xi != NULL
4699        test               $arg1,$arg1
4700        jz                 .Labort_gmult

4702        # ;; Check gcm128ctx != NULL
4703        test               $arg2,$arg2
4704        jz                 .Labort_gmult
___
4706}
4707$code .= "vmovdqu64         ($arg1),%xmm1\n";
4708$code .= "vmovdqu64         @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";
4709
4710&GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4711
4712$code .= "vmovdqu64         %xmm1,($arg1)\n";
4713if ($CLEAR_SCRATCH_REGISTERS) {
4714  &clear_scratch_gps_asm();
4715  &clear_scratch_zmms_asm();
4716} else {
4717  $code .= "vzeroupper\n";
4718}
4719$code .= <<___;
4720.Labort_gmult:
4721ret
4722.cfi_endproc
4723.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
___
4725
4726if ($win64) {

4728  # Add unwind metadata for SEH.

4730  # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
4731  my $UWOP_PUSH_NONVOL = 0;
4732  my $UWOP_ALLOC_LARGE = 1;
4733  my $UWOP_SET_FPREG   = 3;
4734  my $UWOP_SAVE_XMM128 = 8;
4735  my %UWOP_REG_NUMBER  = (
4736    rax => 0,
4737    rcx => 1,
4738    rdx => 2,
4739    rbx => 3,
4740    rsp => 4,
4741    rbp => 5,
4742    rsi => 6,
4743    rdi => 7,
4744    map(("r$_" => $_), (8 .. 15)));

  # .pdata: one RUNTIME_FUNCTION entry (begin, end, unwind-info RVA) per
  # non-leaf function emitted above.
4746  $code .= <<___;
4747.section    .pdata
4748.align  4
4749    .rva    .Lsetiv_seh_begin
4750    .rva    .Lsetiv_seh_end
4751    .rva    .Lsetiv_seh_info

4753    .rva    .Lghash_seh_begin
4754    .rva    .Lghash_seh_end
4755    .rva    .Lghash_seh_info

4757    .rva    .Lencrypt_seh_begin
4758    .rva    .Lencrypt_seh_end
4759    .rva    .Lencrypt_seh_info

4761    .rva    .Ldecrypt_seh_begin
4762    .rva    .Ldecrypt_seh_end
4763    .rva    .Ldecrypt_seh_info

4765.section    .xdata
___

  # .xdata: UNWIND_INFO for each function; unwind codes must appear in
  # reverse order of the prolog operations they describe.
4768  foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") {
4769    $code .= <<___;
4770.align  8
4771.L${func_name}_seh_info:
4772    .byte   1   # version 1, no flags
4773    .byte   .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin
4774    .byte   31 # num_slots = 1*8 + 2 + 1 + 2*10
4775    # FR = rbp; Offset from RSP = $XMM_STORAGE scaled on 16
4776    .byte   @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]}
___

4779    # Metadata for %xmm15-%xmm6
4780    # Occupy 2 slots each
4781    for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {

4783      # Scaled-by-16 stack offset
4784      my $xmm_reg_offset = ($reg_idx - 6);
4785      $code .= <<___;
4786    .byte   .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin
4787    .byte   @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]}
4788    .value  $xmm_reg_offset
___
4790    }

4792    $code .= <<___;
4793    # Frame pointer (occupy 1 slot)
4794    .byte   .L${func_name}_seh_setfp-.L${func_name}_seh_begin
4795    .byte   $UWOP_SET_FPREG

4797    # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
4798    .byte   .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin
4799    .byte   $UWOP_ALLOC_LARGE
4800    .value  `($XMM_STORAGE + 8) / 8`
___

4803    # Metadata for GPR regs
4804    # Occupy 1 slot each
4805    foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") {
4806      $code .= <<___;
4807    .byte   .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin
4808    .byte   @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]}
___
4810    }
4811  }
4812}
4813
# ; Read-only data: GHASH polynomial constants, shuffle masks, counter
# ; increment tables (LE and BE forms) and byte-length-to-mask tables used
# ; by the masked vmovdqu8 loads/stores.
4814$code .= <<___;
4815.section .rodata align=16
4816.align 16
4817POLY:   .quad     0x0000000000000001, 0xC200000000000000

4819.align 64
4820POLY2:
4821        .quad     0x00000001C2000000, 0xC200000000000000
4822        .quad     0x00000001C2000000, 0xC200000000000000
4823        .quad     0x00000001C2000000, 0xC200000000000000
4824        .quad     0x00000001C2000000, 0xC200000000000000

4826.align 16
4827TWOONE: .quad     0x0000000000000001, 0x0000000100000000

4829# ;;; Order of these constants should not change.
4830# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
4831.align 64
4832SHUF_MASK:
4833        .quad     0x08090A0B0C0D0E0F, 0x0001020304050607
4834        .quad     0x08090A0B0C0D0E0F, 0x0001020304050607
4835        .quad     0x08090A0B0C0D0E0F, 0x0001020304050607
4836        .quad     0x08090A0B0C0D0E0F, 0x0001020304050607

4838.align 16
4839SHIFT_MASK:
4840        .quad     0x0706050403020100, 0x0f0e0d0c0b0a0908

4842ALL_F:
4843        .quad     0xffffffffffffffff, 0xffffffffffffffff

4845ZERO:
4846        .quad     0x0000000000000000, 0x0000000000000000

4848.align 16
4849ONE:
4850        .quad     0x0000000000000001, 0x0000000000000000

4852.align 16
4853ONEf:
4854        .quad     0x0000000000000000, 0x0100000000000000

4856.align 64
4857ddq_add_1234:
4858        .quad  0x0000000000000001, 0x0000000000000000
4859        .quad  0x0000000000000002, 0x0000000000000000
4860        .quad  0x0000000000000003, 0x0000000000000000
4861        .quad  0x0000000000000004, 0x0000000000000000

4863.align 64
4864ddq_add_5678:
4865        .quad  0x0000000000000005, 0x0000000000000000
4866        .quad  0x0000000000000006, 0x0000000000000000
4867        .quad  0x0000000000000007, 0x0000000000000000
4868        .quad  0x0000000000000008, 0x0000000000000000

4870.align 64
4871ddq_add_4444:
4872        .quad  0x0000000000000004, 0x0000000000000000
4873        .quad  0x0000000000000004, 0x0000000000000000
4874        .quad  0x0000000000000004, 0x0000000000000000
4875        .quad  0x0000000000000004, 0x0000000000000000

4877.align 64
4878ddq_add_8888:
4879        .quad  0x0000000000000008, 0x0000000000000000
4880        .quad  0x0000000000000008, 0x0000000000000000
4881        .quad  0x0000000000000008, 0x0000000000000000
4882        .quad  0x0000000000000008, 0x0000000000000000

4884.align 64
4885ddq_addbe_1234:
4886        .quad  0x0000000000000000, 0x0100000000000000
4887        .quad  0x0000000000000000, 0x0200000000000000
4888        .quad  0x0000000000000000, 0x0300000000000000
4889        .quad  0x0000000000000000, 0x0400000000000000

4891.align 64
4892ddq_addbe_4444:
4893        .quad  0x0000000000000000, 0x0400000000000000
4894        .quad  0x0000000000000000, 0x0400000000000000
4895        .quad  0x0000000000000000, 0x0400000000000000
4896        .quad  0x0000000000000000, 0x0400000000000000

4898.align 64
4899byte_len_to_mask_table:
4900        .value      0x0000, 0x0001, 0x0003, 0x0007
4901        .value      0x000f, 0x001f, 0x003f, 0x007f
4902        .value      0x00ff, 0x01ff, 0x03ff, 0x07ff
4903        .value      0x0fff, 0x1fff, 0x3fff, 0x7fff
4904        .value      0xffff

4906.align 64
4907byte64_len_to_mask_table:
4908        .quad      0x0000000000000000, 0x0000000000000001
4909        .quad      0x0000000000000003, 0x0000000000000007
4910        .quad      0x000000000000000f, 0x000000000000001f
4911        .quad      0x000000000000003f, 0x000000000000007f
4912        .quad      0x00000000000000ff, 0x00000000000001ff
4913        .quad      0x00000000000003ff, 0x00000000000007ff
4914        .quad      0x0000000000000fff, 0x0000000000001fff
4915        .quad      0x0000000000003fff, 0x0000000000007fff
4916        .quad      0x000000000000ffff, 0x000000000001ffff
4917        .quad      0x000000000003ffff, 0x000000000007ffff
4918        .quad      0x00000000000fffff, 0x00000000001fffff
4919        .quad      0x00000000003fffff, 0x00000000007fffff
4920        .quad      0x0000000000ffffff, 0x0000000001ffffff
4921        .quad      0x0000000003ffffff, 0x0000000007ffffff
4922        .quad      0x000000000fffffff, 0x000000001fffffff
4923        .quad      0x000000003fffffff, 0x000000007fffffff
4924        .quad      0x00000000ffffffff, 0x00000001ffffffff
4925        .quad      0x00000003ffffffff, 0x00000007ffffffff
4926        .quad      0x0000000fffffffff, 0x0000001fffffffff
4927        .quad      0x0000003fffffffff, 0x0000007fffffffff
4928        .quad      0x000000ffffffffff, 0x000001ffffffffff
4929        .quad      0x000003ffffffffff, 0x000007ffffffffff
4930        .quad      0x00000fffffffffff, 0x00001fffffffffff
4931        .quad      0x00003fffffffffff, 0x00007fffffffffff
4932        .quad      0x0000ffffffffffff, 0x0001ffffffffffff
4933        .quad      0x0003ffffffffffff, 0x0007ffffffffffff
4934        .quad      0x000fffffffffffff, 0x001fffffffffffff
4935        .quad      0x003fffffffffffff, 0x007fffffffffffff
4936        .quad      0x00ffffffffffffff, 0x01ffffffffffffff
4937        .quad      0x03ffffffffffffff, 0x07ffffffffffffff
4938        .quad      0x0fffffffffffffff, 0x1fffffffffffffff
4939        .quad      0x3fffffffffffffff, 0x7fffffffffffffff
4940        .quad      0xffffffffffffffff
___
4942
4943} else {
4944# Fallback for old assembler: the capability probe returns 0 ("not capable")
4944# and every entry point traps with ud2 if it is ever reached.
4945$code .= <<___;
4946.text
4947.globl  ossl_vaes_vpclmulqdq_capable
4948.type   ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
4949ossl_vaes_vpclmulqdq_capable:
4950    xor     %eax,%eax
4951    ret
4952.size   ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable

4954.globl ossl_aes_gcm_init_avx512
4955.globl ossl_aes_gcm_setiv_avx512
4956.globl ossl_aes_gcm_update_aad_avx512
4957.globl ossl_aes_gcm_encrypt_avx512
4958.globl ossl_aes_gcm_decrypt_avx512
4959.globl ossl_aes_gcm_finalize_avx512
4960.globl ossl_gcm_gmult_avx512

4962.type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4963ossl_aes_gcm_init_avx512:
4964ossl_aes_gcm_setiv_avx512:
4965ossl_aes_gcm_update_aad_avx512:
4966ossl_aes_gcm_encrypt_avx512:
4967ossl_aes_gcm_decrypt_avx512:
4968ossl_aes_gcm_finalize_avx512:
4969ossl_gcm_gmult_avx512:
4970    .byte   0x0f,0x0b    # ud2
4971    ret
4972.size   ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
___
4974}

# Expand the `...` constant expressions embedded in the generated code,
# then emit the assembly on stdout for the perlasm translator.
4976$code =~ s/\`([^\`]*)\`/eval $1/gem;
4977print $code;
4978close STDOUT or die "error closing STDOUT: $!";
4979