1# Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved. 2# Copyright (c) 2021, Intel Corporation. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# 10# This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ) 11# from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1 12# (https://github.com/intel/intel-ipsec-mb). 13# Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>. 14# 15# References: 16# [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on 17# Intel Architecture Processors. August, 2010. 18# [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on 19# Intel Architecture Processors. October, 2012. 20# [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its 21# Usage for Computing the GCM Mode. May, 2010. 22# 23# 24# December 2021 25# 26# Initial release. 27# 28# GCM128_CONTEXT structure has storage for 16 hkeys only, but this 29# implementation can use up to 48. To avoid extending the context size, 30# precompute and store in the context first 16 hkeys only, and compute the rest 31# on demand keeping them in the local frame. 32# 33#====================================================================== 34# $output is the last argument if it looks like a file (it has an extension) 35# $flavour is the first argument if it doesn't look like a file 36$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 37$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 38 39$win64 = 0; 40$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 41 42$avx512vaes = 0; 43 44$0 =~ m/(.*[\/\\])[^\/\\]+$/; 45$dir = $1; 46($xlate = "${dir}x86_64-xlate.pl" and -f $xlate) 47 or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) 48 or die "can't locate x86_64-xlate.pl"; 49 50if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 51 $avx512vaes = ($1 >= 2.30); 52} 53 54if (!$avx512vaes 55 && $win64 56 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) 57 && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) 58{ 59 $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14); 60} 61 62if (!$avx512vaes && `$ENV{CC} -v 2>&1` 63 =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) { 64 my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001 65 if ($1) { 66 # Apple conditions, they use a different version series, see 67 # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2 68 # clang 7.0.0 is Apple clang 10.0.1 69 $avx512vaes = ($ver>=10.0001) 70 } else { 71 $avx512vaes = ($ver>=7.0); 72 } 73} 74 75open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"" 76 or die "can't call $xlate: $!"; 77*STDOUT = *OUT; 78 79#====================================================================== 80if ($avx512vaes>0) { #<<< 81 82$code .= <<___; 83.extern OPENSSL_ia32cap_P 84.globl ossl_vaes_vpclmulqdq_capable 85.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent 86.align 32 87ossl_vaes_vpclmulqdq_capable: 88 mov OPENSSL_ia32cap_P+8(%rip), %rcx 89 # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f 90 mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx 91 xor %eax,%eax 92 and %rdx,%rcx 93 cmp %rdx,%rcx 94 cmove %rcx,%rax 95 ret 96.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable 97___ 98 99# ; Mapping key length -> AES rounds count 100my %aes_rounds = ( 101 128 => 9, 102 192 => 11, 103 256 => 13); 104 105# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 106# ;;; Code generation control switches 107# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 108 109# ; ABI-aware zeroing of volatile registers in EPILOG(). 110# ; Disabled due to performance reasons. 111my $CLEAR_SCRATCH_REGISTERS = 0; 112 113# ; Zero HKeys storage from the stack if they are stored there 114my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1; 115 116# ; Enable / disable check of function arguments for null pointer 117# ; Currently disabled, as this check is handled outside. 118my $CHECK_FUNCTION_ARGUMENTS = 0; 119 120# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 121# ;;; Global constants 122# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 123 124# AES block size in bytes 125my $AES_BLOCK_SIZE = 16; 126 127# Storage capacity in elements 128my $HKEYS_STORAGE_CAPACITY = 48; 129my $LOCAL_STORAGE_CAPACITY = 48; 130my $HKEYS_CONTEXT_CAPACITY = 16; 131 132# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 133# ;;; Stack frame definition 134# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 135 136# (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs 137# (2) -> +8-byte space for 16-byte alignment of XMM storage 138# (3) -> Frame pointer (%RBP) 139# (4) -> +160-byte XMM storage (Windows only, zero on Linux) 140# (5) -> +48-byte space for 64-byte alignment of %RSP from p.8 141# (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions) 142# (7) -> +768-byte HKEYS storage 143# (8) -> Stack pointer (%RSP) aligned on 64-byte boundary 144 145my $GP_STORAGE = $win64 ? 8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack) 146my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers 147my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48 148my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks 149 150my $STACK_HKEYS_OFFSET = 0; 151my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE); 152 153# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 154# ;;; Function arguments abstraction 155# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 156my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11); 157 158# ; Counter used for assembly label generation 159my $label_count = 0; 160 161# ; This implementation follows the convention: for non-leaf functions (they 162# ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from 163# ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This 164# ; helps to facilitate SEH handlers writing. 165# 166# ; Leaf functions here do not use more than 4 input arguments. 167if ($win64) { 168 $arg1 = "%rcx"; 169 $arg2 = "%rdx"; 170 $arg3 = "%r8"; 171 $arg4 = "%r9"; 172 $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes 173 $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)"; 174 $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)"; 175 $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)"; 176 $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)"; 177 $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)"; 178 $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)"; 179} else { 180 $arg1 = "%rdi"; 181 $arg2 = "%rsi"; 182 $arg3 = "%rdx"; 183 $arg4 = "%rcx"; 184 $arg5 = "%r8"; 185 $arg6 = "%r9"; 186 $arg7 = "`$GP_STORAGE + 8*1`(%rbp)"; 187 $arg8 = "`$GP_STORAGE + 8*2`(%rbp)"; 188 $arg9 = "`$GP_STORAGE + 8*3`(%rbp)"; 189 $arg10 = "`$GP_STORAGE + 8*4`(%rbp)"; 190 $arg11 = "`$GP_STORAGE + 8*5`(%rbp)"; 191} 192 193# ; Offsets in gcm128_context structure (see include/crypto/modes.h) 194my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key 195my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer 196my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation) 197my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input 198my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted 199my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash 200my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values) 201 202# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 203# ;;; Helper functions 204# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 205 206sub BYTE { 207 my ($reg) = @_; 208 if ($reg =~ /%r[abcd]x/i) { 209 $reg =~ s/%r([abcd])x/%${1}l/i; 210 } elsif ($reg =~ /%r[sdb][ip]/i) { 211 $reg =~ s/%r([sdb][ip])/%${1}l/i; 212 } elsif ($reg =~ /%r[0-9]{1,2}/i) { 213 $reg =~ s/%(r[0-9]{1,2})/%${1}b/i; 214 } else { 215 die "BYTE: unknown register: $reg\n"; 216 } 217 return $reg; 218} 219 220sub WORD { 221 my ($reg) = @_; 222 if ($reg =~ /%r[abcdsdb][xip]/i) { 223 $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i; 224 } elsif ($reg =~ /%r[0-9]{1,2}/) { 225 $reg =~ s/%(r[0-9]{1,2})/%${1}w/i; 226 } else { 227 die "WORD: unknown register: $reg\n"; 228 } 229 return $reg; 230} 231 232sub DWORD { 233 my ($reg) = @_; 234 if ($reg =~ /%r[abcdsdb][xip]/i) { 235 $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i; 236 } elsif ($reg =~ /%r[0-9]{1,2}/i) { 237 $reg =~ s/%(r[0-9]{1,2})/%${1}d/i; 238 } else { 239 die "DWORD: unknown register: $reg\n"; 240 } 241 return $reg; 242} 243 244sub XWORD { 245 my ($reg) = @_; 246 if ($reg =~ /%[xyz]mm/i) { 247 $reg =~ s/%[xyz]mm/%xmm/i; 248 } else { 249 die "XWORD: unknown register: $reg\n"; 250 } 251 return $reg; 252} 253 254sub YWORD { 255 my ($reg) = @_; 256 if ($reg =~ /%[xyz]mm/i) { 257 $reg =~ s/%[xyz]mm/%ymm/i; 258 } else { 259 die "YWORD: unknown register: $reg\n"; 260 } 261 return $reg; 262} 263 264sub ZWORD { 265 my ($reg) = @_; 266 if ($reg =~ /%[xyz]mm/i) { 267 $reg =~ s/%[xyz]mm/%zmm/i; 268 } else { 269 die "ZWORD: unknown register: $reg\n"; 270 } 271 return $reg; 272} 273 274# ; Helper function to construct effective address based on two kinds of 275# ; offsets: numerical or located in the register 276sub EffectiveAddress { 277 my ($base, $offset, $displacement) = @_; 278 $displacement = 0 if (!$displacement); 279 280 if ($offset =~ /^\d+\z/) { # numerical offset 281 return "`$offset + $displacement`($base)"; 282 } else { # offset resides in register 283 return "$displacement($base,$offset,1)"; 284 } 285} 286 287# ; Provides memory location of corresponding HashKey power 288sub HashKeyByIdx { 289 my ($idx, $base) = @_; 290 my $base_str = ($base eq "%rsp") ? "frame" : "context"; 291 292 my $offset = &HashKeyOffsetByIdx($idx, $base_str); 293 return "$offset($base)"; 294} 295 296# ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage 297sub HashKeyOffsetByIdx { 298 my ($idx, $base) = @_; 299 die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base" 300 if (($base ne "frame") && ($base ne "context")); 301 302 my $offset_base; 303 my $offset_idx; 304 if ($base eq "frame") { # frame storage 305 die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1); 306 $offset_base = $STACK_HKEYS_OFFSET; 307 $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx)); 308 } else { # context storage 309 die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1); 310 $offset_base = $CTX_OFFSET_HTable; 311 $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx)); 312 } 313 return $offset_base + $offset_idx; 314} 315 316# ; Creates local frame and does back up of non-volatile registers. 317# ; Holds stack unwinding directives. 318sub PROLOG { 319 my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_; 320 321 my $DYNAMIC_STACK_ALLOC_SIZE = 0; 322 my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52; 323 324 if ($need_hkeys_stack_storage) { 325 $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE; 326 } 327 328 if ($need_aes_stack_storage) { 329 if (!$need_hkeys_stack_storage) { 330 die "PROLOG: unsupported case - aes storage without hkeys one"; 331 } 332 $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE; 333 } 334 335 $code .= <<___; 336 push %rbx 337.cfi_push %rbx 338.L${func_name}_seh_push_rbx: 339 push %rbp 340.cfi_push %rbp 341.L${func_name}_seh_push_rbp: 342 push %r12 343.cfi_push %r12 344.L${func_name}_seh_push_r12: 345 push %r13 346.cfi_push %r13 347.L${func_name}_seh_push_r13: 348 push %r14 349.cfi_push %r14 350.L${func_name}_seh_push_r14: 351 push %r15 352.cfi_push %r15 353.L${func_name}_seh_push_r15: 354___ 355 356 if ($win64) { 357 $code .= <<___; 358 push %rdi 359.L${func_name}_seh_push_rdi: 360 push %rsi 361.L${func_name}_seh_push_rsi: 362 363 sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment 364.L${func_name}_seh_allocstack_xmm: 365___ 366 } 367 $code .= <<___; 368 # ; %rbp contains stack pointer right after GP regs pushed at stack + [8 369 # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH 370 # ; handlers. The requirement for a frame pointer is that its offset from 371 # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer 372 # ; itself seems to be reasonable to use here, because later we do 64-byte stack 373 # ; alignment which gives us non-determinate offsets and complicates writing 374 # ; SEH handlers. 375 # 376 # ; It also serves as an anchor for retrieving stack arguments on both Linux 377 # ; and Windows. 378 lea `$XMM_STORAGE`(%rsp),%rbp 379.cfi_def_cfa_register %rbp 380.L${func_name}_seh_setfp: 381___ 382 if ($win64) { 383 384 # ; xmm6:xmm15 need to be preserved on Windows 385 foreach my $reg_idx (6 .. 15) { 386 my $xmm_reg_offset = ($reg_idx - 6) * 16; 387 $code .= <<___; 388 vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp) 389.L${func_name}_seh_save_xmm${reg_idx}: 390___ 391 } 392 } 393 394 $code .= <<___; 395# Prolog ends here. Next stack allocation is treated as "dynamic". 396.L${func_name}_seh_prolog_end: 397___ 398 399 if ($DYNAMIC_STACK_ALLOC_SIZE) { 400 $code .= <<___; 401 sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp 402 and \$(-64),%rsp 403___ 404 } 405} 406 407# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 408# ;;; Restore register content for the caller. 409# ;;; And cleanup stack. 410# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 411sub EPILOG { 412 my ($hkeys_storage_on_stack, $payload_len) = @_; 413 414 my $label_suffix = $label_count++; 415 416 if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) { 417 418 # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys 419 # ; were stored in the local frame storage 420 $code .= <<___; 421 cmpq \$`16*16`,$payload_len 422 jbe .Lskip_hkeys_cleanup_${label_suffix} 423 vpxor %xmm0,%xmm0,%xmm0 424___ 425 for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) { 426 $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n"; 427 } 428 $code .= ".Lskip_hkeys_cleanup_${label_suffix}:\n"; 429 } 430 431 if ($CLEAR_SCRATCH_REGISTERS) { 432 &clear_scratch_gps_asm(); 433 &clear_scratch_zmms_asm(); 434 } else { 435 $code .= "vzeroupper\n"; 436 } 437 438 if ($win64) { 439 440 # ; restore xmm15:xmm6 441 for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) { 442 my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16; 443 $code .= <<___; 444 vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}, 445___ 446 } 447 } 448 449 if ($win64) { 450 451 # Forming valid epilog for SEH with use of frame pointer. 452 # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code 453 $code .= "lea 8(%rbp),%rsp\n"; 454 } else { 455 $code .= "lea (%rbp),%rsp\n"; 456 $code .= ".cfi_def_cfa_register %rsp\n"; 457 } 458 459 if ($win64) { 460 $code .= <<___; 461 pop %rsi 462.cfi_pop %rsi 463 pop %rdi 464.cfi_pop %rdi 465___ 466 } 467 $code .= <<___; 468 pop %r15 469.cfi_pop %r15 470 pop %r14 471.cfi_pop %r14 472 pop %r13 473.cfi_pop %r13 474 pop %r12 475.cfi_pop %r12 476 pop %rbp 477.cfi_pop %rbp 478 pop %rbx 479.cfi_pop %rbx 480___ 481} 482 483# ; Clears all scratch ZMM registers 484# ; 485# ; It should be called before restoring the XMM registers 486# ; for Windows (XMM6-XMM15). 487# ; 488sub clear_scratch_zmms_asm { 489 490 # ; On Linux, all ZMM registers are scratch registers 491 if (!$win64) { 492 $code .= "vzeroall\n"; 493 } else { 494 foreach my $i (0 .. 5) { 495 $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n"; 496 } 497 } 498 foreach my $i (16 .. 31) { 499 $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n"; 500 } 501} 502 503# Clears all scratch GP registers 504sub clear_scratch_gps_asm { 505 foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") { 506 $code .= "xor $reg,$reg\n"; 507 } 508 if (!$win64) { 509 foreach my $reg ("%rsi", "%rdi") { 510 $code .= "xor $reg,$reg\n"; 511 } 512 } 513} 514 515sub precompute_hkeys_on_stack { 516 my $GCM128_CTX = $_[0]; 517 my $HKEYS_READY = $_[1]; 518 my $ZTMP0 = $_[2]; 519 my $ZTMP1 = $_[3]; 520 my $ZTMP2 = $_[4]; 521 my $ZTMP3 = $_[5]; 522 my $ZTMP4 = $_[6]; 523 my $ZTMP5 = $_[7]; 524 my $ZTMP6 = $_[8]; 525 my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32" 526 527 die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE" 528 if ($HKEYS_RANGE ne "first16" 529 && $HKEYS_RANGE ne "mid16" 530 && $HKEYS_RANGE ne "all" 531 && $HKEYS_RANGE ne "first32" 532 && $HKEYS_RANGE ne "last32"); 533 534 my $label_suffix = $label_count++; 535 536 $code .= <<___; 537 test $HKEYS_READY,$HKEYS_READY 538 jnz .L_skip_hkeys_precomputation_${label_suffix} 539___ 540 541 if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") { 542 543 # ; Fill the stack with the first 16 hkeys from the context 544 $code .= <<___; 545 # ; Move 16 hkeys from the context to stack 546 vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0 547 vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]} 548 549 vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1 550 vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]} 551 552 # ; broadcast HashKey^8 553 vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1 554 555 vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2 556 vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]} 557 558 vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3 559 vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]} 560___ 561 } 562 563 if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") { 564 $code .= <<___; 565 vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1 566 567 # ; broadcast HashKey^8 568 vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1 569 570 vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2 571 vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3 572___ 573 574 } 575 576 if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") { 577 578 # ; Precompute hkeys^i, i=17..32 579 my $i = 20; 580 foreach (1 .. int((32 - 16) / 8)) { 581 582 # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n) 583 &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); 584 $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; 585 $i += 4; 586 587 # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n) 588 &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); 589 $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; 590 $i += 4; 591 } 592 } 593 594 if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") { 595 596 # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48) 597 my $i = 36; 598 foreach (1 .. int((48 - 32) / 8)) { 599 600 # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n) 601 &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); 602 $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; 603 $i += 4; 604 605 # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n) 606 &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6); 607 $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n"; 608 $i += 4; 609 } 610 } 611 612 $code .= ".L_skip_hkeys_precomputation_${label_suffix}:\n"; 613} 614 615# ;; ============================================================================= 616# ;; Generic macro to produce code that executes $OPCODE instruction 617# ;; on selected number of AES blocks (16 bytes long ) between 0 and 16. 618# ;; All three operands of the instruction come from registers. 619# ;; Note: if 3 blocks are left at the end instruction is produced to operate all 620# ;; 4 blocks (full width of ZMM) 621sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 { 622 my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16) 623 my $OPCODE = $_[1]; # [in] instruction name 624 my @DST; 625 $DST[0] = $_[2]; # [out] destination ZMM register 626 $DST[1] = $_[3]; # [out] destination ZMM register 627 $DST[2] = $_[4]; # [out] destination ZMM register 628 $DST[3] = $_[5]; # [out] destination ZMM register 629 my @SRC1; 630 $SRC1[0] = $_[6]; # [in] source 1 ZMM register 631 $SRC1[1] = $_[7]; # [in] source 1 ZMM register 632 $SRC1[2] = $_[8]; # [in] source 1 ZMM register 633 $SRC1[3] = $_[9]; # [in] source 1 ZMM register 634 my @SRC2; 635 $SRC2[0] = $_[10]; # [in] source 2 ZMM register 636 $SRC2[1] = $_[11]; # [in] source 2 ZMM register 637 $SRC2[2] = $_[12]; # [in] source 2 ZMM register 638 $SRC2[3] = $_[13]; # [in] source 2 ZMM register 639 640 die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n" 641 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); 642 643 my $reg_idx = 0; 644 my $blocks_left = $NUM_BLOCKS; 645 646 foreach (1 .. ($NUM_BLOCKS / 4)) { 647 $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n"; 648 $reg_idx++; 649 $blocks_left -= 4; 650 } 651 652 my $DSTREG = $DST[$reg_idx]; 653 my $SRC1REG = $SRC1[$reg_idx]; 654 my $SRC2REG = $SRC2[$reg_idx]; 655 656 if ($blocks_left == 1) { 657 $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n"; 658 } elsif ($blocks_left == 2) { 659 $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n"; 660 } elsif ($blocks_left == 3) { 661 $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n"; 662 } 663} 664 665# ;; ============================================================================= 666# ;; Loads specified number of AES blocks into ZMM registers using mask register 667# ;; for the last loaded register (xmm, ymm or zmm). 668# ;; Loads take place at 1 byte granularity. 669sub ZMM_LOAD_MASKED_BLOCKS_0_16 { 670 my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16) 671 my $INP = $_[1]; # [in] input data pointer to read from 672 my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical) 673 my @DST; 674 $DST[0] = $_[3]; # [out] ZMM register with loaded data 675 $DST[1] = $_[4]; # [out] ZMM register with loaded data 676 $DST[2] = $_[5]; # [out] ZMM register with loaded data 677 $DST[3] = $_[6]; # [out] ZMM register with loaded data 678 my $MASK = $_[7]; # [in] mask register 679 680 die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n" 681 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); 682 683 my $src_offset = 0; 684 my $dst_idx = 0; 685 my $blocks_left = $NUM_BLOCKS; 686 687 if ($NUM_BLOCKS > 0) { 688 foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) { 689 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n"; 690 $src_offset += 64; 691 $dst_idx++; 692 $blocks_left -= 4; 693 } 694 } 695 696 my $DSTREG = $DST[$dst_idx]; 697 698 if ($blocks_left == 1) { 699 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n"; 700 } elsif ($blocks_left == 2) { 701 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n"; 702 } elsif (($blocks_left == 3 || $blocks_left == 4)) { 703 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n"; 704 } 705} 706 707# ;; ============================================================================= 708# ;; Stores specified number of AES blocks from ZMM registers with mask register 709# ;; for the last loaded register (xmm, ymm or zmm). 710# ;; Stores take place at 1 byte granularity. 711sub ZMM_STORE_MASKED_BLOCKS_0_16 { 712 my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16) 713 my $OUTP = $_[1]; # [in] output data pointer to write to 714 my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical) 715 my @SRC; 716 $SRC[0] = $_[3]; # [in] ZMM register with data to store 717 $SRC[1] = $_[4]; # [in] ZMM register with data to store 718 $SRC[2] = $_[5]; # [in] ZMM register with data to store 719 $SRC[3] = $_[6]; # [in] ZMM register with data to store 720 my $MASK = $_[7]; # [in] mask register 721 722 die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n" 723 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); 724 725 my $dst_offset = 0; 726 my $src_idx = 0; 727 my $blocks_left = $NUM_BLOCKS; 728 729 if ($NUM_BLOCKS > 0) { 730 foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) { 731 $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n"; 732 $dst_offset += 64; 733 $src_idx++; 734 $blocks_left -= 4; 735 } 736 } 737 738 my $SRCREG = $SRC[$src_idx]; 739 740 if ($blocks_left == 1) { 741 $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n"; 742 } elsif ($blocks_left == 2) { 743 $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n"; 744 } elsif ($blocks_left == 3 || $blocks_left == 4) { 745 $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n"; 746 } 747} 748 749# ;;; =========================================================================== 750# ;;; Handles AES encryption rounds 751# ;;; It handles special cases: the last and first rounds 752# ;;; Optionally, it performs XOR with data after the last AES round. 753# ;;; Uses NROUNDS parameter to check what needs to be done for the current round. 754# ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks). 755sub ZMM_AESENC_ROUND_BLOCKS_0_16 { 756 my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3 757 my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7 758 my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11 759 my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15 760 my $KEY = $_[4]; # [in] zmm containing round key 761 my $ROUND = $_[5]; # [in] round number 762 my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3 763 my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7 764 my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11 765 my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15 766 my $NUMBL = $_[10]; # [in] number of blocks; numerical value 767 my $NROUNDS = $_[11]; # [in] number of rounds; numerical value 768 769 # ;;; === first AES round 770 if ($ROUND < 1) { 771 772 # ;; round 0 773 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 774 $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, 775 $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY); 776 } 777 778 # ;;; === middle AES rounds 779 if ($ROUND >= 1 && $ROUND <= $NROUNDS) { 780 781 # ;; rounds 1 to 9/11/13 782 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 783 $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, 784 $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY); 785 } 786 787 # ;;; === last AES round 788 if ($ROUND > $NROUNDS) { 789 790 # ;; the last round - mix enclast with text xor's 791 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 792 $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, 793 $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY); 794 795 # ;;; === XOR with data 796 if ( ($D0_3 ne "no_data") 797 && ($D4_7 ne "no_data") 798 && ($D8_11 ne "no_data") 799 && ($D12_15 ne "no_data")) 800 { 801 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 802 $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3, 803 $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15); 804 } 805 } 806} 807 808# ;;; Horizontal XOR - 4 x 128bits xored together 809sub VHPXORI4x128 { 810 my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output 811 my $TMP = $_[1]; # [clobbered] ZMM temporary register 812 $code .= <<___; 813 vextracti64x4 \$1,$REG,@{[YWORD($TMP)]} 814 vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]} 815 vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]} 816 vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]} 817___ 818} 819 820# ;;; AVX512 reduction macro 821sub VCLMUL_REDUCE { 822 my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128) 823 my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial 824 my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce 825 my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce 826 my $TMP0 = $_[4]; # [in] zmm/ymm/xmm: temporary register 827 my $TMP1 = $_[5]; # [in] zmm/ymm/xmm: temporary register 828 829 $code .= <<___; 830 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 831 # ;; first phase of the reduction 832 vpclmulqdq \$0x01,$LO128,$POLY,$TMP0 833 vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs 834 vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete 835 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 836 # ;; second phase of the reduction 837 vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1 838 vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R 839 vpclmulqdq \$0x10,$TMP0,$POLY,$OUT 840 vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts 841 vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128 842 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 843___ 844} 845 846# ;; =========================================================================== 847# ;; schoolbook multiply of 16 blocks (16 x 16 bytes) 848# ;; - it is assumed that data read from $INPTR is already shuffled and 849# ;; $INPTR address is 64 byte aligned 850# ;; - there is an option to pass ready blocks through ZMM registers too. 851# ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty 852sub GHASH_16 { 853 my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction), 854 # end_reduce (end with reduction), start_reduce 855 my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits 856 my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits 857 my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits 858 my $INPTR = $_[4]; # [in] data input pointer 859 my $INOFF = $_[5]; # [in] data input offset 860 my $INDIS = $_[6]; # [in] data input displacement 861 my $HKPTR = $_[7]; # [in] hash key pointer 862 my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset) 863 my $HKDIS = $_[9]; # [in] hash key displacement 864 my $HASH = $_[10]; # [in/out] ZMM hash value in/out 865 my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM 866 my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM 867 my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM 868 my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM 869 my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM 870 my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM 871 my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM 872 my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM 873 my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM 874 my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided 875 my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) 876 my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) 877 my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) 878 my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused) 879 880 my $start_ghash = 0; 881 my $do_reduction = 0; 882 if ($TYPE eq "start") { 883 $start_ghash = 1; 884 } 885 886 if ($TYPE eq "start_reduce") { 887 $start_ghash = 1; 888 $do_reduction = 1; 889 } 890 891 if ($TYPE eq "end_reduce") { 892 $do_reduction = 1; 893 } 894 895 # ;; ghash blocks 0-3 896 if (scalar(@_) == 21) { 897 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n"; 898 } else { 899 $ZTMP9 = $DAT0; 900 } 901 902 if ($start_ghash != 0) { 903 $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n"; 904 } 905 $code .= <<___; 906 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8 907 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 908 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 909 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 910 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 911___ 912 913 # ;; ghash blocks 4-7 914 if (scalar(@_) == 21) { 915 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n"; 916 } else { 917 $ZTMP9 = $DAT1; 918 } 919 $code .= <<___; 920 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8 921 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 922 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 923 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 924 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 925___ 926 927 # ;; update sums 928 if ($start_ghash != 0) { 929 $code .= <<___; 930 vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1 931 vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H 932 vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L 933 vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1 934___ 935 } else { # ;; mid, end, end_reduce 936 $code .= <<___; 937 vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 938 vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H 939 vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L 940 vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 941___ 942 } 943 944 # ;; ghash blocks 8-11 945 if (scalar(@_) == 21) { 946 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n"; 947 } else { 948 $ZTMP9 = $DAT2; 949 } 950 $code .= <<___; 951 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8 952 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1 953 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0 954 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0 955 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1 956___ 957 958 # ;; ghash blocks 12-15 959 if (scalar(@_) == 21) { 960 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n"; 961 } else { 962 $ZTMP9 = $DAT3; 963 } 964 $code .= <<___; 965 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8 966 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1 967 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0 968 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0 969 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1 970 # ;; update sums 971 vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1 972 vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H 973 vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L 974 vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1 975___ 976 if ($do_reduction != 0) { 977 $code .= <<___; 978 # ;; integrate GM into GH and GL 979 vpsrldq \$8,$GM,$ZTMP0 980 vpslldq \$8,$GM,$ZTMP1 981 vpxorq $ZTMP0,$GH,$GH 982 vpxorq $ZTMP1,$GL,$GL 983___ 984 985 # ;; add GH and GL 128-bit words horizontally 986 &VHPXORI4x128($GH, $ZTMP0); 987 &VHPXORI4x128($GL, $ZTMP1); 988 989 # ;; reduction 990 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n"; 991 &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1)); 992 } 993} 994 995# ;; =========================================================================== 996# ;; GHASH 1 to 16 blocks of cipher text 997# ;; - performs reduction at the end 998# ;; - it doesn't load the data and it assumed it is already loaded and shuffled 999sub GHASH_1_TO_16 { 1000 my $GCM128_CTX = $_[0]; # [in] pointer to expanded keys 1001 my $GHASH = $_[1]; # [out] ghash output 1002 my $T0H = $_[2]; # [clobbered] temporary ZMM 1003 my $T0L = $_[3]; # [clobbered] temporary ZMM 1004 my $T0M1 = $_[4]; # [clobbered] temporary ZMM 1005 my $T0M2 = $_[5]; # [clobbered] temporary ZMM 1006 my $T1H = $_[6]; # [clobbered] temporary ZMM 1007 my $T1L = $_[7]; # [clobbered] temporary ZMM 1008 my $T1M1 = $_[8]; # [clobbered] temporary ZMM 1009 my $T1M2 = $_[9]; # [clobbered] temporary ZMM 1010 my $HK = $_[10]; # [clobbered] temporary ZMM 1011 my $AAD_HASH_IN = $_[11]; # [in] input hash value 1012 my @CIPHER_IN; 1013 $CIPHER_IN[0] = $_[12]; # [in] ZMM with cipher text blocks 0-3 1014 $CIPHER_IN[1] = $_[13]; # [in] ZMM with cipher text blocks 4-7 1015 $CIPHER_IN[2] = $_[14]; # [in] ZMM with cipher text blocks 8-11 1016 $CIPHER_IN[3] = $_[15]; # [in] ZMM with cipher text blocks 12-15 1017 my $NUM_BLOCKS = $_[16]; # [in] numerical value, number of blocks 1018 my $GH = $_[17]; # [in] ZMM with hi product part 1019 my $GM = $_[18]; # [in] ZMM with mid product part 1020 my $GL = $_[19]; # [in] ZMM with lo product part 1021 1022 die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); 1023 1024 if (scalar(@_) == 17) { 1025 $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n"; 1026 } 1027 1028 if ($NUM_BLOCKS == 16) { 1029 $code .= <<___; 1030 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK 1031 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1 1032 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0 1033 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0 1034 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1 1035 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK 1036 vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1 1037 vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0 1038 vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0 1039 vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1 1040 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK 1041 vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1 1042 vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0 1043 vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H 1044 vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L 1045 vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0 1046 vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1 1047 vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1 1048 vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2 1049 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK 1050 vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H # ; H = a1*b1 1051 vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L # ; L = a0*b0 1052 vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1 # ; M1 = a1*b0 1053 vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2 # ; M2 = a0*b1 1054 vpxorq $T1H,$T0H,$T1H 1055 vpxorq $T1L,$T0L,$T1L 1056 vpxorq $T1M1,$T0M1,$T1M1 1057 vpxorq $T1M2,$T0M2,$T1M2 1058___ 1059 } elsif ($NUM_BLOCKS >= 12) { 1060 $code .= <<___; 1061 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK 1062 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1 1063 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0 1064 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0 1065 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1 1066 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK 1067 vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1 1068 vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0 1069 vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0 1070 vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1 1071 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK 1072 vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1 1073 vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0 1074 vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H 1075 vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L 1076 vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0 1077 vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1 1078 vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1 1079 vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2 1080___ 1081 } elsif ($NUM_BLOCKS >= 8) { 1082 $code .= <<___; 1083 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK 1084 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1 1085 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0 1086 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0 1087 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1 1088 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK 1089 vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1 1090 vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0 1091 vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0 1092 vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1 1093 vpxorq $T1H,$T0H,$T1H 1094 vpxorq $T1L,$T0L,$T1L 1095 vpxorq $T1M1,$T0M1,$T1M1 1096 vpxorq $T1M2,$T0M2,$T1M2 1097___ 1098 } elsif ($NUM_BLOCKS >= 4) { 1099 $code .= <<___; 1100 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK 1101 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H # ; H = a1*b1 1102 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L # ; L = a0*b0 1103 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1 # ; M1 = a1*b0 1104 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2 # ; M2 = a0*b1 1105___ 1106 } 1107 1108 # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4) 1109 my $blocks_left = ($NUM_BLOCKS % 4); 1110 if ($blocks_left > 0) { 1111 1112 # ;; ===================================================== 1113 # ;; There are 1, 2 or 3 blocks left to process. 1114 # ;; It may also be that they are the only blocks to process. 1115 1116 # ;; Set hash key and register index position for the remaining 1 to 3 blocks 1117 my $reg_idx = ($NUM_BLOCKS / 4); 1118 my $REG_IN = $CIPHER_IN[$reg_idx]; 1119 1120 if ($blocks_left == 1) { 1121 $code .= <<___; 1122 vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]} 1123 vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]} # ; M1 = a1*b0 1124 vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]} # ; M2 = a0*b1 1125 vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]} # ; H = a1*b1 1126 vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]} # ; L = a0*b0 1127___ 1128 } elsif ($blocks_left == 2) { 1129 $code .= <<___; 1130 vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]} 1131 vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]} # ; M1 = a1*b0 1132 vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]} # ; M2 = a0*b1 1133 vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]} # ; H = a1*b1 1134 vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]} # ; L = a0*b0 1135___ 1136 } else { # ; blocks_left == 3 1137 $code .= <<___; 1138 vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]} 1139 vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK 1140 vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1 # ; M1 = a1*b0 1141 vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2 # ; M2 = a0*b1 1142 vpclmulqdq \$0x11,$HK,$REG_IN,$T0H # ; H = a1*b1 1143 vpclmulqdq \$0x00,$HK,$REG_IN,$T0L # ; L = a0*b0 1144___ 1145 } 1146 1147 if (scalar(@_) == 20) { 1148 1149 # ;; *** GH/GM/GL passed as arguments 1150 if ($NUM_BLOCKS >= 4) { 1151 $code .= <<___; 1152 # ;; add ghash product sums from the first 4, 8 or 12 blocks 1153 vpxorq $T1M1,$T0M1,$T0M1 1154 vpternlogq \$0x96,$T1M2,$GM,$T0M2 1155 vpternlogq \$0x96,$T1H,$GH,$T0H 1156 vpternlogq \$0x96,$T1L,$GL,$T0L 1157___ 1158 } else { 1159 $code .= <<___; 1160 vpxorq $GM,$T0M1,$T0M1 1161 vpxorq $GH,$T0H,$T0H 1162 vpxorq $GL,$T0L,$T0L 1163___ 1164 } 1165 } else { 1166 1167 # ;; *** GH/GM/GL NOT passed as arguments 1168 if ($NUM_BLOCKS >= 4) { 1169 $code .= <<___; 1170 # ;; add ghash product sums from the first 4, 8 or 12 blocks 1171 vpxorq $T1M1,$T0M1,$T0M1 1172 vpxorq $T1M2,$T0M2,$T0M2 1173 vpxorq $T1H,$T0H,$T0H 1174 vpxorq $T1L,$T0L,$T0L 1175___ 1176 } 1177 } 1178 $code .= <<___; 1179 # ;; integrate TM into TH and TL 1180 vpxorq $T0M2,$T0M1,$T0M1 1181 vpsrldq \$8,$T0M1,$T1M1 1182 vpslldq \$8,$T0M1,$T1M2 1183 vpxorq $T1M1,$T0H,$T0H 1184 vpxorq $T1M2,$T0L,$T0L 1185___ 1186 } else { 1187 1188 # ;; ===================================================== 1189 # ;; number of blocks is 4, 8, 12 or 16 1190 # ;; T1H/L/M1/M2 include product sums not T0H/L/M1/M2 1191 if (scalar(@_) == 20) { 1192 $code .= <<___; 1193 # ;; *** GH/GM/GL passed as arguments 1194 vpxorq $GM,$T1M1,$T1M1 1195 vpxorq $GH,$T1H,$T1H 1196 vpxorq $GL,$T1L,$T1L 1197___ 1198 } 1199 $code .= <<___; 1200 # ;; integrate TM into TH and TL 1201 vpxorq $T1M2,$T1M1,$T1M1 1202 vpsrldq \$8,$T1M1,$T0M1 1203 vpslldq \$8,$T1M1,$T0M2 1204 vpxorq $T0M1,$T1H,$T0H 1205 vpxorq $T0M2,$T1L,$T0L 1206___ 1207 } 1208 1209 # ;; add TH and TL 128-bit words horizontally 1210 &VHPXORI4x128($T0H, $T1M1); 1211 &VHPXORI4x128($T0L, $T1M2); 1212 1213 # ;; reduction 1214 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n"; 1215 &VCLMUL_REDUCE( 1216 @{[XWORD($GHASH)]}, 1217 @{[XWORD($HK)]}, 1218 @{[XWORD($T0H)]}, 1219 @{[XWORD($T0L)]}, 1220 @{[XWORD($T0M1)]}, 1221 @{[XWORD($T0M2)]}); 1222} 1223 1224# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1225# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 +x^121 + 1) 1226# ;; Input: A and B (128-bits each, bit-reflected) 1227# ;; Output: C = A*B*x mod poly, (i.e. >>1 ) 1228# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input 1229# ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 1230# ;; 1231# ;; Refer to [3] for more details. 1232# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1233sub GHASH_MUL { 1234 my $GH = $_[0]; #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits) 1235 my $HK = $_[1]; #; [in] xmm/ymm/zmm with hash key value(s) (128-bits) 1236 my $T1 = $_[2]; #; [clobbered] xmm/ymm/zmm 1237 my $T2 = $_[3]; #; [clobbered] xmm/ymm/zmm 1238 my $T3 = $_[4]; #; [clobbered] xmm/ymm/zmm 1239 1240 $code .= <<___; 1241 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1242 vpclmulqdq \$0x11,$HK,$GH,$T1 # ; $T1 = a1*b1 1243 vpclmulqdq \$0x00,$HK,$GH,$T2 # ; $T2 = a0*b0 1244 vpclmulqdq \$0x01,$HK,$GH,$T3 # ; $T3 = a1*b0 1245 vpclmulqdq \$0x10,$HK,$GH,$GH # ; $GH = a0*b1 1246 vpxorq $T3,$GH,$GH 1247 1248 vpsrldq \$8,$GH,$T3 # ; shift-R $GH 2 DWs 1249 vpslldq \$8,$GH,$GH # ; shift-L $GH 2 DWs 1250 vpxorq $T3,$T1,$T1 1251 vpxorq $T2,$GH,$GH 1252 1253 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1254 # ;first phase of the reduction 1255 vmovdqu64 POLY2(%rip),$T3 1256 1257 vpclmulqdq \$0x01,$GH,$T3,$T2 1258 vpslldq \$8,$T2,$T2 # ; shift-L $T2 2 DWs 1259 vpxorq $T2,$GH,$GH # ; first phase of the reduction complete 1260 1261 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1262 # ;second phase of the reduction 1263 vpclmulqdq \$0x00,$GH,$T3,$T2 1264 vpsrldq \$4,$T2,$T2 # ; shift-R only 1-DW to obtain 2-DWs shift-R 1265 vpclmulqdq \$0x10,$GH,$T3,$GH 1266 vpslldq \$4,$GH,$GH # ; Shift-L 1-DW to obtain result with no shifts 1267 # ; second phase of the reduction complete, the result is in $GH 1268 vpternlogq \$0x96,$T2,$T1,$GH # ; GH = GH xor T1 xor T2 1269 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1270___ 1271} 1272 1273# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1274# ;;; PRECOMPUTE computes HashKey_i 1275sub PRECOMPUTE { 1276 my $GCM128_CTX = $_[0]; #; [in/out] context pointer, hkeys content updated 1277 my $HK = $_[1]; #; [in] xmm, hash key 1278 my $T1 = $_[2]; #; [clobbered] xmm 1279 my $T2 = $_[3]; #; [clobbered] xmm 1280 my $T3 = $_[4]; #; [clobbered] xmm 1281 my $T4 = $_[5]; #; [clobbered] xmm 1282 my $T5 = $_[6]; #; [clobbered] xmm 1283 my $T6 = $_[7]; #; [clobbered] xmm 1284 1285 my $ZT1 = &ZWORD($T1); 1286 my $ZT2 = &ZWORD($T2); 1287 my $ZT3 = &ZWORD($T3); 1288 my $ZT4 = &ZWORD($T4); 1289 my $ZT5 = &ZWORD($T5); 1290 my $ZT6 = &ZWORD($T6); 1291 1292 my $YT1 = &YWORD($T1); 1293 my $YT2 = &YWORD($T2); 1294 my $YT3 = &YWORD($T3); 1295 my $YT4 = &YWORD($T4); 1296 my $YT5 = &YWORD($T5); 1297 my $YT6 = &YWORD($T6); 1298 1299 $code .= <<___; 1300 vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5 1301 vmovdqa $YT5,$YT4 1302___ 1303 1304 # ;; calculate HashKey^2<<1 mod poly 1305 &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3); 1306 1307 $code .= <<___; 1308 vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]} 1309 vinserti64x2 \$1,$HK,$YT4,$YT5 1310 vmovdqa64 $YT5,$YT6 # ;; YT6 = HashKey | HashKey^2 1311___ 1312 1313 # ;; use 2x128-bit computation 1314 # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly 1315 &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3); # ;; YT5 = HashKey^3 | HashKey^4 1316 1317 $code .= <<___; 1318 vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]} 1319 1320 vinserti64x4 \$1,$YT6,$ZT5,$ZT5 # ;; ZT5 = YT6 | YT5 1321 1322 # ;; switch to 4x128-bit computations now 1323 vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^4 across all ZT4 1324 vmovdqa64 $ZT5,$ZT6 # ;; save HashKey^4 to HashKey^1 in ZT6 1325___ 1326 1327 # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly 1328 &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3); 1329 $code .= <<___; 1330 vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]} # ;; HashKey^8 to HashKey^5 in ZT5 now 1331 vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^8 across all ZT4 1332___ 1333 1334 # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly 1335 # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution 1336 1337 # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9) 1338 &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3); 1339 $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n"; 1340 1341 # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13) 1342 &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3); 1343 $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n"; 1344 1345 # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys 1346} 1347 1348# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1349# ;; READ_SMALL_DATA_INPUT 1350# ;; Packs xmm register with data when data input is less or equal to 16 bytes 1351# ;; Returns 0 if data has length 0 1352# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1353sub READ_SMALL_DATA_INPUT { 1354 my $OUTPUT = $_[0]; # [out] xmm register 1355 my $INPUT = $_[1]; # [in] buffer pointer to read from 1356 my $LENGTH = $_[2]; # [in] number of bytes to read 1357 my $TMP1 = $_[3]; # [clobbered] 1358 my $TMP2 = $_[4]; # [clobbered] 1359 my $MASK = $_[5]; # [out] k1 to k7 register to store the partial block mask 1360 1361 $code .= <<___; 1362 mov \$16,@{[DWORD($TMP2)]} 1363 lea byte_len_to_mask_table(%rip),$TMP1 1364 cmp $TMP2,$LENGTH 1365 cmovc $LENGTH,$TMP2 1366___ 1367 if ($win64) { 1368 $code .= <<___; 1369 add $TMP2,$TMP1 1370 add $TMP2,$TMP1 1371 kmovw ($TMP1),$MASK 1372___ 1373 } else { 1374 $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n"; 1375 } 1376 $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n"; 1377} 1378 1379# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1380# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 1381# Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). 1382# Output: The hash of the data (AAD_HASH). 1383# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1384sub CALC_AAD_HASH { 1385 my $A_IN = $_[0]; # [in] AAD text pointer 1386 my $A_LEN = $_[1]; # [in] AAD length 1387 my $AAD_HASH = $_[2]; # [in/out] xmm ghash value 1388 my $GCM128_CTX = $_[3]; # [in] pointer to context 1389 my $ZT0 = $_[4]; # [clobbered] ZMM register 1390 my $ZT1 = $_[5]; # [clobbered] ZMM register 1391 my $ZT2 = $_[6]; # [clobbered] ZMM register 1392 my $ZT3 = $_[7]; # [clobbered] ZMM register 1393 my $ZT4 = $_[8]; # [clobbered] ZMM register 1394 my $ZT5 = $_[9]; # [clobbered] ZMM register 1395 my $ZT6 = $_[10]; # [clobbered] ZMM register 1396 my $ZT7 = $_[11]; # [clobbered] ZMM register 1397 my $ZT8 = $_[12]; # [clobbered] ZMM register 1398 my $ZT9 = $_[13]; # [clobbered] ZMM register 1399 my $ZT10 = $_[14]; # [clobbered] ZMM register 1400 my $ZT11 = $_[15]; # [clobbered] ZMM register 1401 my $ZT12 = $_[16]; # [clobbered] ZMM register 1402 my $ZT13 = $_[17]; # [clobbered] ZMM register 1403 my $ZT14 = $_[18]; # [clobbered] ZMM register 1404 my $ZT15 = $_[19]; # [clobbered] ZMM register 1405 my $ZT16 = $_[20]; # [clobbered] ZMM register 1406 my $T1 = $_[21]; # [clobbered] GP register 1407 my $T2 = $_[22]; # [clobbered] GP register 1408 my $T3 = $_[23]; # [clobbered] GP register 1409 my $MASKREG = $_[24]; # [clobbered] mask register 1410 1411 my $HKEYS_READY = "%rbx"; 1412 1413 my $SHFMSK = $ZT13; 1414 1415 my $label_suffix = $label_count++; 1416 1417 $code .= <<___; 1418 mov $A_IN,$T1 # ; T1 = AAD 1419 mov $A_LEN,$T2 # ; T2 = aadLen 1420 or $T2,$T2 1421 jz .L_CALC_AAD_done_${label_suffix} 1422 1423 xor $HKEYS_READY,$HKEYS_READY 1424 vmovdqa64 SHUF_MASK(%rip),$SHFMSK 1425 1426.L_get_AAD_loop48x16_${label_suffix}: 1427 cmp \$`(48*16)`,$T2 1428 jl .L_exit_AAD_loop48x16_${label_suffix} 1429___ 1430 1431 $code .= <<___; 1432 vmovdqu64 `64*0`($T1),$ZT1 # ; Blocks 0-3 1433 vmovdqu64 `64*1`($T1),$ZT2 # ; Blocks 4-7 1434 vmovdqu64 `64*2`($T1),$ZT3 # ; Blocks 8-11 1435 vmovdqu64 `64*3`($T1),$ZT4 # ; Blocks 12-15 1436 vpshufb $SHFMSK,$ZT1,$ZT1 1437 vpshufb $SHFMSK,$ZT2,$ZT2 1438 vpshufb $SHFMSK,$ZT3,$ZT3 1439 vpshufb $SHFMSK,$ZT4,$ZT4 1440___ 1441 1442 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all"); 1443 $code .= "mov \$1,$HKEYS_READY\n"; 1444 1445 &GHASH_16( 1446 "start", $ZT5, $ZT6, $ZT7, 1447 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", 1448 &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0, 1449 $ZT8, $ZT9, $ZT10, $ZT11, 1450 $ZT12, $ZT14, $ZT15, $ZT16, 1451 "NO_ZMM", $ZT1, $ZT2, $ZT3, 1452 $ZT4); 1453 1454 $code .= <<___; 1455 vmovdqu64 `16*16 + 64*0`($T1),$ZT1 # ; Blocks 16-19 1456 vmovdqu64 `16*16 + 64*1`($T1),$ZT2 # ; Blocks 20-23 1457 vmovdqu64 `16*16 + 64*2`($T1),$ZT3 # ; Blocks 24-27 1458 vmovdqu64 `16*16 + 64*3`($T1),$ZT4 # ; Blocks 28-31 1459 vpshufb $SHFMSK,$ZT1,$ZT1 1460 vpshufb $SHFMSK,$ZT2,$ZT2 1461 vpshufb $SHFMSK,$ZT3,$ZT3 1462 vpshufb $SHFMSK,$ZT4,$ZT4 1463___ 1464 1465 &GHASH_16( 1466 "mid", $ZT5, $ZT6, $ZT7, 1467 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", 1468 &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0, 1469 $ZT8, $ZT9, $ZT10, $ZT11, 1470 $ZT12, $ZT14, $ZT15, $ZT16, 1471 "NO_ZMM", $ZT1, $ZT2, $ZT3, 1472 $ZT4); 1473 1474 $code .= <<___; 1475 vmovdqu64 `32*16 + 64*0`($T1),$ZT1 # ; Blocks 32-35 1476 vmovdqu64 `32*16 + 64*1`($T1),$ZT2 # ; Blocks 36-39 1477 vmovdqu64 `32*16 + 64*2`($T1),$ZT3 # ; Blocks 40-43 1478 vmovdqu64 `32*16 + 64*3`($T1),$ZT4 # ; Blocks 44-47 1479 vpshufb $SHFMSK,$ZT1,$ZT1 1480 vpshufb $SHFMSK,$ZT2,$ZT2 1481 vpshufb $SHFMSK,$ZT3,$ZT3 1482 vpshufb $SHFMSK,$ZT4,$ZT4 1483___ 1484 1485 &GHASH_16( 1486 "end_reduce", $ZT5, $ZT6, $ZT7, 1487 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", 1488 &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0, 1489 $ZT8, $ZT9, $ZT10, $ZT11, 1490 $ZT12, $ZT14, $ZT15, $ZT16, 1491 "NO_ZMM", $ZT1, $ZT2, $ZT3, 1492 $ZT4); 1493 1494 $code .= <<___; 1495 sub \$`(48*16)`,$T2 1496 je .L_CALC_AAD_done_${label_suffix} 1497 1498 add \$`(48*16)`,$T1 1499 jmp .L_get_AAD_loop48x16_${label_suffix} 1500 1501.L_exit_AAD_loop48x16_${label_suffix}: 1502 # ; Less than 48x16 bytes remaining 1503 cmp \$`(32*16)`,$T2 1504 jl .L_less_than_32x16_${label_suffix} 1505___ 1506 1507 $code .= <<___; 1508 # ; Get next 16 blocks 1509 vmovdqu64 `64*0`($T1),$ZT1 1510 vmovdqu64 `64*1`($T1),$ZT2 1511 vmovdqu64 `64*2`($T1),$ZT3 1512 vmovdqu64 `64*3`($T1),$ZT4 1513 vpshufb $SHFMSK,$ZT1,$ZT1 1514 vpshufb $SHFMSK,$ZT2,$ZT2 1515 vpshufb $SHFMSK,$ZT3,$ZT3 1516 vpshufb $SHFMSK,$ZT4,$ZT4 1517___ 1518 1519 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32"); 1520 $code .= "mov \$1,$HKEYS_READY\n"; 1521 1522 &GHASH_16( 1523 "start", $ZT5, $ZT6, $ZT7, 1524 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", 1525 &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0, 1526 $ZT8, $ZT9, $ZT10, $ZT11, 1527 $ZT12, $ZT14, $ZT15, $ZT16, 1528 "NO_ZMM", $ZT1, $ZT2, $ZT3, 1529 $ZT4); 1530 1531 $code .= <<___; 1532 vmovdqu64 `16*16 + 64*0`($T1),$ZT1 1533 vmovdqu64 `16*16 + 64*1`($T1),$ZT2 1534 vmovdqu64 `16*16 + 64*2`($T1),$ZT3 1535 vmovdqu64 `16*16 + 64*3`($T1),$ZT4 1536 vpshufb $SHFMSK,$ZT1,$ZT1 1537 vpshufb $SHFMSK,$ZT2,$ZT2 1538 vpshufb $SHFMSK,$ZT3,$ZT3 1539 vpshufb $SHFMSK,$ZT4,$ZT4 1540___ 1541 1542 &GHASH_16( 1543 "end_reduce", $ZT5, $ZT6, $ZT7, 1544 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp", 1545 &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0, 1546 $ZT8, $ZT9, $ZT10, $ZT11, 1547 $ZT12, $ZT14, $ZT15, $ZT16, 1548 "NO_ZMM", $ZT1, $ZT2, $ZT3, 1549 $ZT4); 1550 1551 $code .= <<___; 1552 sub \$`(32*16)`,$T2 1553 je .L_CALC_AAD_done_${label_suffix} 1554 1555 add \$`(32*16)`,$T1 1556 jmp .L_less_than_16x16_${label_suffix} 1557 1558.L_less_than_32x16_${label_suffix}: 1559 cmp \$`(16*16)`,$T2 1560 jl .L_less_than_16x16_${label_suffix} 1561 # ; Get next 16 blocks 1562 vmovdqu64 `64*0`($T1),$ZT1 1563 vmovdqu64 `64*1`($T1),$ZT2 1564 vmovdqu64 `64*2`($T1),$ZT3 1565 vmovdqu64 `64*3`($T1),$ZT4 1566 vpshufb $SHFMSK,$ZT1,$ZT1 1567 vpshufb $SHFMSK,$ZT2,$ZT2 1568 vpshufb $SHFMSK,$ZT3,$ZT3 1569 vpshufb $SHFMSK,$ZT4,$ZT4 1570___ 1571 1572 # ; This code path does not use more than 16 hkeys, so they can be taken from the context 1573 # ; (not from the stack storage) 1574 &GHASH_16( 1575 "start_reduce", $ZT5, $ZT6, $ZT7, 1576 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX, 1577 &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0, 1578 $ZT8, $ZT9, $ZT10, $ZT11, 1579 $ZT12, $ZT14, $ZT15, $ZT16, 1580 "NO_ZMM", $ZT1, $ZT2, $ZT3, 1581 $ZT4); 1582 1583 $code .= <<___; 1584 sub \$`(16*16)`,$T2 1585 je .L_CALC_AAD_done_${label_suffix} 1586 1587 add \$`(16*16)`,$T1 1588 # ; Less than 16x16 bytes remaining 1589.L_less_than_16x16_${label_suffix}: 1590 # ;; prep mask source address 1591 lea byte64_len_to_mask_table(%rip),$T3 1592 lea ($T3,$T2,8),$T3 1593 1594 # ;; calculate number of blocks to ghash (including partial bytes) 1595 add \$15,@{[DWORD($T2)]} 1596 shr \$4,@{[DWORD($T2)]} 1597 cmp \$2,@{[DWORD($T2)]} 1598 jb .L_AAD_blocks_1_${label_suffix} 1599 je .L_AAD_blocks_2_${label_suffix} 1600 cmp \$4,@{[DWORD($T2)]} 1601 jb .L_AAD_blocks_3_${label_suffix} 1602 je .L_AAD_blocks_4_${label_suffix} 1603 cmp \$6,@{[DWORD($T2)]} 1604 jb .L_AAD_blocks_5_${label_suffix} 1605 je .L_AAD_blocks_6_${label_suffix} 1606 cmp \$8,@{[DWORD($T2)]} 1607 jb .L_AAD_blocks_7_${label_suffix} 1608 je .L_AAD_blocks_8_${label_suffix} 1609 cmp \$10,@{[DWORD($T2)]} 1610 jb .L_AAD_blocks_9_${label_suffix} 1611 je .L_AAD_blocks_10_${label_suffix} 1612 cmp \$12,@{[DWORD($T2)]} 1613 jb .L_AAD_blocks_11_${label_suffix} 1614 je .L_AAD_blocks_12_${label_suffix} 1615 cmp \$14,@{[DWORD($T2)]} 1616 jb .L_AAD_blocks_13_${label_suffix} 1617 je .L_AAD_blocks_14_${label_suffix} 1618 cmp \$15,@{[DWORD($T2)]} 1619 je .L_AAD_blocks_15_${label_suffix} 1620___ 1621 1622 # ;; fall through for 16 blocks 1623 1624 # ;; The flow of each of these cases is identical: 1625 # ;; - load blocks plain text 1626 # ;; - shuffle loaded blocks 1627 # ;; - xor in current hash value into block 0 1628 # ;; - perform up multiplications with ghash keys 1629 # ;; - jump to reduction code 1630 1631 for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) { 1632 $code .= ".L_AAD_blocks_${aad_blocks}_${label_suffix}:\n"; 1633 if ($aad_blocks > 12) { 1634 $code .= "sub \$`12*16*8`, $T3\n"; 1635 } elsif ($aad_blocks > 8) { 1636 $code .= "sub \$`8*16*8`, $T3\n"; 1637 } elsif ($aad_blocks > 4) { 1638 $code .= "sub \$`4*16*8`, $T3\n"; 1639 } 1640 $code .= "kmovq ($T3),$MASKREG\n"; 1641 1642 &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG); 1643 1644 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4, 1645 $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); 1646 1647 &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH), 1648 $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks); 1649 1650 if ($aad_blocks > 1) { 1651 1652 # ;; fall through to CALC_AAD_done in 1 block case 1653 $code .= "jmp .L_CALC_AAD_done_${label_suffix}\n"; 1654 } 1655 1656 } 1657 $code .= ".L_CALC_AAD_done_${label_suffix}:\n"; 1658 1659 # ;; result in AAD_HASH 1660} 1661 1662# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1663# ;; PARTIAL_BLOCK 1664# ;; Handles encryption/decryption and the tag partial blocks between 1665# ;; update calls. 1666# ;; Requires the input data be at least 1 byte long. 1667# ;; Output: 1668# ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT), 1669# ;; AAD_HASH and updated GCM128_CTX 1670# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1671sub PARTIAL_BLOCK { 1672 my $GCM128_CTX = $_[0]; # [in] key pointer 1673 my $PBLOCK_LEN = $_[1]; # [in] partial block length 1674 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer 1675 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer 1676 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length 1677 my $DATA_OFFSET = $_[5]; # [out] data offset (gets set) 1678 my $AAD_HASH = $_[6]; # [out] updated GHASH value 1679 my $ENC_DEC = $_[7]; # [in] cipher direction 1680 my $GPTMP0 = $_[8]; # [clobbered] GP temporary register 1681 my $GPTMP1 = $_[9]; # [clobbered] GP temporary register 1682 my $GPTMP2 = $_[10]; # [clobbered] GP temporary register 1683 my $ZTMP0 = $_[11]; # [clobbered] ZMM temporary register 1684 my $ZTMP1 = $_[12]; # [clobbered] ZMM temporary register 1685 my $ZTMP2 = $_[13]; # [clobbered] ZMM temporary register 1686 my $ZTMP3 = $_[14]; # [clobbered] ZMM temporary register 1687 my $ZTMP4 = $_[15]; # [clobbered] ZMM temporary register 1688 my $ZTMP5 = $_[16]; # [clobbered] ZMM temporary register 1689 my $ZTMP6 = $_[17]; # [clobbered] ZMM temporary register 1690 my $ZTMP7 = $_[18]; # [clobbered] ZMM temporary register 1691 my $MASKREG = $_[19]; # [clobbered] mask temporary register 1692 1693 my $XTMP0 = &XWORD($ZTMP0); 1694 my $XTMP1 = &XWORD($ZTMP1); 1695 my $XTMP2 = &XWORD($ZTMP2); 1696 my $XTMP3 = &XWORD($ZTMP3); 1697 my $XTMP4 = &XWORD($ZTMP4); 1698 my $XTMP5 = &XWORD($ZTMP5); 1699 my $XTMP6 = &XWORD($ZTMP6); 1700 my $XTMP7 = &XWORD($ZTMP7); 1701 1702 my $LENGTH = $DATA_OFFSET; 1703 my $IA0 = $GPTMP1; 1704 my $IA1 = $GPTMP2; 1705 my $IA2 = $GPTMP0; 1706 1707 my $label_suffix = $label_count++; 1708 1709 $code .= <<___; 1710 # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero 1711 mov ($PBLOCK_LEN),$LENGTH 1712 or $LENGTH,$LENGTH 1713 je .L_partial_block_done_${label_suffix} # ;Leave Macro if no partial blocks 1714___ 1715 1716 &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG); 1717 1718 $code .= <<___; 1719 # ;; XTMP1 = my_ctx_data.partial_block_enc_key 1720 vmovdqu64 $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1 1721 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2 1722 1723 # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes 1724 # ;; (16 - $LENGTH) is the number of bytes in plaintext mod 16) 1725 lea SHIFT_MASK(%rip),$IA0 1726 add $LENGTH,$IA0 1727 vmovdqu64 ($IA0),$XTMP3 # ; shift right shuffle mask 1728 vpshufb $XTMP3,$XTMP1,$XTMP1 1729___ 1730 1731 if ($ENC_DEC eq "DEC") { 1732 $code .= <<___; 1733 # ;; keep copy of cipher text in $XTMP4 1734 vmovdqa64 $XTMP0,$XTMP4 1735___ 1736 } 1737 $code .= <<___; 1738 vpxorq $XTMP0,$XTMP1,$XTMP1 # ; Ciphertext XOR E(K, Yn) 1739 # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block 1740 # ;; Determine if partial block is not being filled and shift mask accordingly 1741___ 1742 if ($win64) { 1743 $code .= <<___; 1744 mov $PLAIN_CIPH_LEN,$IA1 1745 add $LENGTH,$IA1 1746___ 1747 } else { 1748 $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n"; 1749 } 1750 $code .= <<___; 1751 sub \$16,$IA1 1752 jge .L_no_extra_mask_${label_suffix} 1753 sub $IA1,$IA0 1754.L_no_extra_mask_${label_suffix}: 1755 # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1 1756 # ;; - mask out bottom $LENGTH bytes of $XTMP1 1757 # ;; sizeof(SHIFT_MASK) == 16 bytes 1758 vmovdqu64 16($IA0),$XTMP0 1759 vpand $XTMP0,$XTMP1,$XTMP1 1760___ 1761 1762 if ($ENC_DEC eq "DEC") { 1763 $code .= <<___; 1764 vpand $XTMP0,$XTMP4,$XTMP4 1765 vpshufb SHUF_MASK(%rip),$XTMP4,$XTMP4 1766 vpshufb $XTMP3,$XTMP4,$XTMP4 1767 vpxorq $XTMP4,$AAD_HASH,$AAD_HASH 1768___ 1769 } else { 1770 $code .= <<___; 1771 vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1 1772 vpshufb $XTMP3,$XTMP1,$XTMP1 1773 vpxorq $XTMP1,$AAD_HASH,$AAD_HASH 1774___ 1775 } 1776 $code .= <<___; 1777 cmp \$0,$IA1 1778 jl .L_partial_incomplete_${label_suffix} 1779___ 1780 1781 # ;; GHASH computation for the last <16 Byte block 1782 &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7); 1783 1784 $code .= <<___; 1785 movq \$0, ($PBLOCK_LEN) 1786 # ;; Set $LENGTH to be the number of bytes to write out 1787 mov $LENGTH,$IA0 1788 mov \$16,$LENGTH 1789 sub $IA0,$LENGTH 1790 jmp .L_enc_dec_done_${label_suffix} 1791 1792.L_partial_incomplete_${label_suffix}: 1793___ 1794 if ($win64) { 1795 $code .= <<___; 1796 mov $PLAIN_CIPH_LEN,$IA0 1797 add $IA0,($PBLOCK_LEN) 1798___ 1799 } else { 1800 $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n"; 1801 } 1802 $code .= <<___; 1803 mov $PLAIN_CIPH_LEN,$LENGTH 1804 1805.L_enc_dec_done_${label_suffix}: 1806 # ;; output encrypted Bytes 1807 1808 lea byte_len_to_mask_table(%rip),$IA0 1809 kmovw ($IA0,$LENGTH,2),$MASKREG 1810 vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX) 1811___ 1812 1813 if ($ENC_DEC eq "ENC") { 1814 $code .= <<___; 1815 # ;; shuffle XTMP1 back to output as ciphertext 1816 vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1 1817 vpshufb $XTMP3,$XTMP1,$XTMP1 1818___ 1819 } 1820 $code .= <<___; 1821 mov $CIPH_PLAIN_OUT,$IA0 1822 vmovdqu8 $XTMP1,($IA0){$MASKREG} 1823.L_partial_block_done_${label_suffix}: 1824___ 1825} 1826 1827# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1828# ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation 1829sub INITIAL_BLOCKS_PARTIAL_CIPHER { 1830 my $AES_KEYS = $_[0]; # [in] key pointer 1831 my $GCM128_CTX = $_[1]; # [in] context pointer 1832 my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer 1833 my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer 1834 my $LENGTH = $_[4]; # [in/clobbered] length in bytes 1835 my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated) 1836 my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) 1837 my $CTR = $_[7]; # [in/out] current counter value 1838 my $ENC_DEC = $_[8]; # [in] cipher direction (ENC/DEC) 1839 my $DAT0 = $_[9]; # [out] ZMM with cipher text shuffled for GHASH 1840 my $DAT1 = $_[10]; # [out] ZMM with cipher text shuffled for GHASH 1841 my $DAT2 = $_[11]; # [out] ZMM with cipher text shuffled for GHASH 1842 my $DAT3 = $_[12]; # [out] ZMM with cipher text shuffled for GHASH 1843 my $LAST_CIPHER_BLK = $_[13]; # [out] XMM to put ciphered counter block partially xor'ed with text 1844 my $LAST_GHASH_BLK = $_[14]; # [out] XMM to put last cipher text block shuffled for GHASH 1845 my $CTR0 = $_[15]; # [clobbered] ZMM temporary 1846 my $CTR1 = $_[16]; # [clobbered] ZMM temporary 1847 my $CTR2 = $_[17]; # [clobbered] ZMM temporary 1848 my $CTR3 = $_[18]; # [clobbered] ZMM temporary 1849 my $ZT1 = $_[19]; # [clobbered] ZMM temporary 1850 my $IA0 = $_[20]; # [clobbered] GP temporary 1851 my $IA1 = $_[21]; # [clobbered] GP temporary 1852 my $MASKREG = $_[22]; # [clobbered] mask register 1853 my $SHUFMASK = $_[23]; # [out] ZMM loaded with BE/LE shuffle mask 1854 1855 if ($NUM_BLOCKS == 1) { 1856 $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n"; 1857 } elsif ($NUM_BLOCKS == 2) { 1858 $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n"; 1859 } else { 1860 $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n"; 1861 } 1862 1863 # ;; prepare AES counter blocks 1864 if ($NUM_BLOCKS == 1) { 1865 $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n"; 1866 } elsif ($NUM_BLOCKS == 2) { 1867 $code .= <<___; 1868 vshufi64x2 \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]} 1869 vpaddd ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]} 1870___ 1871 } else { 1872 $code .= <<___; 1873 vshufi64x2 \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]} 1874 vpaddd ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0 1875___ 1876 if ($NUM_BLOCKS > 4) { 1877 $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n"; 1878 } 1879 if ($NUM_BLOCKS > 8) { 1880 $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n"; 1881 } 1882 if ($NUM_BLOCKS > 12) { 1883 $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n"; 1884 } 1885 } 1886 1887 # ;; get load/store mask 1888 $code .= <<___; 1889 lea byte64_len_to_mask_table(%rip),$IA0 1890 mov $LENGTH,$IA1 1891___ 1892 if ($NUM_BLOCKS > 12) { 1893 $code .= "sub \$`3*64`,$IA1\n"; 1894 } elsif ($NUM_BLOCKS > 8) { 1895 $code .= "sub \$`2*64`,$IA1\n"; 1896 } elsif ($NUM_BLOCKS > 4) { 1897 $code .= "sub \$`1*64`,$IA1\n"; 1898 } 1899 $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n"; 1900 1901 # ;; extract new counter value 1902 # ;; shuffle the counters for AES rounds 1903 if ($NUM_BLOCKS <= 4) { 1904 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n"; 1905 } elsif ($NUM_BLOCKS <= 8) { 1906 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n"; 1907 } elsif ($NUM_BLOCKS <= 12) { 1908 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n"; 1909 } else { 1910 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n"; 1911 } 1912 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 1913 $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0, 1914 $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK); 1915 1916 # ;; load plain/cipher text 1917 &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG); 1918 1919 # ;; AES rounds and XOR with plain/cipher text 1920 foreach my $j (0 .. ($NROUNDS + 1)) { 1921 $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n"; 1922 &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j, 1923 $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS); 1924 } 1925 1926 # ;; retrieve the last cipher counter block (partially XOR'ed with text) 1927 # ;; - this is needed for partial block cases 1928 if ($NUM_BLOCKS <= 4) { 1929 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n"; 1930 } elsif ($NUM_BLOCKS <= 8) { 1931 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n"; 1932 } elsif ($NUM_BLOCKS <= 12) { 1933 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n"; 1934 } else { 1935 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n"; 1936 } 1937 1938 # ;; write cipher/plain text back to output and 1939 $code .= "mov $CIPH_PLAIN_OUT,$IA0\n"; 1940 &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG); 1941 1942 # ;; zero bytes outside the mask before hashing 1943 if ($NUM_BLOCKS <= 4) { 1944 $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n"; 1945 } elsif ($NUM_BLOCKS <= 8) { 1946 $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n"; 1947 } elsif ($NUM_BLOCKS <= 12) { 1948 $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n"; 1949 } else { 1950 $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n"; 1951 } 1952 1953 # ;; Shuffle the cipher text blocks for hashing part 1954 # ;; ZT5 and ZT6 are expected outputs with blocks for hashing 1955 if ($ENC_DEC eq "DEC") { 1956 1957 # ;; Decrypt case 1958 # ;; - cipher blocks are in ZT5 & ZT6 1959 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 1960 $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0, 1961 $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK); 1962 } else { 1963 1964 # ;; Encrypt case 1965 # ;; - cipher blocks are in CTR0-CTR3 1966 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 1967 $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0, 1968 $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK); 1969 } 1970 1971 # ;; Extract the last block for partials and multi_call cases 1972 if ($NUM_BLOCKS <= 4) { 1973 $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n"; 1974 } elsif ($NUM_BLOCKS <= 8) { 1975 $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n"; 1976 } elsif ($NUM_BLOCKS <= 12) { 1977 $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n"; 1978 } else { 1979 $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n"; 1980 } 1981 1982} 1983 1984# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1985# ;; Computes GHASH on 1 to 16 blocks 1986sub INITIAL_BLOCKS_PARTIAL_GHASH { 1987 my $AES_KEYS = $_[0]; # [in] key pointer 1988 my $GCM128_CTX = $_[1]; # [in] context pointer 1989 my $LENGTH = $_[2]; # [in/clobbered] length in bytes 1990 my $NUM_BLOCKS = $_[3]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) 1991 my $HASH_IN_OUT = $_[4]; # [in/out] XMM ghash in/out value 1992 my $ENC_DEC = $_[5]; # [in] cipher direction (ENC/DEC) 1993 my $DAT0 = $_[6]; # [in] ZMM with cipher text shuffled for GHASH 1994 my $DAT1 = $_[7]; # [in] ZMM with cipher text shuffled for GHASH 1995 my $DAT2 = $_[8]; # [in] ZMM with cipher text shuffled for GHASH 1996 my $DAT3 = $_[9]; # [in] ZMM with cipher text shuffled for GHASH 1997 my $LAST_CIPHER_BLK = $_[10]; # [in] XMM with ciphered counter block partially xor'ed with text 1998 my $LAST_GHASH_BLK = $_[11]; # [in] XMM with last cipher text block shuffled for GHASH 1999 my $ZT0 = $_[12]; # [clobbered] ZMM temporary 2000 my $ZT1 = $_[13]; # [clobbered] ZMM temporary 2001 my $ZT2 = $_[14]; # [clobbered] ZMM temporary 2002 my $ZT3 = $_[15]; # [clobbered] ZMM temporary 2003 my $ZT4 = $_[16]; # [clobbered] ZMM temporary 2004 my $ZT5 = $_[17]; # [clobbered] ZMM temporary 2005 my $ZT6 = $_[18]; # [clobbered] ZMM temporary 2006 my $ZT7 = $_[19]; # [clobbered] ZMM temporary 2007 my $ZT8 = $_[20]; # [clobbered] ZMM temporary 2008 my $PBLOCK_LEN = $_[21]; # [in] partial block length 2009 my $GH = $_[22]; # [in] ZMM with hi product part 2010 my $GM = $_[23]; # [in] ZMM with mid product part 2011 my $GL = $_[24]; # [in] ZMM with lo product part 2012 2013 my $label_suffix = $label_count++; 2014 2015 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2016 # ;;; - Hash all but the last partial block of data 2017 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2018 2019 # ;; update data offset 2020 if ($NUM_BLOCKS > 1) { 2021 2022 # ;; The final block of data may be <16B 2023 $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n"; 2024 } 2025 2026 if ($NUM_BLOCKS < 16) { 2027 $code .= <<___; 2028 # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16. 2029 # ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256. 2030 cmp \$16,$LENGTH 2031 jl .L_small_initial_partial_block_${label_suffix} 2032 2033 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2034 # ;;; Handle a full length final block - encrypt and hash all blocks 2035 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2036 2037 sub \$16,$LENGTH 2038 movq \$0,($PBLOCK_LEN) 2039___ 2040 2041 # ;; Hash all of the data 2042 if (scalar(@_) == 22) { 2043 2044 # ;; start GHASH compute 2045 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, 2046 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS); 2047 } elsif (scalar(@_) == 25) { 2048 2049 # ;; continue GHASH compute 2050 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, 2051 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL); 2052 } 2053 $code .= "jmp .L_small_initial_compute_done_${label_suffix}\n"; 2054 } 2055 2056 $code .= <<___; 2057.L_small_initial_partial_block_${label_suffix}: 2058 2059 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2060 # ;;; Handle ghash for a <16B final block 2061 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2062 2063 # ;; As it's an init / update / finalize series we need to leave the 2064 # ;; last block if it's less than a full block of data. 2065 2066 mov $LENGTH,($PBLOCK_LEN) 2067 vmovdqu64 $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX) 2068___ 2069 2070 my $k = ($NUM_BLOCKS - 1); 2071 my $last_block_to_hash = 1; 2072 if (($NUM_BLOCKS > $last_block_to_hash)) { 2073 2074 # ;; ZT12-ZT20 - temporary registers 2075 if (scalar(@_) == 22) { 2076 2077 # ;; start GHASH compute 2078 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, 2079 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k); 2080 } elsif (scalar(@_) == 25) { 2081 2082 # ;; continue GHASH compute 2083 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, 2084 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL); 2085 } 2086 2087 # ;; just fall through no jmp needed 2088 } else { 2089 2090 if (scalar(@_) == 25) { 2091 $code .= <<___; 2092 # ;; Reduction is required in this case. 2093 # ;; Integrate GM into GH and GL. 2094 vpsrldq \$8,$GM,$ZT0 2095 vpslldq \$8,$GM,$ZT1 2096 vpxorq $ZT0,$GH,$GH 2097 vpxorq $ZT1,$GL,$GL 2098___ 2099 2100 # ;; Add GH and GL 128-bit words horizontally 2101 &VHPXORI4x128($GH, $ZT0); 2102 &VHPXORI4x128($GL, $ZT1); 2103 2104 # ;; 256-bit to 128-bit reduction 2105 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n"; 2106 &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2)); 2107 } 2108 $code .= <<___; 2109 # ;; Record that a reduction is not needed - 2110 # ;; In this case no hashes are computed because there 2111 # ;; is only one initial block and it is < 16B in length. 2112 # ;; We only need to check if a reduction is needed if 2113 # ;; initial_blocks == 1 and init/update/final is being used. 2114 # ;; In this case we may just have a partial block, and that 2115 # ;; gets hashed in finalize. 2116 2117 # ;; The hash should end up in HASH_IN_OUT. 2118 # ;; The only way we should get here is if there is 2119 # ;; a partial block of data, so xor that into the hash. 2120 vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT 2121 # ;; The result is in $HASH_IN_OUT 2122 jmp .L_after_reduction_${label_suffix} 2123___ 2124 } 2125 2126 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2127 # ;;; After GHASH reduction 2128 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2129 2130 $code .= ".L_small_initial_compute_done_${label_suffix}:\n"; 2131 2132 # ;; If using init/update/finalize, we need to xor any partial block data 2133 # ;; into the hash. 2134 if ($NUM_BLOCKS > 1) { 2135 2136 # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place 2137 if ($NUM_BLOCKS != 16) { 2138 $code .= <<___; 2139 # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen] is never zero 2140 or $LENGTH,$LENGTH 2141 je .L_after_reduction_${label_suffix} 2142___ 2143 } 2144 $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n"; 2145 } 2146 2147 $code .= ".L_after_reduction_${label_suffix}:\n"; 2148 2149 # ;; Final hash is now in HASH_IN_OUT 2150} 2151 2152# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2153# ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block. 2154# ;; It may look similar to INITIAL_BLOCKS but its usage is different: 2155# ;; - first encrypts/decrypts required number of blocks and then 2156# ;; ghashes these blocks 2157# ;; - Small packets or left over data chunks (<256 bytes) 2158# ;; - Remaining data chunks below 256 bytes (multi buffer code) 2159# ;; 2160# ;; num_initial_blocks is expected to include the partial final block 2161# ;; in the count. 2162sub INITIAL_BLOCKS_PARTIAL { 2163 my $AES_KEYS = $_[0]; # [in] key pointer 2164 my $GCM128_CTX = $_[1]; # [in] context pointer 2165 my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer 2166 my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer 2167 my $LENGTH = $_[4]; # [in/clobbered] length in bytes 2168 my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated) 2169 my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) 2170 my $CTR = $_[7]; # [in/out] current counter value 2171 my $HASH_IN_OUT = $_[8]; # [in/out] XMM ghash in/out value 2172 my $ENC_DEC = $_[9]; # [in] cipher direction (ENC/DEC) 2173 my $CTR0 = $_[10]; # [clobbered] ZMM temporary 2174 my $CTR1 = $_[11]; # [clobbered] ZMM temporary 2175 my $CTR2 = $_[12]; # [clobbered] ZMM temporary 2176 my $CTR3 = $_[13]; # [clobbered] ZMM temporary 2177 my $DAT0 = $_[14]; # [clobbered] ZMM temporary 2178 my $DAT1 = $_[15]; # [clobbered] ZMM temporary 2179 my $DAT2 = $_[16]; # [clobbered] ZMM temporary 2180 my $DAT3 = $_[17]; # [clobbered] ZMM temporary 2181 my $LAST_CIPHER_BLK = $_[18]; # [clobbered] ZMM temporary 2182 my $LAST_GHASH_BLK = $_[19]; # [clobbered] ZMM temporary 2183 my $ZT0 = $_[20]; # [clobbered] ZMM temporary 2184 my $ZT1 = $_[21]; # [clobbered] ZMM temporary 2185 my $ZT2 = $_[22]; # [clobbered] ZMM temporary 2186 my $ZT3 = $_[23]; # [clobbered] ZMM temporary 2187 my $ZT4 = $_[24]; # [clobbered] ZMM temporary 2188 my $IA0 = $_[25]; # [clobbered] GP temporary 2189 my $IA1 = $_[26]; # [clobbered] GP temporary 2190 my $MASKREG = $_[27]; # [clobbered] mask register 2191 my $SHUFMASK = $_[28]; # [clobbered] ZMM for BE/LE shuffle mask 2192 my $PBLOCK_LEN = $_[29]; # [in] partial block length 2193 2194 &INITIAL_BLOCKS_PARTIAL_CIPHER( 2195 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, 2196 $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR, 2197 $ENC_DEC, $DAT0, $DAT1, $DAT2, 2198 $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0, 2199 $CTR1, $CTR2, $CTR3, $ZT0, 2200 $IA0, $IA1, $MASKREG, $SHUFMASK); 2201 2202 &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0, 2203 $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK), 2204 &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN); 2205} 2206 2207# ;; =========================================================================== 2208# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks 2209# ;; followed with GHASH of the N blocks. 2210sub GHASH_16_ENCRYPT_N_GHASH_N { 2211 my $AES_KEYS = $_[0]; # [in] key pointer 2212 my $GCM128_CTX = $_[1]; # [in] context pointer 2213 my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer 2214 my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer 2215 my $DATA_OFFSET = $_[4]; # [in] data offset 2216 my $LENGTH = $_[5]; # [in] data length 2217 my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian 2218 my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check 2219 my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key 2220 # (can be in form of register or numerical value) 2221 my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in 2222 my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb 2223 my $B00_03 = $_[11]; # [clobbered] temporary ZMM 2224 my $B04_07 = $_[12]; # [clobbered] temporary ZMM 2225 my $B08_11 = $_[13]; # [clobbered] temporary ZMM 2226 my $B12_15 = $_[14]; # [clobbered] temporary ZMM 2227 my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM 2228 my $GH1L = $_[16]; # [clobbered] temporary ZMM 2229 my $GH1M = $_[17]; # [clobbered] temporary ZMM 2230 my $GH1T = $_[18]; # [clobbered] temporary ZMM 2231 my $GH2H = $_[19]; # [clobbered] temporary ZMM 2232 my $GH2L = $_[20]; # [clobbered] temporary ZMM 2233 my $GH2M = $_[21]; # [clobbered] temporary ZMM 2234 my $GH2T = $_[22]; # [clobbered] temporary ZMM 2235 my $GH3H = $_[23]; # [clobbered] temporary ZMM 2236 my $GH3L = $_[24]; # [clobbered] temporary ZMM 2237 my $GH3M = $_[25]; # [clobbered] temporary ZMM 2238 my $GH3T = $_[26]; # [clobbered] temporary ZMM 2239 my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM 2240 my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM 2241 my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM 2242 my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM 2243 my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM 2244 my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM 2245 my $ZT01 = $_[33]; # [clobbered] temporary ZMM 2246 my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian 2247 my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian 2248 my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce" 2249 my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum 2250 my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum 2251 my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum 2252 my $ENC_DEC = $_[40]; # [in] cipher direction 2253 my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value 2254 my $IA0 = $_[42]; # [clobbered] GP temporary 2255 my $IA1 = $_[43]; # [clobbered] GP temporary 2256 my $MASKREG = $_[44]; # [clobbered] mask register 2257 my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16) 2258 my $PBLOCK_LEN = $_[46]; # [in] partial block length 2259 2260 die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n" 2261 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0); 2262 2263 my $label_suffix = $label_count++; 2264 2265 my $GH1H = $HASH_IN_OUT; 2266 2267 # ; this is to avoid additional move in do_reduction case 2268 2269 my $LAST_GHASH_BLK = $GH1L; 2270 my $LAST_CIPHER_BLK = $GH1T; 2271 2272 my $RED_POLY = $GH2T; 2273 my $RED_P1 = $GH2L; 2274 my $RED_T1 = $GH2H; 2275 my $RED_T2 = $GH2M; 2276 2277 my $DATA1 = $GH3H; 2278 my $DATA2 = $GH3L; 2279 my $DATA3 = $GH3M; 2280 my $DATA4 = $GH3T; 2281 2282 # ;; do reduction after the 16 blocks ? 2283 my $do_reduction = 0; 2284 2285 # ;; is 16 block chunk a start? 2286 my $is_start = 0; 2287 2288 if ($GHASH_TYPE eq "start_reduce") { 2289 $is_start = 1; 2290 $do_reduction = 1; 2291 } 2292 2293 if ($GHASH_TYPE eq "start") { 2294 $is_start = 1; 2295 } 2296 2297 if ($GHASH_TYPE eq "end_reduce") { 2298 $do_reduction = 1; 2299 } 2300 2301 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2302 # ;; - get load/store mask 2303 # ;; - load plain/cipher text 2304 # ;; get load/store mask 2305 $code .= <<___; 2306 lea byte64_len_to_mask_table(%rip),$IA0 2307 mov $LENGTH,$IA1 2308___ 2309 if ($NUM_BLOCKS > 12) { 2310 $code .= "sub \$`3*64`,$IA1\n"; 2311 } elsif ($NUM_BLOCKS > 8) { 2312 $code .= "sub \$`2*64`,$IA1\n"; 2313 } elsif ($NUM_BLOCKS > 4) { 2314 $code .= "sub \$`1*64`,$IA1\n"; 2315 } 2316 $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n"; 2317 2318 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2319 # ;; prepare counter blocks 2320 2321 $code .= <<___; 2322 cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]} 2323 jae .L_16_blocks_overflow_${label_suffix} 2324___ 2325 2326 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2327 $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE, 2328 $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4); 2329 $code .= <<___; 2330 jmp .L_16_blocks_ok_${label_suffix} 2331 2332.L_16_blocks_overflow_${label_suffix}: 2333 vpshufb $SHFMSK,$CTR_BE,$CTR_BE 2334 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03 2335___ 2336 if ($NUM_BLOCKS > 4) { 2337 $code .= <<___; 2338 vmovdqa64 ddq_add_4444(%rip),$B12_15 2339 vpaddd $B12_15,$B00_03,$B04_07 2340___ 2341 } 2342 if ($NUM_BLOCKS > 8) { 2343 $code .= "vpaddd $B12_15,$B04_07,$B08_11\n"; 2344 } 2345 if ($NUM_BLOCKS > 12) { 2346 $code .= "vpaddd $B12_15,$B08_11,$B12_15\n"; 2347 } 2348 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2349 $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2350 $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); 2351 $code .= <<___; 2352.L_16_blocks_ok_${label_suffix}: 2353 2354 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2355 # ;; - pre-load constants 2356 # ;; - add current hash into the 1st block 2357 vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1 2358___ 2359 if ($is_start != 0) { 2360 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n"; 2361 } else { 2362 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n"; 2363 } 2364 2365 $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n"; 2366 2367 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2368 # ;; save counter for the next round 2369 # ;; increment counter overflow check register 2370 if ($NUM_BLOCKS <= 4) { 2371 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n"; 2372 } elsif ($NUM_BLOCKS <= 8) { 2373 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n"; 2374 } elsif ($NUM_BLOCKS <= 12) { 2375 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n"; 2376 } else { 2377 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n"; 2378 } 2379 $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n"; 2380 2381 $code .= <<___; 2382 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2383 # ;; pre-load constants 2384 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2 2385 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2 2386 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2 2387___ 2388 2389 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2390 # ;; stitch AES rounds with GHASH 2391 2392 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2393 # ;; AES round 0 - ARK 2394 2395 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2396 $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2397 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2398 $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n"; 2399 2400 $code .= <<___; 2401 # ;;================================================== 2402 # ;; GHASH 4 blocks (15 to 12) 2403 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1 2404 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0 2405 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0 2406 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1 2407 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1 2408 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1 2409___ 2410 2411 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2412 # ;; AES round 1 2413 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2414 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2415 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2416 $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n"; 2417 2418 $code .= <<___; 2419 # ;; ================================================= 2420 # ;; GHASH 4 blocks (11 to 8) 2421 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 2422 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 2423 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 2424 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 2425 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2 2426 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2 2427___ 2428 2429 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2430 # ;; AES round 2 2431 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2432 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2433 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2434 $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n"; 2435 2436 $code .= <<___; 2437 # ;; ================================================= 2438 # ;; GHASH 4 blocks (7 to 4) 2439 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1 2440 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0 2441 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1 2442 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0 2443___ 2444 2445 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2446 # ;; AES rounds 3 2447 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2448 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2449 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2450 $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n"; 2451 2452 $code .= <<___; 2453 # ;; ================================================= 2454 # ;; Gather (XOR) GHASH for 12 blocks 2455 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H 2456 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L 2457 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T 2458 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M 2459___ 2460 2461 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2462 # ;; AES rounds 4 2463 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2464 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2465 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2466 $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n"; 2467 2468 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2469 # ;; load plain/cipher text 2470 &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG); 2471 2472 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2473 # ;; AES rounds 5 2474 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2475 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2476 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2477 $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n"; 2478 2479 $code .= <<___; 2480 # ;; ================================================= 2481 # ;; GHASH 4 blocks (3 to 0) 2482 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 2483 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 2484 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 2485 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 2486___ 2487 2488 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2489 # ;; AES round 6 2490 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2491 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2492 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2493 $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n"; 2494 2495 # ;; ================================================= 2496 # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid) 2497 # ;; - add GH2[MTLH] to GH1[MTLH] 2498 $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n"; 2499 if ($do_reduction != 0) { 2500 2501 if ($is_start != 0) { 2502 $code .= "vpxorq $GH2M,$GH1M,$GH1M\n"; 2503 } else { 2504 $code .= <<___; 2505 vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H 2506 vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L 2507 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M 2508___ 2509 } 2510 2511 } else { 2512 2513 # ;; Update H/M/L hash sums if not carrying reduction 2514 if ($is_start != 0) { 2515 $code .= <<___; 2516 vpxorq $GH2H,$GH1H,$TO_REDUCE_H 2517 vpxorq $GH2L,$GH1L,$TO_REDUCE_L 2518 vpxorq $GH2M,$GH1M,$TO_REDUCE_M 2519___ 2520 } else { 2521 $code .= <<___; 2522 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H 2523 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L 2524 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M 2525___ 2526 } 2527 2528 } 2529 2530 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2531 # ;; AES round 7 2532 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2533 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2534 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2535 $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n"; 2536 2537 # ;; ================================================= 2538 # ;; prepare mid sum for adding to high & low 2539 # ;; load polynomial constant for reduction 2540 if ($do_reduction != 0) { 2541 $code .= <<___; 2542 vpsrldq \$8,$GH1M,$GH2M 2543 vpslldq \$8,$GH1M,$GH1M 2544 2545 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]} 2546___ 2547 } 2548 2549 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2550 # ;; AES round 8 2551 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2552 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2553 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2554 $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n"; 2555 2556 # ;; ================================================= 2557 # ;; Add mid product to high and low 2558 if ($do_reduction != 0) { 2559 if ($is_start != 0) { 2560 $code .= <<___; 2561 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64 2562 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64 2563___ 2564 } else { 2565 $code .= <<___; 2566 vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64 2567 vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64 2568___ 2569 } 2570 } 2571 2572 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2573 # ;; AES round 9 2574 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2575 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2576 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2577 2578 # ;; ================================================= 2579 # ;; horizontal xor of low and high 4x128 2580 if ($do_reduction != 0) { 2581 &VHPXORI4x128($GH1H, $GH2H); 2582 &VHPXORI4x128($GH1L, $GH2L); 2583 } 2584 2585 if (($NROUNDS >= 11)) { 2586 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n"; 2587 } 2588 2589 # ;; ================================================= 2590 # ;; first phase of reduction 2591 if ($do_reduction != 0) { 2592 $code .= <<___; 2593 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]} 2594 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs 2595 vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct 2596___ 2597 } 2598 2599 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2600 # ;; AES rounds up to 11 (AES192) or 13 (AES256) 2601 # ;; AES128 is done 2602 if (($NROUNDS >= 11)) { 2603 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2604 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2605 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2606 $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n"; 2607 2608 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2609 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2610 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2611 if (($NROUNDS == 13)) { 2612 $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n"; 2613 2614 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2615 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2616 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2617 $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n"; 2618 2619 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2620 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2621 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2); 2622 } 2623 } 2624 2625 # ;; ================================================= 2626 # ;; second phase of the reduction 2627 if ($do_reduction != 0) { 2628 $code .= <<___; 2629 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]} 2630 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R 2631 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]} 2632 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts 2633 # ;; GH1H = GH1H + RED_T1 + RED_T2 2634 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]} 2635___ 2636 } 2637 2638 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2639 # ;; the last AES round 2640 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2641 $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2642 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1); 2643 2644 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2645 # ;; XOR against plain/cipher text 2646 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2647 $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03, 2648 $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4); 2649 2650 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2651 # ;; retrieve the last cipher counter block (partially XOR'ed with text) 2652 # ;; - this is needed for partial block cases 2653 if ($NUM_BLOCKS <= 4) { 2654 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n"; 2655 } elsif ($NUM_BLOCKS <= 8) { 2656 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n"; 2657 } elsif ($NUM_BLOCKS <= 12) { 2658 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n"; 2659 } else { 2660 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n"; 2661 } 2662 2663 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2664 # ;; store cipher/plain text 2665 $code .= "mov $CIPH_PLAIN_OUT,$IA0\n"; 2666 &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG); 2667 2668 # ;; ================================================= 2669 # ;; shuffle cipher text blocks for GHASH computation 2670 if ($ENC_DEC eq "ENC") { 2671 2672 # ;; zero bytes outside the mask before hashing 2673 if ($NUM_BLOCKS <= 4) { 2674 $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n"; 2675 } elsif ($NUM_BLOCKS <= 8) { 2676 $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n"; 2677 } elsif ($NUM_BLOCKS <= 12) { 2678 $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n"; 2679 } else { 2680 $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n"; 2681 } 2682 2683 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2684 $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03, 2685 $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); 2686 } else { 2687 2688 # ;; zero bytes outside the mask before hashing 2689 if ($NUM_BLOCKS <= 4) { 2690 $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n"; 2691 } elsif ($NUM_BLOCKS <= 8) { 2692 $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n"; 2693 } elsif ($NUM_BLOCKS <= 12) { 2694 $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n"; 2695 } else { 2696 $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n"; 2697 } 2698 2699 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( 2700 $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1, 2701 $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK); 2702 } 2703 2704 # ;; ================================================= 2705 # ;; Extract the last block for partial / multi_call cases 2706 if ($NUM_BLOCKS <= 4) { 2707 $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n"; 2708 } elsif ($NUM_BLOCKS <= 8) { 2709 $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n"; 2710 } elsif ($NUM_BLOCKS <= 12) { 2711 $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n"; 2712 } else { 2713 $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n"; 2714 } 2715 2716 if ($do_reduction != 0) { 2717 2718 # ;; GH1H holds reduced hash value 2719 # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)" 2720 # ;; - register rename trick obsoletes the above move 2721 } 2722 2723 # ;; ================================================= 2724 # ;; GHASH last N blocks 2725 # ;; - current hash value in HASH_IN_OUT or 2726 # ;; product parts in TO_REDUCE_H/M/L 2727 # ;; - DATA1-DATA4 include blocks for GHASH 2728 2729 if ($do_reduction == 0) { 2730 &INITIAL_BLOCKS_PARTIAL_GHASH( 2731 $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, 2732 &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2, 2733 $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), 2734 $B00_03, $B04_07, $B08_11, $B12_15, 2735 $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2, 2736 $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M, 2737 $TO_REDUCE_L); 2738 } else { 2739 &INITIAL_BLOCKS_PARTIAL_GHASH( 2740 $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, 2741 &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2, 2742 $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), 2743 $B00_03, $B04_07, $B08_11, $B12_15, 2744 $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2, 2745 $GHKEY1, $PBLOCK_LEN); 2746 } 2747} 2748 2749# ;; =========================================================================== 2750# ;; =========================================================================== 2751# ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks 2752# ;; followed with GHASH of the N blocks. 2753sub GCM_ENC_DEC_LAST { 2754 my $AES_KEYS = $_[0]; # [in] key pointer 2755 my $GCM128_CTX = $_[1]; # [in] context pointer 2756 my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer 2757 my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer 2758 my $DATA_OFFSET = $_[4]; # [in] data offset 2759 my $LENGTH = $_[5]; # [in/clobbered] data length 2760 my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian 2761 my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check 2762 my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key 2763 # (can be register or numerical offset) 2764 my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in 2765 my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb 2766 my $ZT00 = $_[11]; # [clobbered] temporary ZMM 2767 my $ZT01 = $_[12]; # [clobbered] temporary ZMM 2768 my $ZT02 = $_[13]; # [clobbered] temporary ZMM 2769 my $ZT03 = $_[14]; # [clobbered] temporary ZMM 2770 my $ZT04 = $_[15]; # [clobbered] temporary ZMM 2771 my $ZT05 = $_[16]; # [clobbered] temporary ZMM 2772 my $ZT06 = $_[17]; # [clobbered] temporary ZMM 2773 my $ZT07 = $_[18]; # [clobbered] temporary ZMM 2774 my $ZT08 = $_[19]; # [clobbered] temporary ZMM 2775 my $ZT09 = $_[20]; # [clobbered] temporary ZMM 2776 my $ZT10 = $_[21]; # [clobbered] temporary ZMM 2777 my $ZT11 = $_[22]; # [clobbered] temporary ZMM 2778 my $ZT12 = $_[23]; # [clobbered] temporary ZMM 2779 my $ZT13 = $_[24]; # [clobbered] temporary ZMM 2780 my $ZT14 = $_[25]; # [clobbered] temporary ZMM 2781 my $ZT15 = $_[26]; # [clobbered] temporary ZMM 2782 my $ZT16 = $_[27]; # [clobbered] temporary ZMM 2783 my $ZT17 = $_[28]; # [clobbered] temporary ZMM 2784 my $ZT18 = $_[29]; # [clobbered] temporary ZMM 2785 my $ZT19 = $_[30]; # [clobbered] temporary ZMM 2786 my $ZT20 = $_[31]; # [clobbered] temporary ZMM 2787 my $ZT21 = $_[32]; # [clobbered] temporary ZMM 2788 my $ZT22 = $_[33]; # [clobbered] temporary ZMM 2789 my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian 2790 my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian 2791 my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce" 2792 my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum 2793 my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum 2794 my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum 2795 my $ENC_DEC = $_[40]; # [in] cipher direction 2796 my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value 2797 my $IA0 = $_[42]; # [clobbered] GP temporary 2798 my $IA1 = $_[43]; # [clobbered] GP temporary 2799 my $MASKREG = $_[44]; # [clobbered] mask register 2800 my $PBLOCK_LEN = $_[45]; # [in] partial block length 2801 2802 my $label_suffix = $label_count++; 2803 2804 $code .= <<___; 2805 mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]} 2806 add \$15,@{[DWORD($IA0)]} 2807 shr \$4,@{[DWORD($IA0)]} 2808 je .L_last_num_blocks_is_0_${label_suffix} 2809 2810 cmp \$8,@{[DWORD($IA0)]} 2811 je .L_last_num_blocks_is_8_${label_suffix} 2812 jb .L_last_num_blocks_is_7_1_${label_suffix} 2813 2814 2815 cmp \$12,@{[DWORD($IA0)]} 2816 je .L_last_num_blocks_is_12_${label_suffix} 2817 jb .L_last_num_blocks_is_11_9_${label_suffix} 2818 2819 # ;; 16, 15, 14 or 13 2820 cmp \$15,@{[DWORD($IA0)]} 2821 je .L_last_num_blocks_is_15_${label_suffix} 2822 ja .L_last_num_blocks_is_16_${label_suffix} 2823 cmp \$14,@{[DWORD($IA0)]} 2824 je .L_last_num_blocks_is_14_${label_suffix} 2825 jmp .L_last_num_blocks_is_13_${label_suffix} 2826 2827.L_last_num_blocks_is_11_9_${label_suffix}: 2828 # ;; 11, 10 or 9 2829 cmp \$10,@{[DWORD($IA0)]} 2830 je .L_last_num_blocks_is_10_${label_suffix} 2831 ja .L_last_num_blocks_is_11_${label_suffix} 2832 jmp .L_last_num_blocks_is_9_${label_suffix} 2833 2834.L_last_num_blocks_is_7_1_${label_suffix}: 2835 cmp \$4,@{[DWORD($IA0)]} 2836 je .L_last_num_blocks_is_4_${label_suffix} 2837 jb .L_last_num_blocks_is_3_1_${label_suffix} 2838 # ;; 7, 6 or 5 2839 cmp \$6,@{[DWORD($IA0)]} 2840 ja .L_last_num_blocks_is_7_${label_suffix} 2841 je .L_last_num_blocks_is_6_${label_suffix} 2842 jmp .L_last_num_blocks_is_5_${label_suffix} 2843 2844.L_last_num_blocks_is_3_1_${label_suffix}: 2845 # ;; 3, 2 or 1 2846 cmp \$2,@{[DWORD($IA0)]} 2847 ja .L_last_num_blocks_is_3_${label_suffix} 2848 je .L_last_num_blocks_is_2_${label_suffix} 2849___ 2850 2851 # ;; fall through for `jmp .L_last_num_blocks_is_1` 2852 2853 # ;; Use rep to generate different block size variants 2854 # ;; - one block size has to be the first one 2855 for my $num_blocks (1 .. 16) { 2856 $code .= ".L_last_num_blocks_is_${num_blocks}_${label_suffix}:\n"; 2857 &GHASH_16_ENCRYPT_N_GHASH_N( 2858 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, 2859 $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET, 2860 $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03, 2861 $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, 2862 $ZT09, $ZT10, $ZT11, $ZT12, $ZT13, 2863 $ZT14, $ZT15, $ZT16, $ZT17, $ZT18, 2864 $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4, 2865 $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M, 2866 $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG, 2867 $num_blocks, $PBLOCK_LEN); 2868 2869 $code .= "jmp .L_last_blocks_done_${label_suffix}\n"; 2870 } 2871 2872 $code .= ".L_last_num_blocks_is_0_${label_suffix}:\n"; 2873 2874 # ;; if there is 0 blocks to cipher then there are only 16 blocks for ghash and reduction 2875 # ;; - convert mid into end_reduce 2876 # ;; - convert start into start_reduce 2877 if ($GHASH_TYPE eq "mid") { 2878 $GHASH_TYPE = "end_reduce"; 2879 } 2880 if ($GHASH_TYPE eq "start") { 2881 $GHASH_TYPE = "start_reduce"; 2882 } 2883 2884 &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp", 2885 $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01, 2886 $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09); 2887 2888 $code .= ".L_last_blocks_done_${label_suffix}:\n"; 2889} 2890 2891# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2892# ;; Main GCM macro stitching cipher with GHASH 2893# ;; - operates on single stream 2894# ;; - encrypts 16 blocks at a time 2895# ;; - ghash the 16 previously encrypted ciphertext blocks 2896# ;; - no partial block or multi_call handling here 2897sub GHASH_16_ENCRYPT_16_PARALLEL { 2898 my $AES_KEYS = $_[0]; # [in] key pointer 2899 my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer 2900 my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer 2901 my $DATA_OFFSET = $_[3]; # [in] data offset 2902 my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian 2903 my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check 2904 my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value) 2905 my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out 2906 my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in 2907 my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb 2908 my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher) 2909 my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher) 2910 my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher) 2911 my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher) 2912 my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction) 2913 my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher) 2914 my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher) 2915 my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher) 2916 my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher) 2917 my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash) 2918 my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash) 2919 my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash) 2920 my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash) 2921 my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash) 2922 my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash) 2923 my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash) 2924 my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash) 2925 my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash) 2926 my $ZT19 = $_[28]; # [clobbered] temporary ZMM 2927 my $ZT20 = $_[29]; # [clobbered] temporary ZMM 2928 my $ZT21 = $_[30]; # [clobbered] temporary ZMM 2929 my $ZT22 = $_[31]; # [clobbered] temporary ZMM 2930 my $ZT23 = $_[32]; # [clobbered] temporary ZMM 2931 my $ADDBE_4x4 = $_[33]; # [in] ZMM with 4x128bits 4 in big-endian 2932 my $ADDBE_1234 = $_[34]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian 2933 my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum 2934 my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum 2935 my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum 2936 my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time" 2937 my $ENC_DEC = $_[39]; # [in] cipher direction 2938 my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset 2939 my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in" 2940 my $IA0 = $_[42]; # [clobbered] temporary GPR 2941 2942 my $B00_03 = $ZT1; 2943 my $B04_07 = $ZT2; 2944 my $B08_11 = $ZT3; 2945 my $B12_15 = $ZT4; 2946 2947 my $GH1H = $ZT5; 2948 2949 # ; @note: do not change this mapping 2950 my $GH1L = $ZT6; 2951 my $GH1M = $ZT7; 2952 my $GH1T = $ZT8; 2953 2954 my $GH2H = $ZT9; 2955 my $GH2L = $ZT10; 2956 my $GH2M = $ZT11; 2957 my $GH2T = $ZT12; 2958 2959 my $RED_POLY = $GH2T; 2960 my $RED_P1 = $GH2L; 2961 my $RED_T1 = $GH2H; 2962 my $RED_T2 = $GH2M; 2963 2964 my $GH3H = $ZT13; 2965 my $GH3L = $ZT14; 2966 my $GH3M = $ZT15; 2967 my $GH3T = $ZT16; 2968 2969 my $DATA1 = $ZT13; 2970 my $DATA2 = $ZT14; 2971 my $DATA3 = $ZT15; 2972 my $DATA4 = $ZT16; 2973 2974 my $AESKEY1 = $ZT17; 2975 my $AESKEY2 = $ZT18; 2976 2977 my $GHKEY1 = $ZT19; 2978 my $GHKEY2 = $ZT20; 2979 my $GHDAT1 = $ZT21; 2980 my $GHDAT2 = $ZT22; 2981 2982 my $label_suffix = $label_count++; 2983 2984 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2985 # ;; prepare counter blocks 2986 2987 $code .= <<___; 2988 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]} 2989 jae .L_16_blocks_overflow_${label_suffix} 2990 vpaddd $ADDBE_1234,$CTR_BE,$B00_03 2991 vpaddd $ADDBE_4x4,$B00_03,$B04_07 2992 vpaddd $ADDBE_4x4,$B04_07,$B08_11 2993 vpaddd $ADDBE_4x4,$B08_11,$B12_15 2994 jmp .L_16_blocks_ok_${label_suffix} 2995.L_16_blocks_overflow_${label_suffix}: 2996 vpshufb $SHFMSK,$CTR_BE,$CTR_BE 2997 vmovdqa64 ddq_add_4444(%rip),$B12_15 2998 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03 2999 vpaddd $B12_15,$B00_03,$B04_07 3000 vpaddd $B12_15,$B04_07,$B08_11 3001 vpaddd $B12_15,$B08_11,$B12_15 3002 vpshufb $SHFMSK,$B00_03,$B00_03 3003 vpshufb $SHFMSK,$B04_07,$B04_07 3004 vpshufb $SHFMSK,$B08_11,$B08_11 3005 vpshufb $SHFMSK,$B12_15,$B12_15 3006.L_16_blocks_ok_${label_suffix}: 3007___ 3008 3009 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3010 # ;; pre-load constants 3011 $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n"; 3012 if ($GHASH_IN ne "no_ghash_in") { 3013 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n"; 3014 } else { 3015 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n"; 3016 } 3017 3018 $code .= <<___; 3019 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1 3020 3021 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3022 # ;; save counter for the next round 3023 # ;; increment counter overflow check register 3024 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE 3025 addb \$16,@{[BYTE($CTR_CHECK)]} 3026 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3027 # ;; pre-load constants 3028 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2 3029 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2 3030 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2 3031 3032 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3033 # ;; stitch AES rounds with GHASH 3034 3035 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3036 # ;; AES round 0 - ARK 3037 3038 vpxorq $AESKEY1,$B00_03,$B00_03 3039 vpxorq $AESKEY1,$B04_07,$B04_07 3040 vpxorq $AESKEY1,$B08_11,$B08_11 3041 vpxorq $AESKEY1,$B12_15,$B12_15 3042 vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1 3043 3044 # ;;================================================== 3045 # ;; GHASH 4 blocks (15 to 12) 3046 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1 3047 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0 3048 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0 3049 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1 3050 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1 3051 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1 3052 3053 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3054 # ;; AES round 1 3055 vaesenc $AESKEY2,$B00_03,$B00_03 3056 vaesenc $AESKEY2,$B04_07,$B04_07 3057 vaesenc $AESKEY2,$B08_11,$B08_11 3058 vaesenc $AESKEY2,$B12_15,$B12_15 3059 vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2 3060 3061 # ;; ================================================= 3062 # ;; GHASH 4 blocks (11 to 8) 3063 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 3064 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 3065 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 3066 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 3067 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2 3068 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2 3069 3070 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3071 # ;; AES round 2 3072 vaesenc $AESKEY1,$B00_03,$B00_03 3073 vaesenc $AESKEY1,$B04_07,$B04_07 3074 vaesenc $AESKEY1,$B08_11,$B08_11 3075 vaesenc $AESKEY1,$B12_15,$B12_15 3076 vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1 3077 3078 # ;; ================================================= 3079 # ;; GHASH 4 blocks (7 to 4) 3080 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1 3081 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0 3082 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1 3083 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0 3084 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3085 # ;; AES rounds 3 3086 vaesenc $AESKEY2,$B00_03,$B00_03 3087 vaesenc $AESKEY2,$B04_07,$B04_07 3088 vaesenc $AESKEY2,$B08_11,$B08_11 3089 vaesenc $AESKEY2,$B12_15,$B12_15 3090 vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2 3091 3092 # ;; ================================================= 3093 # ;; Gather (XOR) GHASH for 12 blocks 3094 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H 3095 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L 3096 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T 3097 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M 3098 3099 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3100 # ;; AES rounds 4 3101 vaesenc $AESKEY1,$B00_03,$B00_03 3102 vaesenc $AESKEY1,$B04_07,$B04_07 3103 vaesenc $AESKEY1,$B08_11,$B08_11 3104 vaesenc $AESKEY1,$B12_15,$B12_15 3105 vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1 3106 3107 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3108 # ;; load plain/cipher text (recycle GH3xx registers) 3109 vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1 3110 vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2 3111 vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3 3112 vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4 3113 3114 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3115 # ;; AES rounds 5 3116 vaesenc $AESKEY2,$B00_03,$B00_03 3117 vaesenc $AESKEY2,$B04_07,$B04_07 3118 vaesenc $AESKEY2,$B08_11,$B08_11 3119 vaesenc $AESKEY2,$B12_15,$B12_15 3120 vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2 3121 3122 # ;; ================================================= 3123 # ;; GHASH 4 blocks (3 to 0) 3124 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1 3125 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0 3126 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1 3127 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0 3128 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3129 # ;; AES round 6 3130 vaesenc $AESKEY1,$B00_03,$B00_03 3131 vaesenc $AESKEY1,$B04_07,$B04_07 3132 vaesenc $AESKEY1,$B08_11,$B08_11 3133 vaesenc $AESKEY1,$B12_15,$B12_15 3134 vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1 3135___ 3136 3137 # ;; ================================================= 3138 # ;; gather GHASH in GH1L (low) and GH1H (high) 3139 if ($DO_REDUCTION eq "first_time") { 3140 $code .= <<___; 3141 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM 3142 vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM 3143 vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH 3144 vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL 3145___ 3146 } 3147 if ($DO_REDUCTION eq "no_reduction") { 3148 $code .= <<___; 3149 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM 3150 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM 3151 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH 3152 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL 3153___ 3154 } 3155 if ($DO_REDUCTION eq "final_reduction") { 3156 $code .= <<___; 3157 # ;; phase 1: add mid products together 3158 # ;; also load polynomial constant for reduction 3159 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM 3160 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M 3161 3162 vpsrldq \$8,$GH1M,$GH2M 3163 vpslldq \$8,$GH1M,$GH1M 3164 3165 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]} 3166___ 3167 } 3168 3169 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3170 # ;; AES round 7 3171 $code .= <<___; 3172 vaesenc $AESKEY2,$B00_03,$B00_03 3173 vaesenc $AESKEY2,$B04_07,$B04_07 3174 vaesenc $AESKEY2,$B08_11,$B08_11 3175 vaesenc $AESKEY2,$B12_15,$B12_15 3176 vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2 3177___ 3178 3179 # ;; ================================================= 3180 # ;; Add mid product to high and low 3181 if ($DO_REDUCTION eq "final_reduction") { 3182 $code .= <<___; 3183 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64 3184 vpxorq $TO_REDUCE_H,$GH1H,$GH1H 3185 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64 3186 vpxorq $TO_REDUCE_L,$GH1L,$GH1L 3187___ 3188 } 3189 3190 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3191 # ;; AES round 8 3192 $code .= <<___; 3193 vaesenc $AESKEY1,$B00_03,$B00_03 3194 vaesenc $AESKEY1,$B04_07,$B04_07 3195 vaesenc $AESKEY1,$B08_11,$B08_11 3196 vaesenc $AESKEY1,$B12_15,$B12_15 3197 vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1 3198___ 3199 3200 # ;; ================================================= 3201 # ;; horizontal xor of low and high 4x128 3202 if ($DO_REDUCTION eq "final_reduction") { 3203 &VHPXORI4x128($GH1H, $GH2H); 3204 &VHPXORI4x128($GH1L, $GH2L); 3205 } 3206 3207 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3208 # ;; AES round 9 3209 $code .= <<___; 3210 vaesenc $AESKEY2,$B00_03,$B00_03 3211 vaesenc $AESKEY2,$B04_07,$B04_07 3212 vaesenc $AESKEY2,$B08_11,$B08_11 3213 vaesenc $AESKEY2,$B12_15,$B12_15 3214___ 3215 if (($NROUNDS >= 11)) { 3216 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n"; 3217 } 3218 3219 # ;; ================================================= 3220 # ;; first phase of reduction 3221 if ($DO_REDUCTION eq "final_reduction") { 3222 $code .= <<___; 3223 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]} 3224 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs 3225 vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct 3226___ 3227 } 3228 3229 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3230 # ;; AES rounds up to 11 (AES192) or 13 (AES256) 3231 # ;; AES128 is done 3232 if (($NROUNDS >= 11)) { 3233 $code .= <<___; 3234 vaesenc $AESKEY1,$B00_03,$B00_03 3235 vaesenc $AESKEY1,$B04_07,$B04_07 3236 vaesenc $AESKEY1,$B08_11,$B08_11 3237 vaesenc $AESKEY1,$B12_15,$B12_15 3238 vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1 3239 3240 vaesenc $AESKEY2,$B00_03,$B00_03 3241 vaesenc $AESKEY2,$B04_07,$B04_07 3242 vaesenc $AESKEY2,$B08_11,$B08_11 3243 vaesenc $AESKEY2,$B12_15,$B12_15 3244___ 3245 if (($NROUNDS == 13)) { 3246 $code .= <<___; 3247 vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2 3248 3249 vaesenc $AESKEY1,$B00_03,$B00_03 3250 vaesenc $AESKEY1,$B04_07,$B04_07 3251 vaesenc $AESKEY1,$B08_11,$B08_11 3252 vaesenc $AESKEY1,$B12_15,$B12_15 3253 vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1 3254 3255 vaesenc $AESKEY2,$B00_03,$B00_03 3256 vaesenc $AESKEY2,$B04_07,$B04_07 3257 vaesenc $AESKEY2,$B08_11,$B08_11 3258 vaesenc $AESKEY2,$B12_15,$B12_15 3259___ 3260 } 3261 } 3262 3263 # ;; ================================================= 3264 # ;; second phase of the reduction 3265 if ($DO_REDUCTION eq "final_reduction") { 3266 $code .= <<___; 3267 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]} 3268 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R 3269 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]} 3270 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts 3271 # ;; GH1H = GH1H x RED_T1 x RED_T2 3272 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]} 3273___ 3274 } 3275 3276 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3277 # ;; the last AES round 3278 $code .= <<___; 3279 vaesenclast $AESKEY1,$B00_03,$B00_03 3280 vaesenclast $AESKEY1,$B04_07,$B04_07 3281 vaesenclast $AESKEY1,$B08_11,$B08_11 3282 vaesenclast $AESKEY1,$B12_15,$B12_15 3283 3284 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3285 # ;; XOR against plain/cipher text 3286 vpxorq $DATA1,$B00_03,$B00_03 3287 vpxorq $DATA2,$B04_07,$B04_07 3288 vpxorq $DATA3,$B08_11,$B08_11 3289 vpxorq $DATA4,$B12_15,$B12_15 3290 3291 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3292 # ;; store cipher/plain text 3293 mov $CIPH_PLAIN_OUT,$IA0 3294 vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1) 3295 vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1) 3296 vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1) 3297 vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1) 3298___ 3299 3300 # ;; ================================================= 3301 # ;; shuffle cipher text blocks for GHASH computation 3302 if ($ENC_DEC eq "ENC") { 3303 $code .= <<___; 3304 vpshufb $SHFMSK,$B00_03,$B00_03 3305 vpshufb $SHFMSK,$B04_07,$B04_07 3306 vpshufb $SHFMSK,$B08_11,$B08_11 3307 vpshufb $SHFMSK,$B12_15,$B12_15 3308___ 3309 } else { 3310 $code .= <<___; 3311 vpshufb $SHFMSK,$DATA1,$B00_03 3312 vpshufb $SHFMSK,$DATA2,$B04_07 3313 vpshufb $SHFMSK,$DATA3,$B08_11 3314 vpshufb $SHFMSK,$DATA4,$B12_15 3315___ 3316 } 3317 3318 # ;; ================================================= 3319 # ;; store shuffled cipher text for ghashing 3320 $code .= <<___; 3321 vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp) 3322 vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp) 3323 vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp) 3324 vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp) 3325___ 3326} 3327 3328# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3329# ;;; Encryption of a single block 3330sub ENCRYPT_SINGLE_BLOCK { 3331 my $AES_KEY = $_[0]; # ; [in] 3332 my $XMM0 = $_[1]; # ; [in/out] 3333 my $GPR1 = $_[2]; # ; [clobbered] 3334 3335 my $label_suffix = $label_count++; 3336 3337 $code .= <<___; 3338 # ; load number of rounds from AES_KEY structure (offset in bytes is 3339 # ; size of the |rd_key| buffer) 3340 mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]} 3341 cmp \$9,@{[DWORD($GPR1)]} 3342 je .Laes_128_${label_suffix} 3343 cmp \$11,@{[DWORD($GPR1)]} 3344 je .Laes_192_${label_suffix} 3345 cmp \$13,@{[DWORD($GPR1)]} 3346 je .Laes_256_${label_suffix} 3347 jmp .Lexit_aes_${label_suffix} 3348___ 3349 for my $keylen (sort keys %aes_rounds) { 3350 my $nr = $aes_rounds{$keylen}; 3351 $code .= <<___; 3352.align 32 3353.Laes_${keylen}_${label_suffix}: 3354___ 3355 $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n"; 3356 for (my $i = 1; $i <= $nr; $i++) { 3357 $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n"; 3358 } 3359 $code .= <<___; 3360 vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0 3361 jmp .Lexit_aes_${label_suffix} 3362___ 3363 } 3364 $code .= ".Lexit_aes_${label_suffix}:\n\n"; 3365} 3366 3367sub CALC_J0 { 3368 my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context 3369 my $IV = $_[1]; #; [in] Pointer to IV 3370 my $IV_LEN = $_[2]; #; [in] IV length 3371 my $J0 = $_[3]; #; [out] XMM reg to contain J0 3372 my $ZT0 = $_[4]; #; [clobbered] ZMM register 3373 my $ZT1 = $_[5]; #; [clobbered] ZMM register 3374 my $ZT2 = $_[6]; #; [clobbered] ZMM register 3375 my $ZT3 = $_[7]; #; [clobbered] ZMM register 3376 my $ZT4 = $_[8]; #; [clobbered] ZMM register 3377 my $ZT5 = $_[9]; #; [clobbered] ZMM register 3378 my $ZT6 = $_[10]; #; [clobbered] ZMM register 3379 my $ZT7 = $_[11]; #; [clobbered] ZMM register 3380 my $ZT8 = $_[12]; #; [clobbered] ZMM register 3381 my $ZT9 = $_[13]; #; [clobbered] ZMM register 3382 my $ZT10 = $_[14]; #; [clobbered] ZMM register 3383 my $ZT11 = $_[15]; #; [clobbered] ZMM register 3384 my $ZT12 = $_[16]; #; [clobbered] ZMM register 3385 my $ZT13 = $_[17]; #; [clobbered] ZMM register 3386 my $ZT14 = $_[18]; #; [clobbered] ZMM register 3387 my $ZT15 = $_[19]; #; [clobbered] ZMM register 3388 my $ZT16 = $_[20]; #; [clobbered] ZMM register 3389 my $T1 = $_[21]; #; [clobbered] GP register 3390 my $T2 = $_[22]; #; [clobbered] GP register 3391 my $T3 = $_[23]; #; [clobbered] GP register 3392 my $MASKREG = $_[24]; #; [clobbered] mask register 3393 3394 # ;; J0 = GHASH(IV || 0s+64 || len(IV)64) 3395 # ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ 3396 3397 # ;; Calculate GHASH of (IV || 0s) 3398 $code .= "vpxor $J0,$J0,$J0\n"; 3399 &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, 3400 $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG); 3401 3402 # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) 3403 $code .= <<___; 3404 mov $IV_LEN,$T1 3405 shl \$3,$T1 # ; IV length in bits 3406 vmovq $T1,@{[XWORD($ZT2)]} 3407 3408 # ;; Might need shuffle of ZT2 3409 vpxorq $J0,@{[XWORD($ZT2)]},$J0 3410 3411 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]} 3412___ 3413 &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]}); 3414 3415 $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n"; 3416} 3417 3418# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3419# ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for 3420# ;;; encoding/decoding. 3421# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3422sub GCM_INIT_IV { 3423 my $AES_KEYS = $_[0]; # [in] AES key schedule 3424 my $GCM128_CTX = $_[1]; # [in/out] GCM context 3425 my $IV = $_[2]; # [in] IV pointer 3426 my $IV_LEN = $_[3]; # [in] IV length 3427 my $GPR1 = $_[4]; # [clobbered] GP register 3428 my $GPR2 = $_[5]; # [clobbered] GP register 3429 my $GPR3 = $_[6]; # [clobbered] GP register 3430 my $MASKREG = $_[7]; # [clobbered] mask register 3431 my $CUR_COUNT = $_[8]; # [out] XMM with current counter 3432 my $ZT0 = $_[9]; # [clobbered] ZMM register 3433 my $ZT1 = $_[10]; # [clobbered] ZMM register 3434 my $ZT2 = $_[11]; # [clobbered] ZMM register 3435 my $ZT3 = $_[12]; # [clobbered] ZMM register 3436 my $ZT4 = $_[13]; # [clobbered] ZMM register 3437 my $ZT5 = $_[14]; # [clobbered] ZMM register 3438 my $ZT6 = $_[15]; # [clobbered] ZMM register 3439 my $ZT7 = $_[16]; # [clobbered] ZMM register 3440 my $ZT8 = $_[17]; # [clobbered] ZMM register 3441 my $ZT9 = $_[18]; # [clobbered] ZMM register 3442 my $ZT10 = $_[19]; # [clobbered] ZMM register 3443 my $ZT11 = $_[20]; # [clobbered] ZMM register 3444 my $ZT12 = $_[21]; # [clobbered] ZMM register 3445 my $ZT13 = $_[22]; # [clobbered] ZMM register 3446 my $ZT14 = $_[23]; # [clobbered] ZMM register 3447 my $ZT15 = $_[24]; # [clobbered] ZMM register 3448 my $ZT16 = $_[25]; # [clobbered] ZMM register 3449 3450 my $ZT0x = $ZT0; 3451 $ZT0x =~ s/zmm/xmm/; 3452 3453 $code .= <<___; 3454 cmp \$12,$IV_LEN 3455 je iv_len_12_init_IV 3456___ 3457 3458 # ;; IV is different than 12 bytes 3459 &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, 3460 $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG); 3461 $code .= <<___; 3462 jmp skip_iv_len_12_init_IV 3463iv_len_12_init_IV: # ;; IV is 12 bytes 3464 # ;; read 12 IV bytes and pad with 0x00000001 3465 vmovdqu8 ONEf(%rip),$CUR_COUNT 3466 mov $IV,$GPR2 3467 mov \$0x0000000000000fff,@{[DWORD($GPR1)]} 3468 kmovq $GPR1,$MASKREG 3469 vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1 3470skip_iv_len_12_init_IV: 3471 vmovdqu $CUR_COUNT,$ZT0x 3472___ 3473 &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0) 3474 $code .= <<___; 3475 vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage 3476 3477 # ;; store IV as counter in LE format 3478 vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT 3479 vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi 3480___ 3481} 3482 3483sub GCM_UPDATE_AAD { 3484 my $GCM128_CTX = $_[0]; # [in] GCM context pointer 3485 my $A_IN = $_[1]; # [in] AAD pointer 3486 my $A_LEN = $_[2]; # [in] AAD length in bytes 3487 my $GPR1 = $_[3]; # [clobbered] GP register 3488 my $GPR2 = $_[4]; # [clobbered] GP register 3489 my $GPR3 = $_[5]; # [clobbered] GP register 3490 my $MASKREG = $_[6]; # [clobbered] mask register 3491 my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value 3492 my $ZT0 = $_[8]; # [clobbered] ZMM register 3493 my $ZT1 = $_[9]; # [clobbered] ZMM register 3494 my $ZT2 = $_[10]; # [clobbered] ZMM register 3495 my $ZT3 = $_[11]; # [clobbered] ZMM register 3496 my $ZT4 = $_[12]; # [clobbered] ZMM register 3497 my $ZT5 = $_[13]; # [clobbered] ZMM register 3498 my $ZT6 = $_[14]; # [clobbered] ZMM register 3499 my $ZT7 = $_[15]; # [clobbered] ZMM register 3500 my $ZT8 = $_[16]; # [clobbered] ZMM register 3501 my $ZT9 = $_[17]; # [clobbered] ZMM register 3502 my $ZT10 = $_[18]; # [clobbered] ZMM register 3503 my $ZT11 = $_[19]; # [clobbered] ZMM register 3504 my $ZT12 = $_[20]; # [clobbered] ZMM register 3505 my $ZT13 = $_[21]; # [clobbered] ZMM register 3506 my $ZT14 = $_[22]; # [clobbered] ZMM register 3507 my $ZT15 = $_[23]; # [clobbered] ZMM register 3508 my $ZT16 = $_[24]; # [clobbered] ZMM register 3509 3510 # ; load current hash 3511 $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n"; 3512 3513 &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2, 3514 $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, 3515 $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG); 3516 3517 # ; load current hash 3518 $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n"; 3519} 3520 3521# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3522# ;;; Cipher and ghash of payloads shorter than 256 bytes 3523# ;;; - number of blocks in the message comes as argument 3524# ;;; - depending on the number of blocks an optimized variant of 3525# ;;; INITIAL_BLOCKS_PARTIAL is invoked 3526sub GCM_ENC_DEC_SMALL { 3527 my $AES_KEYS = $_[0]; # [in] key pointer 3528 my $GCM128_CTX = $_[1]; # [in] context pointer 3529 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer 3530 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer 3531 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length 3532 my $ENC_DEC = $_[5]; # [in] cipher direction 3533 my $DATA_OFFSET = $_[6]; # [in] data offset 3534 my $LENGTH = $_[7]; # [in] data length 3535 my $NUM_BLOCKS = $_[8]; # [in] number of blocks to process 1 to 16 3536 my $CTR = $_[9]; # [in/out] XMM counter block 3537 my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value 3538 my $ZTMP0 = $_[11]; # [clobbered] ZMM register 3539 my $ZTMP1 = $_[12]; # [clobbered] ZMM register 3540 my $ZTMP2 = $_[13]; # [clobbered] ZMM register 3541 my $ZTMP3 = $_[14]; # [clobbered] ZMM register 3542 my $ZTMP4 = $_[15]; # [clobbered] ZMM register 3543 my $ZTMP5 = $_[16]; # [clobbered] ZMM register 3544 my $ZTMP6 = $_[17]; # [clobbered] ZMM register 3545 my $ZTMP7 = $_[18]; # [clobbered] ZMM register 3546 my $ZTMP8 = $_[19]; # [clobbered] ZMM register 3547 my $ZTMP9 = $_[20]; # [clobbered] ZMM register 3548 my $ZTMP10 = $_[21]; # [clobbered] ZMM register 3549 my $ZTMP11 = $_[22]; # [clobbered] ZMM register 3550 my $ZTMP12 = $_[23]; # [clobbered] ZMM register 3551 my $ZTMP13 = $_[24]; # [clobbered] ZMM register 3552 my $ZTMP14 = $_[25]; # [clobbered] ZMM register 3553 my $IA0 = $_[26]; # [clobbered] GP register 3554 my $IA1 = $_[27]; # [clobbered] GP register 3555 my $MASKREG = $_[28]; # [clobbered] mask register 3556 my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask 3557 my $PBLOCK_LEN = $_[30]; # [in] partial block length 3558 3559 my $label_suffix = $label_count++; 3560 3561 $code .= <<___; 3562 cmp \$8,$NUM_BLOCKS 3563 je .L_small_initial_num_blocks_is_8_${label_suffix} 3564 jl .L_small_initial_num_blocks_is_7_1_${label_suffix} 3565 3566 3567 cmp \$12,$NUM_BLOCKS 3568 je .L_small_initial_num_blocks_is_12_${label_suffix} 3569 jl .L_small_initial_num_blocks_is_11_9_${label_suffix} 3570 3571 # ;; 16, 15, 14 or 13 3572 cmp \$16,$NUM_BLOCKS 3573 je .L_small_initial_num_blocks_is_16_${label_suffix} 3574 cmp \$15,$NUM_BLOCKS 3575 je .L_small_initial_num_blocks_is_15_${label_suffix} 3576 cmp \$14,$NUM_BLOCKS 3577 je .L_small_initial_num_blocks_is_14_${label_suffix} 3578 jmp .L_small_initial_num_blocks_is_13_${label_suffix} 3579 3580.L_small_initial_num_blocks_is_11_9_${label_suffix}: 3581 # ;; 11, 10 or 9 3582 cmp \$11,$NUM_BLOCKS 3583 je .L_small_initial_num_blocks_is_11_${label_suffix} 3584 cmp \$10,$NUM_BLOCKS 3585 je .L_small_initial_num_blocks_is_10_${label_suffix} 3586 jmp .L_small_initial_num_blocks_is_9_${label_suffix} 3587 3588.L_small_initial_num_blocks_is_7_1_${label_suffix}: 3589 cmp \$4,$NUM_BLOCKS 3590 je .L_small_initial_num_blocks_is_4_${label_suffix} 3591 jl .L_small_initial_num_blocks_is_3_1_${label_suffix} 3592 # ;; 7, 6 or 5 3593 cmp \$7,$NUM_BLOCKS 3594 je .L_small_initial_num_blocks_is_7_${label_suffix} 3595 cmp \$6,$NUM_BLOCKS 3596 je .L_small_initial_num_blocks_is_6_${label_suffix} 3597 jmp .L_small_initial_num_blocks_is_5_${label_suffix} 3598 3599.L_small_initial_num_blocks_is_3_1_${label_suffix}: 3600 # ;; 3, 2 or 1 3601 cmp \$3,$NUM_BLOCKS 3602 je .L_small_initial_num_blocks_is_3_${label_suffix} 3603 cmp \$2,$NUM_BLOCKS 3604 je .L_small_initial_num_blocks_is_2_${label_suffix} 3605 3606 # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed 3607 3608 # ;; Generation of different block size variants 3609 # ;; - one block size has to be the first one 3610___ 3611 3612 for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) { 3613 $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${label_suffix}:\n"; 3614 &INITIAL_BLOCKS_PARTIAL( 3615 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET, 3616 $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1, 3617 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 3618 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 3619 $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN); 3620 3621 if ($num_blocks != 16) { 3622 $code .= "jmp .L_small_initial_blocks_encrypted_${label_suffix}\n"; 3623 } 3624 } 3625 3626 $code .= ".L_small_initial_blocks_encrypted_${label_suffix}:\n"; 3627} 3628 3629# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3630# ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context 3631# ; struct has been initialized by GCM_INIT_IV 3632# ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. 3633# ; Clobbers rax, r10-r15, and zmm0-zmm31, k1 3634# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3635sub GCM_ENC_DEC { 3636 my $AES_KEYS = $_[0]; # [in] AES Key schedule 3637 my $GCM128_CTX = $_[1]; # [in] context pointer 3638 my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update 3639 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer 3640 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length 3641 my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer 3642 my $ENC_DEC = $_[6]; # [in] cipher direction 3643 3644 my $IA0 = "%r10"; 3645 my $IA1 = "%r12"; 3646 my $IA2 = "%r13"; 3647 my $IA3 = "%r15"; 3648 my $IA4 = "%r11"; 3649 my $IA5 = "%rax"; 3650 my $IA6 = "%rbx"; 3651 my $IA7 = "%r14"; 3652 3653 my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN; 3654 3655 my $CTR_CHECK = $IA3; 3656 my $DATA_OFFSET = $IA4; 3657 my $HASHK_PTR = $IA6; 3658 3659 my $HKEYS_READY = $IA7; 3660 3661 my $CTR_BLOCKz = "%zmm2"; 3662 my $CTR_BLOCKx = "%xmm2"; 3663 3664 # ; hardcoded in GCM_INIT 3665 3666 my $AAD_HASHz = "%zmm14"; 3667 my $AAD_HASHx = "%xmm14"; 3668 3669 # ; hardcoded in GCM_COMPLETE 3670 3671 my $ZTMP0 = "%zmm0"; 3672 my $ZTMP1 = "%zmm3"; 3673 my $ZTMP2 = "%zmm4"; 3674 my $ZTMP3 = "%zmm5"; 3675 my $ZTMP4 = "%zmm6"; 3676 my $ZTMP5 = "%zmm7"; 3677 my $ZTMP6 = "%zmm10"; 3678 my $ZTMP7 = "%zmm11"; 3679 my $ZTMP8 = "%zmm12"; 3680 my $ZTMP9 = "%zmm13"; 3681 my $ZTMP10 = "%zmm15"; 3682 my $ZTMP11 = "%zmm16"; 3683 my $ZTMP12 = "%zmm17"; 3684 3685 my $ZTMP13 = "%zmm19"; 3686 my $ZTMP14 = "%zmm20"; 3687 my $ZTMP15 = "%zmm21"; 3688 my $ZTMP16 = "%zmm30"; 3689 my $ZTMP17 = "%zmm31"; 3690 my $ZTMP18 = "%zmm1"; 3691 my $ZTMP19 = "%zmm18"; 3692 my $ZTMP20 = "%zmm8"; 3693 my $ZTMP21 = "%zmm22"; 3694 my $ZTMP22 = "%zmm23"; 3695 3696 my $GH = "%zmm24"; 3697 my $GL = "%zmm25"; 3698 my $GM = "%zmm26"; 3699 my $SHUF_MASK = "%zmm29"; 3700 3701 # ; Unused in the small packet path 3702 my $ADDBE_4x4 = "%zmm27"; 3703 my $ADDBE_1234 = "%zmm28"; 3704 3705 my $MASKREG = "%k1"; 3706 3707 my $label_suffix = $label_count++; 3708 3709 # ;; reduction every 48 blocks, depth 32 blocks 3710 # ;; @note 48 blocks is the maximum capacity of the stack frame 3711 my $big_loop_nblocks = 48; 3712 my $big_loop_depth = 32; 3713 3714 # ;;; Macro flow depending on packet size 3715 # ;;; - LENGTH <= 16 blocks 3716 # ;;; - cipher followed by hashing (reduction) 3717 # ;;; - 16 blocks < LENGTH < 32 blocks 3718 # ;;; - cipher 16 blocks 3719 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) 3720 # ;;; - 32 blocks < LENGTH < 48 blocks 3721 # ;;; - cipher 2 x 16 blocks 3722 # ;;; - hash 16 blocks 3723 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) 3724 # ;;; - LENGTH >= 48 blocks 3725 # ;;; - cipher 2 x 16 blocks 3726 # ;;; - while (data_to_cipher >= 48 blocks): 3727 # ;;; - cipher 16 blocks & hash 16 blocks 3728 # ;;; - cipher 16 blocks & hash 16 blocks 3729 # ;;; - cipher 16 blocks & hash 16 blocks (reduction) 3730 # ;;; - if (data_to_cipher >= 32 blocks): 3731 # ;;; - cipher 16 blocks & hash 16 blocks 3732 # ;;; - cipher 16 blocks & hash 16 blocks 3733 # ;;; - hash 16 blocks (reduction) 3734 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) 3735 # ;;; - elif (data_to_cipher >= 16 blocks): 3736 # ;;; - cipher 16 blocks & hash 16 blocks 3737 # ;;; - hash 16 blocks 3738 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) 3739 # ;;; - else: 3740 # ;;; - hash 16 blocks 3741 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) 3742 3743 if ($win64) { 3744 $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n"; 3745 } else { 3746 $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n"; 3747 } 3748 $code .= "je .L_enc_dec_done_${label_suffix}\n"; 3749 3750 # Length value from context $CTX_OFFSET_InLen`($GCM128_CTX) is updated in 3751 # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc' 3752 3753 $code .= "xor $HKEYS_READY, $HKEYS_READY\n"; 3754 $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n"; 3755 3756 # ;; Used for the update flow - if there was a previous partial 3757 # ;; block fill the remaining bytes here. 3758 &PARTIAL_BLOCK( 3759 $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, 3760 $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1, 3761 $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, 3762 $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG); 3763 3764 $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n"; 3765 3766 # ;; Save the amount of data left to process in $LENGTH 3767 # ;; NOTE: PLAIN_CIPH_LEN is a register on linux; 3768 if ($win64) { 3769 $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n"; 3770 } 3771 3772 # ;; There may be no more data if it was consumed in the partial block. 3773 $code .= <<___; 3774 sub $DATA_OFFSET,$LENGTH 3775 je .L_enc_dec_done_${label_suffix} 3776___ 3777 3778 $code .= <<___; 3779 cmp \$`(16 * 16)`,$LENGTH 3780 jbe .L_message_below_equal_16_blocks_${label_suffix} 3781 3782 vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK 3783 vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4 3784 vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234 3785 3786 # ;; start the pipeline 3787 # ;; - 32 blocks aes-ctr 3788 # ;; - 16 blocks ghash + aes-ctr 3789 3790 # ;; set up CTR_CHECK 3791 vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]} 3792 and \$255,@{[DWORD($CTR_CHECK)]} 3793 # ;; in LE format after init, convert to BE 3794 vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz 3795 vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz 3796___ 3797 3798 # ;; ==== AES-CTR - first 16 blocks 3799 my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 3800 my $data_in_out_offset = 0; 3801 &INITIAL_BLOCKS_16( 3802 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz, 3803 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2, 3804 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, 3805 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0); 3806 3807 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 3808 "first16"); 3809 3810 $code .= <<___; 3811 cmp \$`(32 * 16)`,$LENGTH 3812 jb .L_message_below_32_blocks_${label_suffix} 3813___ 3814 3815 # ;; ==== AES-CTR - next 16 blocks 3816 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); 3817 $data_in_out_offset = (16 * 16); 3818 &INITIAL_BLOCKS_16( 3819 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz, 3820 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2, 3821 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, 3822 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0); 3823 3824 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 3825 "last32"); 3826 $code .= "mov \$1,$HKEYS_READY\n"; 3827 3828 $code .= <<___; 3829 add \$`(32 * 16)`,$DATA_OFFSET 3830 sub \$`(32 * 16)`,$LENGTH 3831 3832 cmp \$`($big_loop_nblocks * 16)`,$LENGTH 3833 jb .L_no_more_big_nblocks_${label_suffix} 3834___ 3835 3836 # ;; ==== 3837 # ;; ==== AES-CTR + GHASH - 48 blocks loop 3838 # ;; ==== 3839 $code .= ".L_encrypt_big_nblocks_${label_suffix}:\n"; 3840 3841 # ;; ==== AES-CTR + GHASH - 16 blocks, start 3842 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); 3843 $data_in_out_offset = (0 * 16); 3844 my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 3845 &GHASH_16_ENCRYPT_16_PARALLEL( 3846 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, 3847 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, 3848 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 3849 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 3850 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, 3851 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, 3852 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz, 3853 $IA0); 3854 3855 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction 3856 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 3857 $data_in_out_offset = (16 * 16); 3858 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); 3859 &GHASH_16_ENCRYPT_16_PARALLEL( 3860 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, 3861 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, 3862 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 3863 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 3864 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, 3865 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, 3866 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in", 3867 $IA0); 3868 3869 # ;; ==== AES-CTR + GHASH - 16 blocks, reduction 3870 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); 3871 $data_in_out_offset = (32 * 16); 3872 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); 3873 &GHASH_16_ENCRYPT_16_PARALLEL( 3874 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, 3875 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, 3876 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 3877 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 3878 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, 3879 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, 3880 $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in", 3881 $IA0); 3882 3883 # ;; === xor cipher block 0 with GHASH (ZT4) 3884 $code .= <<___; 3885 vmovdqa64 $ZTMP4,$AAD_HASHz 3886 3887 add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET 3888 sub \$`($big_loop_nblocks * 16)`,$LENGTH 3889 cmp \$`($big_loop_nblocks * 16)`,$LENGTH 3890 jae .L_encrypt_big_nblocks_${label_suffix} 3891 3892.L_no_more_big_nblocks_${label_suffix}: 3893 3894 cmp \$`(32 * 16)`,$LENGTH 3895 jae .L_encrypt_32_blocks_${label_suffix} 3896 3897 cmp \$`(16 * 16)`,$LENGTH 3898 jae .L_encrypt_16_blocks_${label_suffix} 3899___ 3900 3901 # ;; ===================================================== 3902 # ;; ===================================================== 3903 # ;; ==== GHASH 1 x 16 blocks 3904 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks 3905 # ;; ==== then GHASH N blocks 3906 $code .= ".L_encrypt_0_blocks_ghash_32_${label_suffix}:\n"; 3907 3908 # ;; calculate offset to the right hash key 3909 $code .= <<___; 3910mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]} 3911and \$~15,@{[DWORD($IA0)]} 3912mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]} 3913sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]} 3914___ 3915 3916 # ;; ==== GHASH 32 blocks and follow with reduction 3917 &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16), 3918 "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9); 3919 3920 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder 3921 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); 3922 $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n"; 3923 &GCM_ENC_DEC_LAST( 3924 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH, 3925 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0, 3926 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 3927 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, 3928 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, 3929 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, 3930 "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz, 3931 $IA0, $IA5, $MASKREG, $PBLOCK_LEN); 3932 3933 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; 3934 $code .= "jmp .L_ghash_done_${label_suffix}\n"; 3935 3936 # ;; ===================================================== 3937 # ;; ===================================================== 3938 # ;; ==== GHASH & encrypt 1 x 16 blocks 3939 # ;; ==== GHASH & encrypt 1 x 16 blocks 3940 # ;; ==== GHASH 1 x 16 blocks (reduction) 3941 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks 3942 # ;; ==== then GHASH N blocks 3943 $code .= ".L_encrypt_32_blocks_${label_suffix}:\n"; 3944 3945 # ;; ==== AES-CTR + GHASH - 16 blocks, start 3946 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); 3947 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 3948 $data_in_out_offset = (0 * 16); 3949 &GHASH_16_ENCRYPT_16_PARALLEL( 3950 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, 3951 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, 3952 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 3953 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 3954 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, 3955 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, 3956 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz, 3957 $IA0); 3958 3959 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction 3960 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 3961 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16)); 3962 $data_in_out_offset = (16 * 16); 3963 &GHASH_16_ENCRYPT_16_PARALLEL( 3964 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, 3965 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, 3966 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 3967 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 3968 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, 3969 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, 3970 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in", 3971 $IA0); 3972 3973 # ;; ==== GHASH 16 blocks with reduction 3974 &GHASH_16( 3975 "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16), 3976 "%rsp", &HashKeyOffsetByIdx(16, "frame"), 3977 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9); 3978 3979 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder 3980 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 3981 $code .= <<___; 3982 sub \$`(32 * 16)`,$LENGTH 3983 add \$`(32 * 16)`,$DATA_OFFSET 3984___ 3985 3986 # ;; calculate offset to the right hash key 3987 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n"; 3988 $code .= <<___; 3989 and \$~15,@{[DWORD($IA0)]} 3990 mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]} 3991 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]} 3992___ 3993 &GCM_ENC_DEC_LAST( 3994 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH, 3995 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0, 3996 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 3997 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, 3998 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, 3999 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, 4000 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz, 4001 $IA0, $IA5, $MASKREG, $PBLOCK_LEN); 4002 4003 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; 4004 $code .= "jmp .L_ghash_done_${label_suffix}\n"; 4005 4006 # ;; ===================================================== 4007 # ;; ===================================================== 4008 # ;; ==== GHASH & encrypt 16 blocks (done before) 4009 # ;; ==== GHASH 1 x 16 blocks 4010 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks 4011 # ;; ==== then GHASH N blocks 4012 $code .= ".L_encrypt_16_blocks_${label_suffix}:\n"; 4013 4014 # ;; ==== AES-CTR + GHASH - 16 blocks, start 4015 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); 4016 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 4017 $data_in_out_offset = (0 * 16); 4018 &GHASH_16_ENCRYPT_16_PARALLEL( 4019 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK, 4020 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1, 4021 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, 4022 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13, 4023 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19, 4024 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL, 4025 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz, 4026 $IA0); 4027 4028 # ;; ==== GHASH 1 x 16 blocks 4029 &GHASH_16( 4030 "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16), 4031 "%rsp", &HashKeyOffsetByIdx(32, "frame"), 4032 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9); 4033 4034 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder 4035 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16)); 4036 $code .= <<___; 4037 sub \$`(16 * 16)`,$LENGTH 4038 add \$`(16 * 16)`,$DATA_OFFSET 4039___ 4040 &GCM_ENC_DEC_LAST( 4041 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, 4042 $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK, 4043 &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0, 4044 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, 4045 $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, 4046 $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, 4047 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, 4048 $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20, 4049 $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, 4050 "end_reduce", $GL, $GH, $GM, 4051 $ENC_DEC, $AAD_HASHz, $IA0, $IA5, 4052 $MASKREG, $PBLOCK_LEN); 4053 4054 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; 4055 $code .= <<___; 4056 jmp .L_ghash_done_${label_suffix} 4057 4058.L_message_below_32_blocks_${label_suffix}: 4059 # ;; 32 > number of blocks > 16 4060 4061 sub \$`(16 * 16)`,$LENGTH 4062 add \$`(16 * 16)`,$DATA_OFFSET 4063___ 4064 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); 4065 4066 # ;; calculate offset to the right hash key 4067 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n"; 4068 4069 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 4070 "mid16"); 4071 $code .= "mov \$1,$HKEYS_READY\n"; 4072 4073 $code .= <<___; 4074and \$~15,@{[DWORD($IA0)]} 4075mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]} 4076sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]} 4077___ 4078 4079 &GCM_ENC_DEC_LAST( 4080 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH, 4081 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0, 4082 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 4083 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, 4084 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, 4085 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, 4086 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz, 4087 $IA0, $IA5, $MASKREG, $PBLOCK_LEN); 4088 4089 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n"; 4090 $code .= <<___; 4091 jmp .L_ghash_done_${label_suffix} 4092 4093.L_message_below_equal_16_blocks_${label_suffix}: 4094 # ;; Determine how many blocks to process 4095 # ;; - process one additional block if there is a partial block 4096 mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]} 4097 add \$15,@{[DWORD($IA1)]} 4098 shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 0 to 16 4099___ 4100 &GCM_ENC_DEC_SMALL( 4101 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC, 4102 $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0, 4103 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, 4104 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, 4105 $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK, 4106 $PBLOCK_LEN); 4107 4108 # ;; fall through to exit 4109 4110 $code .= ".L_ghash_done_${label_suffix}:\n"; 4111 4112 # ;; save the last counter block 4113 $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n"; 4114 $code .= <<___; 4115 vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX) 4116.L_enc_dec_done_${label_suffix}: 4117___ 4118} 4119 4120# ;;; =========================================================================== 4121# ;;; Encrypt/decrypt the initial 16 blocks 4122sub INITIAL_BLOCKS_16 { 4123 my $IN = $_[0]; # [in] input buffer 4124 my $OUT = $_[1]; # [in] output buffer 4125 my $AES_KEYS = $_[2]; # [in] pointer to expanded keys 4126 my $DATA_OFFSET = $_[3]; # [in] data offset 4127 my $GHASH = $_[4]; # [in] ZMM with AAD (low 128 bits) 4128 my $CTR = $_[5]; # [in] ZMM with CTR BE blocks 4x128 bits 4129 my $CTR_CHECK = $_[6]; # [in/out] GPR with counter overflow check 4130 my $ADDBE_4x4 = $_[7]; # [in] ZMM 4x128bits with value 4 (big endian) 4131 my $ADDBE_1234 = $_[8]; # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) 4132 my $T0 = $_[9]; # [clobered] temporary ZMM register 4133 my $T1 = $_[10]; # [clobered] temporary ZMM register 4134 my $T2 = $_[11]; # [clobered] temporary ZMM register 4135 my $T3 = $_[12]; # [clobered] temporary ZMM register 4136 my $T4 = $_[13]; # [clobered] temporary ZMM register 4137 my $T5 = $_[14]; # [clobered] temporary ZMM register 4138 my $T6 = $_[15]; # [clobered] temporary ZMM register 4139 my $T7 = $_[16]; # [clobered] temporary ZMM register 4140 my $T8 = $_[17]; # [clobered] temporary ZMM register 4141 my $SHUF_MASK = $_[18]; # [in] ZMM with BE/LE shuffle mask 4142 my $ENC_DEC = $_[19]; # [in] ENC (encrypt) or DEC (decrypt) selector 4143 my $BLK_OFFSET = $_[20]; # [in] stack frame offset to ciphered blocks 4144 my $DATA_DISPL = $_[21]; # [in] fixed numerical data displacement/offset 4145 my $IA0 = $_[22]; # [clobered] temporary GP register 4146 4147 my $B00_03 = $T5; 4148 my $B04_07 = $T6; 4149 my $B08_11 = $T7; 4150 my $B12_15 = $T8; 4151 4152 my $label_suffix = $label_count++; 4153 4154 my $stack_offset = $BLK_OFFSET; 4155 $code .= <<___; 4156 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4157 # ;; prepare counter blocks 4158 4159 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]} 4160 jae .L_next_16_overflow_${label_suffix} 4161 vpaddd $ADDBE_1234,$CTR,$B00_03 4162 vpaddd $ADDBE_4x4,$B00_03,$B04_07 4163 vpaddd $ADDBE_4x4,$B04_07,$B08_11 4164 vpaddd $ADDBE_4x4,$B08_11,$B12_15 4165 jmp .L_next_16_ok_${label_suffix} 4166.L_next_16_overflow_${label_suffix}: 4167 vpshufb $SHUF_MASK,$CTR,$CTR 4168 vmovdqa64 ddq_add_4444(%rip),$B12_15 4169 vpaddd ddq_add_1234(%rip),$CTR,$B00_03 4170 vpaddd $B12_15,$B00_03,$B04_07 4171 vpaddd $B12_15,$B04_07,$B08_11 4172 vpaddd $B12_15,$B08_11,$B12_15 4173 vpshufb $SHUF_MASK,$B00_03,$B00_03 4174 vpshufb $SHUF_MASK,$B04_07,$B04_07 4175 vpshufb $SHUF_MASK,$B08_11,$B08_11 4176 vpshufb $SHUF_MASK,$B12_15,$B12_15 4177.L_next_16_ok_${label_suffix}: 4178 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR 4179 addb \$16,@{[BYTE($CTR_CHECK)]} 4180 # ;; === load 16 blocks of data 4181 vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0 4182 vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1 4183 vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2 4184 vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3 4185 4186 # ;; move to AES encryption rounds 4187 vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4 4188 vpxorq $T4,$B00_03,$B00_03 4189 vpxorq $T4,$B04_07,$B04_07 4190 vpxorq $T4,$B08_11,$B08_11 4191 vpxorq $T4,$B12_15,$B12_15 4192___ 4193 foreach (1 .. ($NROUNDS)) { 4194 $code .= <<___; 4195 vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4 4196 vaesenc $T4,$B00_03,$B00_03 4197 vaesenc $T4,$B04_07,$B04_07 4198 vaesenc $T4,$B08_11,$B08_11 4199 vaesenc $T4,$B12_15,$B12_15 4200___ 4201 } 4202 $code .= <<___; 4203 vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4 4204 vaesenclast $T4,$B00_03,$B00_03 4205 vaesenclast $T4,$B04_07,$B04_07 4206 vaesenclast $T4,$B08_11,$B08_11 4207 vaesenclast $T4,$B12_15,$B12_15 4208 4209 # ;; xor against text 4210 vpxorq $T0,$B00_03,$B00_03 4211 vpxorq $T1,$B04_07,$B04_07 4212 vpxorq $T2,$B08_11,$B08_11 4213 vpxorq $T3,$B12_15,$B12_15 4214 4215 # ;; store 4216 mov $OUT, $IA0 4217 vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1) 4218 vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1) 4219 vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1) 4220 vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1) 4221___ 4222 if ($ENC_DEC eq "DEC") { 4223 $code .= <<___; 4224 # ;; decryption - cipher text needs to go to GHASH phase 4225 vpshufb $SHUF_MASK,$T0,$B00_03 4226 vpshufb $SHUF_MASK,$T1,$B04_07 4227 vpshufb $SHUF_MASK,$T2,$B08_11 4228 vpshufb $SHUF_MASK,$T3,$B12_15 4229___ 4230 } else { 4231 $code .= <<___; 4232 # ;; encryption 4233 vpshufb $SHUF_MASK,$B00_03,$B00_03 4234 vpshufb $SHUF_MASK,$B04_07,$B04_07 4235 vpshufb $SHUF_MASK,$B08_11,$B08_11 4236 vpshufb $SHUF_MASK,$B12_15,$B12_15 4237___ 4238 } 4239 4240 if ($GHASH ne "no_ghash") { 4241 $code .= <<___; 4242 # ;; === xor cipher block 0 with GHASH for the next GHASH round 4243 vpxorq $GHASH,$B00_03,$B00_03 4244___ 4245 } 4246 $code .= <<___; 4247 vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp) 4248 vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp) 4249 vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp) 4250 vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp) 4251___ 4252} 4253 4254# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4255# ; GCM_COMPLETE Finishes ghash calculation 4256# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4257sub GCM_COMPLETE { 4258 my $GCM128_CTX = $_[0]; 4259 my $PBLOCK_LEN = $_[1]; 4260 4261 my $label_suffix = $label_count++; 4262 4263 $code .= <<___; 4264 vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2 4265 vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0) 4266___ 4267 4268 $code .= <<___; 4269 vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4 4270 4271 # ;; Process the final partial block. 4272 cmp \$0,$PBLOCK_LEN 4273 je .L_partial_done_${label_suffix} 4274___ 4275 4276 # ;GHASH computation for the last <16 Byte block 4277 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17"); 4278 4279 $code .= <<___; 4280.L_partial_done_${label_suffix}: 4281 vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5 4282 vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C) 4283 vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits 4284 4285 vpxor %xmm5,%xmm4,%xmm4 4286___ 4287 4288 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17"); 4289 4290 $code .= <<___; 4291 vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16Byte swap 4292 vpxor %xmm4,%xmm3,%xmm3 4293 4294.L_return_T_${label_suffix}: 4295 vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX) 4296___ 4297} 4298 4299# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4300# ;;; Functions definitions 4301# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4302 4303$code .= ".text\n"; 4304{ 4305 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4306 # ;void ossl_aes_gcm_init_avx512 / 4307 # ; (const void *aes_keys, 4308 # ; void *gcm128ctx) 4309 # ; 4310 # ; Precomputes hashkey table for GHASH optimization. 4311 # ; Leaf function (does not allocate stack space, does not use non-volatile registers). 4312 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4313 $code .= <<___; 4314.globl ossl_aes_gcm_init_avx512 4315.type ossl_aes_gcm_init_avx512,\@abi-omnipotent 4316.align 32 4317ossl_aes_gcm_init_avx512: 4318.cfi_startproc 4319 endbranch 4320___ 4321 if ($CHECK_FUNCTION_ARGUMENTS) { 4322 $code .= <<___; 4323 # ;; Check aes_keys != NULL 4324 test $arg1,$arg1 4325 jz .Labort_init 4326 4327 # ;; Check gcm128ctx != NULL 4328 test $arg2,$arg2 4329 jz .Labort_init 4330___ 4331 } 4332 $code .= "vpxorq %xmm16,%xmm16,%xmm16\n"; 4333 &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey 4334 $code .= <<___; 4335 vpshufb SHUF_MASK(%rip),%xmm16,%xmm16 4336 # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;; 4337 vmovdqa64 %xmm16,%xmm2 4338 vpsllq \$1,%xmm16,%xmm16 4339 vpsrlq \$63,%xmm2,%xmm2 4340 vmovdqa %xmm2,%xmm1 4341 vpslldq \$8,%xmm2,%xmm2 4342 vpsrldq \$8,%xmm1,%xmm1 4343 vporq %xmm2,%xmm16,%xmm16 4344 # ;reduction 4345 vpshufd \$0b00100100,%xmm1,%xmm2 4346 vpcmpeqd TWOONE(%rip),%xmm2,%xmm2 4347 vpand POLY(%rip),%xmm2,%xmm2 4348 vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly 4349 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4350 vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly 4351___ 4352 &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); 4353 if ($CLEAR_SCRATCH_REGISTERS) { 4354 &clear_scratch_gps_asm(); 4355 &clear_scratch_zmms_asm(); 4356 } else { 4357 $code .= "vzeroupper\n"; 4358 } 4359 $code .= <<___; 4360.Labort_init: 4361ret 4362.cfi_endproc 4363.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 4364___ 4365} 4366 4367# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4368# ;void ossl_aes_gcm_setiv_avx512 4369# ; (const void *aes_keys, 4370# ; void *gcm128ctx, 4371# ; const unsigned char *iv, 4372# ; size_t ivlen) 4373# ; 4374# ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure. 4375# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4376$code .= <<___; 4377.globl ossl_aes_gcm_setiv_avx512 4378.type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent 4379.align 32 4380ossl_aes_gcm_setiv_avx512: 4381.cfi_startproc 4382.Lsetiv_seh_begin: 4383 endbranch 4384___ 4385if ($CHECK_FUNCTION_ARGUMENTS) { 4386 $code .= <<___; 4387 # ;; Check aes_keys != NULL 4388 test $arg1,$arg1 4389 jz .Labort_setiv 4390 4391 # ;; Check gcm128ctx != NULL 4392 test $arg2,$arg2 4393 jz .Labort_setiv 4394 4395 # ;; Check iv != NULL 4396 test $arg3,$arg3 4397 jz .Labort_setiv 4398 4399 # ;; Check ivlen != 0 4400 test $arg4,$arg4 4401 jz .Labort_setiv 4402___ 4403} 4404 4405# ; NOTE: code before PROLOG() must not modify any registers 4406&PROLOG( 4407 1, # allocate stack space for hkeys 4408 0, # do not allocate stack space for AES blocks 4409 "setiv"); 4410&GCM_INIT_IV( 4411 "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1", 4412 "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", 4413 "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19"); 4414&EPILOG( 4415 1, # hkeys were allocated 4416 $arg4); 4417$code .= <<___; 4418.Labort_setiv: 4419ret 4420.Lsetiv_seh_end: 4421.cfi_endproc 4422.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512 4423___ 4424 4425# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4426# ;void ossl_aes_gcm_update_aad_avx512 4427# ; (unsigned char *gcm128ctx, 4428# ; const unsigned char *aad, 4429# ; size_t aadlen) 4430# ; 4431# ; Updates AAD hash in gcm128_context structure. 4432# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4433$code .= <<___; 4434.globl ossl_aes_gcm_update_aad_avx512 4435.type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent 4436.align 32 4437ossl_aes_gcm_update_aad_avx512: 4438.cfi_startproc 4439.Lghash_seh_begin: 4440 endbranch 4441___ 4442if ($CHECK_FUNCTION_ARGUMENTS) { 4443 $code .= <<___; 4444 # ;; Check gcm128ctx != NULL 4445 test $arg1,$arg1 4446 jz .Lexit_update_aad 4447 4448 # ;; Check aad != NULL 4449 test $arg2,$arg2 4450 jz .Lexit_update_aad 4451 4452 # ;; Check aadlen != 0 4453 test $arg3,$arg3 4454 jz .Lexit_update_aad 4455___ 4456} 4457 4458# ; NOTE: code before PROLOG() must not modify any registers 4459&PROLOG( 4460 1, # allocate stack space for hkeys, 4461 0, # do not allocate stack space for AES blocks 4462 "ghash"); 4463&GCM_UPDATE_AAD( 4464 "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11", 4465 "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13", 4466 "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19"); 4467&EPILOG( 4468 1, # hkeys were allocated 4469 $arg3); 4470$code .= <<___; 4471.Lexit_update_aad: 4472ret 4473.Lghash_seh_end: 4474.cfi_endproc 4475.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512 4476___ 4477 4478# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4479# ;void ossl_aes_gcm_encrypt_avx512 4480# ; (const void* aes_keys, 4481# ; void *gcm128ctx, 4482# ; unsigned int *pblocklen, 4483# ; const unsigned char *in, 4484# ; size_t len, 4485# ; unsigned char *out); 4486# ; 4487# ; Performs encryption of data |in| of len |len|, and stores the output in |out|. 4488# ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|. 4489# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4490$code .= <<___; 4491.globl ossl_aes_gcm_encrypt_avx512 4492.type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent 4493.align 32 4494ossl_aes_gcm_encrypt_avx512: 4495.cfi_startproc 4496.Lencrypt_seh_begin: 4497 endbranch 4498___ 4499 4500# ; NOTE: code before PROLOG() must not modify any registers 4501&PROLOG( 4502 1, # allocate stack space for hkeys 4503 1, # allocate stack space for AES blocks 4504 "encrypt"); 4505if ($CHECK_FUNCTION_ARGUMENTS) { 4506 $code .= <<___; 4507 # ;; Check aes_keys != NULL 4508 test $arg1,$arg1 4509 jz .Lexit_gcm_encrypt 4510 4511 # ;; Check gcm128ctx != NULL 4512 test $arg2,$arg2 4513 jz .Lexit_gcm_encrypt 4514 4515 # ;; Check pblocklen != NULL 4516 test $arg3,$arg3 4517 jz .Lexit_gcm_encrypt 4518 4519 # ;; Check in != NULL 4520 test $arg4,$arg4 4521 jz .Lexit_gcm_encrypt 4522 4523 # ;; Check if len != 0 4524 cmp \$0,$arg5 4525 jz .Lexit_gcm_encrypt 4526 4527 # ;; Check out != NULL 4528 cmp \$0,$arg6 4529 jz .Lexit_gcm_encrypt 4530___ 4531} 4532$code .= <<___; 4533 # ; load number of rounds from AES_KEY structure (offset in bytes is 4534 # ; size of the |rd_key| buffer) 4535 mov `4*15*4`($arg1),%eax 4536 cmp \$9,%eax 4537 je .Laes_gcm_encrypt_128_avx512 4538 cmp \$11,%eax 4539 je .Laes_gcm_encrypt_192_avx512 4540 cmp \$13,%eax 4541 je .Laes_gcm_encrypt_256_avx512 4542 xor %eax,%eax 4543 jmp .Lexit_gcm_encrypt 4544___ 4545for my $keylen (sort keys %aes_rounds) { 4546 $NROUNDS = $aes_rounds{$keylen}; 4547 $code .= <<___; 4548.align 32 4549.Laes_gcm_encrypt_${keylen}_avx512: 4550___ 4551 &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC"); 4552 $code .= "jmp .Lexit_gcm_encrypt\n"; 4553} 4554$code .= ".Lexit_gcm_encrypt:\n"; 4555&EPILOG(1, $arg5); 4556$code .= <<___; 4557ret 4558.Lencrypt_seh_end: 4559.cfi_endproc 4560.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512 4561___ 4562 4563# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4564# ;void ossl_aes_gcm_decrypt_avx512 4565# ; (const void* keys, 4566# ; void *gcm128ctx, 4567# ; unsigned int *pblocklen, 4568# ; const unsigned char *in, 4569# ; size_t len, 4570# ; unsigned char *out); 4571# ; 4572# ; Performs decryption of data |in| of len |len|, and stores the output in |out|. 4573# ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|. 4574# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4575$code .= <<___; 4576.globl ossl_aes_gcm_decrypt_avx512 4577.type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent 4578.align 32 4579ossl_aes_gcm_decrypt_avx512: 4580.cfi_startproc 4581.Ldecrypt_seh_begin: 4582 endbranch 4583___ 4584 4585# ; NOTE: code before PROLOG() must not modify any registers 4586&PROLOG( 4587 1, # allocate stack space for hkeys 4588 1, # allocate stack space for AES blocks 4589 "decrypt"); 4590if ($CHECK_FUNCTION_ARGUMENTS) { 4591 $code .= <<___; 4592 # ;; Check keys != NULL 4593 test $arg1,$arg1 4594 jz .Lexit_gcm_decrypt 4595 4596 # ;; Check gcm128ctx != NULL 4597 test $arg2,$arg2 4598 jz .Lexit_gcm_decrypt 4599 4600 # ;; Check pblocklen != NULL 4601 test $arg3,$arg3 4602 jz .Lexit_gcm_decrypt 4603 4604 # ;; Check in != NULL 4605 test $arg4,$arg4 4606 jz .Lexit_gcm_decrypt 4607 4608 # ;; Check if len != 0 4609 cmp \$0,$arg5 4610 jz .Lexit_gcm_decrypt 4611 4612 # ;; Check out != NULL 4613 cmp \$0,$arg6 4614 jz .Lexit_gcm_decrypt 4615___ 4616} 4617$code .= <<___; 4618 # ; load number of rounds from AES_KEY structure (offset in bytes is 4619 # ; size of the |rd_key| buffer) 4620 mov `4*15*4`($arg1),%eax 4621 cmp \$9,%eax 4622 je .Laes_gcm_decrypt_128_avx512 4623 cmp \$11,%eax 4624 je .Laes_gcm_decrypt_192_avx512 4625 cmp \$13,%eax 4626 je .Laes_gcm_decrypt_256_avx512 4627 xor %eax,%eax 4628 jmp .Lexit_gcm_decrypt 4629___ 4630for my $keylen (sort keys %aes_rounds) { 4631 $NROUNDS = $aes_rounds{$keylen}; 4632 $code .= <<___; 4633.align 32 4634.Laes_gcm_decrypt_${keylen}_avx512: 4635___ 4636 &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC"); 4637 $code .= "jmp .Lexit_gcm_decrypt\n"; 4638} 4639$code .= ".Lexit_gcm_decrypt:\n"; 4640&EPILOG(1, $arg5); 4641$code .= <<___; 4642ret 4643.Ldecrypt_seh_end: 4644.cfi_endproc 4645.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512 4646___ 4647 4648# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4649# ;void ossl_aes_gcm_finalize_vaes_avx512 4650# ; (void *gcm128ctx, 4651# ; unsigned int pblocklen); 4652# ; 4653# ; Finalizes encryption / decryption 4654# ; Leaf function (does not allocate stack space, does not use non-volatile registers). 4655# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4656$code .= <<___; 4657.globl ossl_aes_gcm_finalize_avx512 4658.type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent 4659.align 32 4660ossl_aes_gcm_finalize_avx512: 4661.cfi_startproc 4662 endbranch 4663___ 4664if ($CHECK_FUNCTION_ARGUMENTS) { 4665 $code .= <<___; 4666 # ;; Check gcm128ctx != NULL 4667 test $arg1,$arg1 4668 jz .Labort_finalize 4669___ 4670} 4671 4672&GCM_COMPLETE("$arg1", "$arg2"); 4673 4674$code .= <<___; 4675.Labort_finalize: 4676ret 4677.cfi_endproc 4678.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512 4679___ 4680 4681# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4682# ;void ossl_gcm_gmult_avx512(u64 Xi[2], 4683# ; const void* gcm128ctx) 4684# ; 4685# ; Leaf function (does not allocate stack space, does not use non-volatile registers). 4686# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4687$code .= <<___; 4688.globl ossl_gcm_gmult_avx512 4689.hidden ossl_gcm_gmult_avx512 4690.type ossl_gcm_gmult_avx512,\@abi-omnipotent 4691.align 32 4692ossl_gcm_gmult_avx512: 4693.cfi_startproc 4694 endbranch 4695___ 4696if ($CHECK_FUNCTION_ARGUMENTS) { 4697 $code .= <<___; 4698 # ;; Check Xi != NULL 4699 test $arg1,$arg1 4700 jz .Labort_gmult 4701 4702 # ;; Check gcm128ctx != NULL 4703 test $arg2,$arg2 4704 jz .Labort_gmult 4705___ 4706} 4707$code .= "vmovdqu64 ($arg1),%xmm1\n"; 4708$code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n"; 4709 4710&GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5"); 4711 4712$code .= "vmovdqu64 %xmm1,($arg1)\n"; 4713if ($CLEAR_SCRATCH_REGISTERS) { 4714 &clear_scratch_gps_asm(); 4715 &clear_scratch_zmms_asm(); 4716} else { 4717 $code .= "vzeroupper\n"; 4718} 4719$code .= <<___; 4720.Labort_gmult: 4721ret 4722.cfi_endproc 4723.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512 4724___ 4725 4726if ($win64) { 4727 4728 # Add unwind metadata for SEH. 4729 4730 # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160 4731 my $UWOP_PUSH_NONVOL = 0; 4732 my $UWOP_ALLOC_LARGE = 1; 4733 my $UWOP_SET_FPREG = 3; 4734 my $UWOP_SAVE_XMM128 = 8; 4735 my %UWOP_REG_NUMBER = ( 4736 rax => 0, 4737 rcx => 1, 4738 rdx => 2, 4739 rbx => 3, 4740 rsp => 4, 4741 rbp => 5, 4742 rsi => 6, 4743 rdi => 7, 4744 map(("r$_" => $_), (8 .. 15))); 4745 4746 $code .= <<___; 4747.section .pdata 4748.align 4 4749 .rva .Lsetiv_seh_begin 4750 .rva .Lsetiv_seh_end 4751 .rva .Lsetiv_seh_info 4752 4753 .rva .Lghash_seh_begin 4754 .rva .Lghash_seh_end 4755 .rva .Lghash_seh_info 4756 4757 .rva .Lencrypt_seh_begin 4758 .rva .Lencrypt_seh_end 4759 .rva .Lencrypt_seh_info 4760 4761 .rva .Ldecrypt_seh_begin 4762 .rva .Ldecrypt_seh_end 4763 .rva .Ldecrypt_seh_info 4764 4765.section .xdata 4766___ 4767 4768 foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") { 4769 $code .= <<___; 4770.align 8 4771.L${func_name}_seh_info: 4772 .byte 1 # version 1, no flags 4773 .byte .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin 4774 .byte 31 # num_slots = 1*8 + 2 + 1 + 2*10 4775 # FR = rbp; Offset from RSP = $XMM_STORAGE scaled on 16 4776 .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]} 4777___ 4778 4779 # Metadata for %xmm15-%xmm6 4780 # Occupy 2 slots each 4781 for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) { 4782 4783 # Scaled-by-16 stack offset 4784 my $xmm_reg_offset = ($reg_idx - 6); 4785 $code .= <<___; 4786 .byte .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin 4787 .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]} 4788 .value $xmm_reg_offset 4789___ 4790 } 4791 4792 $code .= <<___; 4793 # Frame pointer (occupy 1 slot) 4794 .byte .L${func_name}_seh_setfp-.L${func_name}_seh_begin 4795 .byte $UWOP_SET_FPREG 4796 4797 # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes 4798 .byte .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin 4799 .byte $UWOP_ALLOC_LARGE 4800 .value `($XMM_STORAGE + 8) / 8` 4801___ 4802 4803 # Metadata for GPR regs 4804 # Occupy 1 slot each 4805 foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") { 4806 $code .= <<___; 4807 .byte .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin 4808 .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]} 4809___ 4810 } 4811 } 4812} 4813 4814$code .= <<___; 4815.section .rodata align=16 4816.align 16 4817POLY: .quad 0x0000000000000001, 0xC200000000000000 4818 4819.align 64 4820POLY2: 4821 .quad 0x00000001C2000000, 0xC200000000000000 4822 .quad 0x00000001C2000000, 0xC200000000000000 4823 .quad 0x00000001C2000000, 0xC200000000000000 4824 .quad 0x00000001C2000000, 0xC200000000000000 4825 4826.align 16 4827TWOONE: .quad 0x0000000000000001, 0x0000000100000000 4828 4829# ;;; Order of these constants should not change. 4830# ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F 4831.align 64 4832SHUF_MASK: 4833 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 4834 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 4835 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 4836 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607 4837 4838.align 16 4839SHIFT_MASK: 4840 .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 4841 4842ALL_F: 4843 .quad 0xffffffffffffffff, 0xffffffffffffffff 4844 4845ZERO: 4846 .quad 0x0000000000000000, 0x0000000000000000 4847 4848.align 16 4849ONE: 4850 .quad 0x0000000000000001, 0x0000000000000000 4851 4852.align 16 4853ONEf: 4854 .quad 0x0000000000000000, 0x0100000000000000 4855 4856.align 64 4857ddq_add_1234: 4858 .quad 0x0000000000000001, 0x0000000000000000 4859 .quad 0x0000000000000002, 0x0000000000000000 4860 .quad 0x0000000000000003, 0x0000000000000000 4861 .quad 0x0000000000000004, 0x0000000000000000 4862 4863.align 64 4864ddq_add_5678: 4865 .quad 0x0000000000000005, 0x0000000000000000 4866 .quad 0x0000000000000006, 0x0000000000000000 4867 .quad 0x0000000000000007, 0x0000000000000000 4868 .quad 0x0000000000000008, 0x0000000000000000 4869 4870.align 64 4871ddq_add_4444: 4872 .quad 0x0000000000000004, 0x0000000000000000 4873 .quad 0x0000000000000004, 0x0000000000000000 4874 .quad 0x0000000000000004, 0x0000000000000000 4875 .quad 0x0000000000000004, 0x0000000000000000 4876 4877.align 64 4878ddq_add_8888: 4879 .quad 0x0000000000000008, 0x0000000000000000 4880 .quad 0x0000000000000008, 0x0000000000000000 4881 .quad 0x0000000000000008, 0x0000000000000000 4882 .quad 0x0000000000000008, 0x0000000000000000 4883 4884.align 64 4885ddq_addbe_1234: 4886 .quad 0x0000000000000000, 0x0100000000000000 4887 .quad 0x0000000000000000, 0x0200000000000000 4888 .quad 0x0000000000000000, 0x0300000000000000 4889 .quad 0x0000000000000000, 0x0400000000000000 4890 4891.align 64 4892ddq_addbe_4444: 4893 .quad 0x0000000000000000, 0x0400000000000000 4894 .quad 0x0000000000000000, 0x0400000000000000 4895 .quad 0x0000000000000000, 0x0400000000000000 4896 .quad 0x0000000000000000, 0x0400000000000000 4897 4898.align 64 4899byte_len_to_mask_table: 4900 .value 0x0000, 0x0001, 0x0003, 0x0007 4901 .value 0x000f, 0x001f, 0x003f, 0x007f 4902 .value 0x00ff, 0x01ff, 0x03ff, 0x07ff 4903 .value 0x0fff, 0x1fff, 0x3fff, 0x7fff 4904 .value 0xffff 4905 4906.align 64 4907byte64_len_to_mask_table: 4908 .quad 0x0000000000000000, 0x0000000000000001 4909 .quad 0x0000000000000003, 0x0000000000000007 4910 .quad 0x000000000000000f, 0x000000000000001f 4911 .quad 0x000000000000003f, 0x000000000000007f 4912 .quad 0x00000000000000ff, 0x00000000000001ff 4913 .quad 0x00000000000003ff, 0x00000000000007ff 4914 .quad 0x0000000000000fff, 0x0000000000001fff 4915 .quad 0x0000000000003fff, 0x0000000000007fff 4916 .quad 0x000000000000ffff, 0x000000000001ffff 4917 .quad 0x000000000003ffff, 0x000000000007ffff 4918 .quad 0x00000000000fffff, 0x00000000001fffff 4919 .quad 0x00000000003fffff, 0x00000000007fffff 4920 .quad 0x0000000000ffffff, 0x0000000001ffffff 4921 .quad 0x0000000003ffffff, 0x0000000007ffffff 4922 .quad 0x000000000fffffff, 0x000000001fffffff 4923 .quad 0x000000003fffffff, 0x000000007fffffff 4924 .quad 0x00000000ffffffff, 0x00000001ffffffff 4925 .quad 0x00000003ffffffff, 0x00000007ffffffff 4926 .quad 0x0000000fffffffff, 0x0000001fffffffff 4927 .quad 0x0000003fffffffff, 0x0000007fffffffff 4928 .quad 0x000000ffffffffff, 0x000001ffffffffff 4929 .quad 0x000003ffffffffff, 0x000007ffffffffff 4930 .quad 0x00000fffffffffff, 0x00001fffffffffff 4931 .quad 0x00003fffffffffff, 0x00007fffffffffff 4932 .quad 0x0000ffffffffffff, 0x0001ffffffffffff 4933 .quad 0x0003ffffffffffff, 0x0007ffffffffffff 4934 .quad 0x000fffffffffffff, 0x001fffffffffffff 4935 .quad 0x003fffffffffffff, 0x007fffffffffffff 4936 .quad 0x00ffffffffffffff, 0x01ffffffffffffff 4937 .quad 0x03ffffffffffffff, 0x07ffffffffffffff 4938 .quad 0x0fffffffffffffff, 0x1fffffffffffffff 4939 .quad 0x3fffffffffffffff, 0x7fffffffffffffff 4940 .quad 0xffffffffffffffff 4941___ 4942 4943} else { 4944# Fallback for old assembler 4945$code .= <<___; 4946.text 4947.globl ossl_vaes_vpclmulqdq_capable 4948.type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent 4949ossl_vaes_vpclmulqdq_capable: 4950 xor %eax,%eax 4951 ret 4952.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable 4953 4954.globl ossl_aes_gcm_init_avx512 4955.globl ossl_aes_gcm_setiv_avx512 4956.globl ossl_aes_gcm_update_aad_avx512 4957.globl ossl_aes_gcm_encrypt_avx512 4958.globl ossl_aes_gcm_decrypt_avx512 4959.globl ossl_aes_gcm_finalize_avx512 4960.globl ossl_gcm_gmult_avx512 4961 4962.type ossl_aes_gcm_init_avx512,\@abi-omnipotent 4963ossl_aes_gcm_init_avx512: 4964ossl_aes_gcm_setiv_avx512: 4965ossl_aes_gcm_update_aad_avx512: 4966ossl_aes_gcm_encrypt_avx512: 4967ossl_aes_gcm_decrypt_avx512: 4968ossl_aes_gcm_finalize_avx512: 4969ossl_gcm_gmult_avx512: 4970 .byte 0x0f,0x0b # ud2 4971 ret 4972.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512 4973___ 4974} 4975 4976$code =~ s/\`([^\`]*)\`/eval $1/gem; 4977print $code; 4978close STDOUT or die "error closing STDOUT: $!"; 4979