1#! /usr/bin/env perl 2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# March, June 2010 18# 19# The module implements "4-bit" GCM GHASH function and underlying 20# single multiplication operation in GF(2^128). "4-bit" means that 21# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH 22# function features so called "528B" variant utilizing additional 23# 256+16 bytes of per-key storage [+512 bytes shared table]. 24# Performance results are for this streamed GHASH subroutine and are 25# expressed in cycles per processed byte, less is better: 26# 27# gcc 3.4.x(*) assembler 28# 29# P4 28.6 14.0 +100% 30# Opteron 19.3 7.7 +150% 31# Core2 17.8 8.1(**) +120% 32# Atom 31.6 16.8 +88% 33# VIA Nano 21.8 10.1 +115% 34# 35# (*) comparison is not completely fair, because C results are 36# for vanilla "256B" implementation, while assembler results 37# are for "528B";-) 38# (**) it's mystery [to me] why Core2 result is not same as for 39# Opteron; 40 41# May 2010 42# 43# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. 44# See ghash-x86.pl for background information and details about coding 45# techniques. 46# 47# Special thanks to David Woodhouse for providing access to a 48# Westmere-based system on behalf of Intel Open Source Technology Centre. 49 50# December 2012 51# 52# Overhaul: aggregate Karatsuba post-processing, improve ILP in 53# reduction_alg9, increase reduction aggregate factor to 4x. As for 54# the latter. ghash-x86.pl discusses that it makes lesser sense to 55# increase aggregate factor. Then why increase here? Critical path 56# consists of 3 independent pclmulqdq instructions, Karatsuba post- 57# processing and reduction. "On top" of this we lay down aggregated 58# multiplication operations, triplets of independent pclmulqdq's. As 59# issue rate for pclmulqdq is limited, it makes lesser sense to 60# aggregate more multiplications than it takes to perform remaining 61# non-multiplication operations. 2x is near-optimal coefficient for 62# contemporary Intel CPUs (therefore modest improvement coefficient), 63# but not for Bulldozer. Latter is because logical SIMD operations 64# are twice as slow in comparison to Intel, so that critical path is 65# longer. A CPU with higher pclmulqdq issue rate would also benefit 66# from higher aggregate factor... 67# 68# Westmere 1.78(+13%) 69# Sandy Bridge 1.80(+8%) 70# Ivy Bridge 1.80(+7%) 71# Haswell 0.55(+93%) (if system doesn't support AVX) 72# Broadwell 0.45(+110%)(if system doesn't support AVX) 73# Skylake 0.44(+110%)(if system doesn't support AVX) 74# Bulldozer 1.49(+27%) 75# Silvermont 2.88(+13%) 76# Knights L 2.12(-) (if system doesn't support AVX) 77# Goldmont 1.08(+24%) 78 79# March 2013 80# 81# ... 8x aggregate factor AVX code path is using reduction algorithm 82# suggested by Shay Gueron[1]. Even though contemporary AVX-capable 83# CPUs such as Sandy and Ivy Bridge can execute it, the code performs 84# sub-optimally in comparison to above mentioned version. But thanks 85# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that 86# it performs in 0.41 cycles per byte on Haswell processor, in 87# 0.29 on Broadwell, and in 0.36 on Skylake. 88# 89# Knights Landing achieves 1.09 cpb. 90# 91# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest 92 93# $output is the last argument if it looks like a file (it has an extension) 94# $flavour is the first argument if it doesn't look like a file 95$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 96$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 97 98$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 99 100$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 101( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 102( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 103die "can't locate x86_64-xlate.pl"; 104 105if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 106 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 107 $avx = ($1>=2.20) + ($1>=2.22); 108} 109 110if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 111 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 112 $avx = ($1>=2.09) + ($1>=2.10); 113} 114 115if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 116 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 117 $avx = ($1>=10) + ($1>=11); 118} 119 120if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { 121 $avx = ($2>=3.0) + ($2>3.0); 122} 123 124open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 125 or die "can't call $xlate: $!"; 126*STDOUT=*OUT; 127 128$do4xaggr=1; 129 130# common register layout 131$nlo="%rax"; 132$nhi="%rbx"; 133$Zlo="%r8"; 134$Zhi="%r9"; 135$tmp="%r10"; 136$rem_4bit = "%r11"; 137 138$Xi="%rdi"; 139$Htbl="%rsi"; 140 141# per-function register layout 142$cnt="%rcx"; 143$rem="%rdx"; 144 145sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or 146 $r =~ s/%[er]([sd]i)/%\1l/ or 147 $r =~ s/%[er](bp)/%\1l/ or 148 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } 149 150sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 151{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 152 my $arg = pop; 153 $arg = "\$$arg" if ($arg*1 eq $arg); 154 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 155} 156 157{ my $N; 158 sub loop() { 159 my $inp = shift; 160 161 $N++; 162$code.=<<___; 163 xor $nlo,$nlo 164 xor $nhi,$nhi 165 mov `&LB("$Zlo")`,`&LB("$nlo")` 166 mov `&LB("$Zlo")`,`&LB("$nhi")` 167 shl \$4,`&LB("$nlo")` 168 mov \$14,$cnt 169 mov 8($Htbl,$nlo),$Zlo 170 mov ($Htbl,$nlo),$Zhi 171 and \$0xf0,`&LB("$nhi")` 172 mov $Zlo,$rem 173 jmp .Loop$N 174 175.align 16 176.Loop$N: 177 shr \$4,$Zlo 178 and \$0xf,$rem 179 mov $Zhi,$tmp 180 mov ($inp,$cnt),`&LB("$nlo")` 181 shr \$4,$Zhi 182 xor 8($Htbl,$nhi),$Zlo 183 shl \$60,$tmp 184 xor ($Htbl,$nhi),$Zhi 185 mov `&LB("$nlo")`,`&LB("$nhi")` 186 xor ($rem_4bit,$rem,8),$Zhi 187 mov $Zlo,$rem 188 shl \$4,`&LB("$nlo")` 189 xor $tmp,$Zlo 190 dec $cnt 191 js .Lbreak$N 192 193 shr \$4,$Zlo 194 and \$0xf,$rem 195 mov $Zhi,$tmp 196 shr \$4,$Zhi 197 xor 8($Htbl,$nlo),$Zlo 198 shl \$60,$tmp 199 xor ($Htbl,$nlo),$Zhi 200 and \$0xf0,`&LB("$nhi")` 201 xor ($rem_4bit,$rem,8),$Zhi 202 mov $Zlo,$rem 203 xor $tmp,$Zlo 204 jmp .Loop$N 205 206.align 16 207.Lbreak$N: 208 shr \$4,$Zlo 209 and \$0xf,$rem 210 mov $Zhi,$tmp 211 shr \$4,$Zhi 212 xor 8($Htbl,$nlo),$Zlo 213 shl \$60,$tmp 214 xor ($Htbl,$nlo),$Zhi 215 and \$0xf0,`&LB("$nhi")` 216 xor ($rem_4bit,$rem,8),$Zhi 217 mov $Zlo,$rem 218 xor $tmp,$Zlo 219 220 shr \$4,$Zlo 221 and \$0xf,$rem 222 mov $Zhi,$tmp 223 shr \$4,$Zhi 224 xor 8($Htbl,$nhi),$Zlo 225 shl \$60,$tmp 226 xor ($Htbl,$nhi),$Zhi 227 xor $tmp,$Zlo 228 xor ($rem_4bit,$rem,8),$Zhi 229 230 bswap $Zlo 231 bswap $Zhi 232___ 233}} 234 235$code=<<___; 236.text 237.extern OPENSSL_ia32cap_P 238 239.globl gcm_gmult_4bit 240.type gcm_gmult_4bit,\@function,2 241.align 16 242gcm_gmult_4bit: 243.cfi_startproc 244 endbranch 245 push %rbx 246.cfi_push %rbx 247 push %rbp # %rbp and others are pushed exclusively in 248.cfi_push %rbp 249 push %r12 # order to reuse Win64 exception handler... 250.cfi_push %r12 251 push %r13 252.cfi_push %r13 253 push %r14 254.cfi_push %r14 255 push %r15 256.cfi_push %r15 257 sub \$280,%rsp 258.cfi_adjust_cfa_offset 280 259.Lgmult_prologue: 260 261 movzb 15($Xi),$Zlo 262 lea .Lrem_4bit(%rip),$rem_4bit 263___ 264 &loop ($Xi); 265$code.=<<___; 266 mov $Zlo,8($Xi) 267 mov $Zhi,($Xi) 268 269 lea 280+48(%rsp),%rsi 270.cfi_def_cfa %rsi,8 271 mov -8(%rsi),%rbx 272.cfi_restore %rbx 273 lea (%rsi),%rsp 274.cfi_def_cfa_register %rsp 275.Lgmult_epilogue: 276 ret 277.cfi_endproc 278.size gcm_gmult_4bit,.-gcm_gmult_4bit 279___ 280 281# per-function register layout 282$inp="%rdx"; 283$len="%rcx"; 284$rem_8bit=$rem_4bit; 285 286$code.=<<___; 287.globl gcm_ghash_4bit 288.type gcm_ghash_4bit,\@function,4 289.align 16 290gcm_ghash_4bit: 291.cfi_startproc 292 endbranch 293 push %rbx 294.cfi_push %rbx 295 push %rbp 296.cfi_push %rbp 297 push %r12 298.cfi_push %r12 299 push %r13 300.cfi_push %r13 301 push %r14 302.cfi_push %r14 303 push %r15 304.cfi_push %r15 305 sub \$280,%rsp 306.cfi_adjust_cfa_offset 280 307.Lghash_prologue: 308 mov $inp,%r14 # reassign couple of args 309 mov $len,%r15 310___ 311{ my $inp="%r14"; 312 my $dat="%edx"; 313 my $len="%r15"; 314 my @nhi=("%ebx","%ecx"); 315 my @rem=("%r12","%r13"); 316 my $Hshr4="%rbp"; 317 318 &sub ($Htbl,-128); # size optimization 319 &lea ($Hshr4,"16+128(%rsp)"); 320 { my @lo =($nlo,$nhi); 321 my @hi =($Zlo,$Zhi); 322 323 &xor ($dat,$dat); 324 for ($i=0,$j=-2;$i<18;$i++,$j++) { 325 &mov ("$j(%rsp)",&LB($dat)) if ($i>1); 326 &or ($lo[0],$tmp) if ($i>1); 327 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); 328 &shr ($lo[1],4) if ($i>0 && $i<17); 329 &mov ($tmp,$hi[1]) if ($i>0 && $i<17); 330 &shr ($hi[1],4) if ($i>0 && $i<17); 331 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); 332 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); 333 &shl (&LB($dat),4) if ($i>0 && $i<17); 334 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); 335 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); 336 &shl ($tmp,60) if ($i>0 && $i<17); 337 338 push (@lo,shift(@lo)); 339 push (@hi,shift(@hi)); 340 } 341 } 342 &add ($Htbl,-128); 343 &mov ($Zlo,"8($Xi)"); 344 &mov ($Zhi,"0($Xi)"); 345 &add ($len,$inp); # pointer to the end of data 346 &lea ($rem_8bit,".Lrem_8bit(%rip)"); 347 &jmp (".Louter_loop"); 348 349$code.=".align 16\n.Louter_loop:\n"; 350 &xor ($Zhi,"($inp)"); 351 &mov ("%rdx","8($inp)"); 352 &lea ($inp,"16($inp)"); 353 &xor ("%rdx",$Zlo); 354 &mov ("($Xi)",$Zhi); 355 &mov ("8($Xi)","%rdx"); 356 &shr ("%rdx",32); 357 358 &xor ($nlo,$nlo); 359 &rol ($dat,8); 360 &mov (&LB($nlo),&LB($dat)); 361 &movz ($nhi[0],&LB($dat)); 362 &shl (&LB($nlo),4); 363 &shr ($nhi[0],4); 364 365 for ($j=11,$i=0;$i<15;$i++) { 366 &rol ($dat,8); 367 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); 368 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); 369 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); 370 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); 371 372 &mov (&LB($nlo),&LB($dat)); 373 &xor ($Zlo,$tmp) if ($i>0); 374 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); 375 376 &movz ($nhi[1],&LB($dat)); 377 &shl (&LB($nlo),4); 378 &movzb ($rem[0],"(%rsp,$nhi[0])"); 379 380 &shr ($nhi[1],4) if ($i<14); 381 &and ($nhi[1],0xf0) if ($i==14); 382 &shl ($rem[1],48) if ($i>0); 383 &xor ($rem[0],$Zlo); 384 385 &mov ($tmp,$Zhi); 386 &xor ($Zhi,$rem[1]) if ($i>0); 387 &shr ($Zlo,8); 388 389 &movz ($rem[0],&LB($rem[0])); 390 &mov ($dat,"$j($Xi)") if (--$j%4==0); 391 &shr ($Zhi,8); 392 393 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); 394 &shl ($tmp,56); 395 &xor ($Zhi,"($Hshr4,$nhi[0],8)"); 396 397 unshift (@nhi,pop(@nhi)); # "rotate" registers 398 unshift (@rem,pop(@rem)); 399 } 400 &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); 401 &xor ($Zlo,"8($Htbl,$nlo)"); 402 &xor ($Zhi,"($Htbl,$nlo)"); 403 404 &shl ($rem[1],48); 405 &xor ($Zlo,$tmp); 406 407 &xor ($Zhi,$rem[1]); 408 &movz ($rem[0],&LB($Zlo)); 409 &shr ($Zlo,4); 410 411 &mov ($tmp,$Zhi); 412 &shl (&LB($rem[0]),4); 413 &shr ($Zhi,4); 414 415 &xor ($Zlo,"8($Htbl,$nhi[0])"); 416 &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); 417 &shl ($tmp,60); 418 419 &xor ($Zhi,"($Htbl,$nhi[0])"); 420 &xor ($Zlo,$tmp); 421 &shl ($rem[0],48); 422 423 &bswap ($Zlo); 424 &xor ($Zhi,$rem[0]); 425 426 &bswap ($Zhi); 427 &cmp ($inp,$len); 428 &jb (".Louter_loop"); 429} 430$code.=<<___; 431 mov $Zlo,8($Xi) 432 mov $Zhi,($Xi) 433 434 lea 280+48(%rsp),%rsi 435.cfi_def_cfa %rsi,8 436 mov -48(%rsi),%r15 437.cfi_restore %r15 438 mov -40(%rsi),%r14 439.cfi_restore %r14 440 mov -32(%rsi),%r13 441.cfi_restore %r13 442 mov -24(%rsi),%r12 443.cfi_restore %r12 444 mov -16(%rsi),%rbp 445.cfi_restore %rbp 446 mov -8(%rsi),%rbx 447.cfi_restore %rbx 448 lea 0(%rsi),%rsp 449.cfi_def_cfa_register %rsp 450.Lghash_epilogue: 451 ret 452.cfi_endproc 453.size gcm_ghash_4bit,.-gcm_ghash_4bit 454___ 455 456###################################################################### 457# PCLMULQDQ version. 458 459@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 460 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 461 462($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; 463($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); 464 465sub clmul64x64_T2 { # minimal register pressure 466my ($Xhi,$Xi,$Hkey,$HK)=@_; 467 468if (!defined($HK)) { $HK = $T2; 469$code.=<<___; 470 movdqa $Xi,$Xhi # 471 pshufd \$0b01001110,$Xi,$T1 472 pshufd \$0b01001110,$Hkey,$T2 473 pxor $Xi,$T1 # 474 pxor $Hkey,$T2 475___ 476} else { 477$code.=<<___; 478 movdqa $Xi,$Xhi # 479 pshufd \$0b01001110,$Xi,$T1 480 pxor $Xi,$T1 # 481___ 482} 483$code.=<<___; 484 pclmulqdq \$0x00,$Hkey,$Xi ####### 485 pclmulqdq \$0x11,$Hkey,$Xhi ####### 486 pclmulqdq \$0x00,$HK,$T1 ####### 487 pxor $Xi,$T1 # 488 pxor $Xhi,$T1 # 489 490 movdqa $T1,$T2 # 491 psrldq \$8,$T1 492 pslldq \$8,$T2 # 493 pxor $T1,$Xhi 494 pxor $T2,$Xi # 495___ 496} 497 498sub reduction_alg9 { # 17/11 times faster than Intel version 499my ($Xhi,$Xi) = @_; 500 501$code.=<<___; 502 # 1st phase 503 movdqa $Xi,$T2 # 504 movdqa $Xi,$T1 505 psllq \$5,$Xi 506 pxor $Xi,$T1 # 507 psllq \$1,$Xi 508 pxor $T1,$Xi # 509 psllq \$57,$Xi # 510 movdqa $Xi,$T1 # 511 pslldq \$8,$Xi 512 psrldq \$8,$T1 # 513 pxor $T2,$Xi 514 pxor $T1,$Xhi # 515 516 # 2nd phase 517 movdqa $Xi,$T2 518 psrlq \$1,$Xi 519 pxor $T2,$Xhi # 520 pxor $Xi,$T2 521 psrlq \$5,$Xi 522 pxor $T2,$Xi # 523 psrlq \$1,$Xi # 524 pxor $Xhi,$Xi # 525___ 526} 527 528{ my ($Htbl,$Xip)=@_4args; 529 my $HK="%xmm6"; 530 531$code.=<<___; 532.globl gcm_init_clmul 533.type gcm_init_clmul,\@abi-omnipotent 534.align 16 535gcm_init_clmul: 536.cfi_startproc 537.L_init_clmul: 538___ 539$code.=<<___ if ($win64); 540.LSEH_begin_gcm_init_clmul: 541 # I can't trust assembler to use specific encoding:-( 542 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp 543 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 544___ 545$code.=<<___; 546 movdqu ($Xip),$Hkey 547 pshufd \$0b01001110,$Hkey,$Hkey # dword swap 548 549 # <<1 twist 550 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword 551 movdqa $Hkey,$T1 552 psllq \$1,$Hkey 553 pxor $T3,$T3 # 554 psrlq \$63,$T1 555 pcmpgtd $T2,$T3 # broadcast carry bit 556 pslldq \$8,$T1 557 por $T1,$Hkey # H<<=1 558 559 # magic reduction 560 pand .L0x1c2_polynomial(%rip),$T3 561 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial 562 563 # calculate H^2 564 pshufd \$0b01001110,$Hkey,$HK 565 movdqa $Hkey,$Xi 566 pxor $Hkey,$HK 567___ 568 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); 569 &reduction_alg9 ($Xhi,$Xi); 570$code.=<<___; 571 pshufd \$0b01001110,$Hkey,$T1 572 pshufd \$0b01001110,$Xi,$T2 573 pxor $Hkey,$T1 # Karatsuba pre-processing 574 movdqu $Hkey,0x00($Htbl) # save H 575 pxor $Xi,$T2 # Karatsuba pre-processing 576 movdqu $Xi,0x10($Htbl) # save H^2 577 palignr \$8,$T1,$T2 # low part is H.lo^H.hi... 578 movdqu $T2,0x20($Htbl) # save Karatsuba "salt" 579___ 580if ($do4xaggr) { 581 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 582 &reduction_alg9 ($Xhi,$Xi); 583$code.=<<___; 584 movdqa $Xi,$T3 585___ 586 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 587 &reduction_alg9 ($Xhi,$Xi); 588$code.=<<___; 589 pshufd \$0b01001110,$T3,$T1 590 pshufd \$0b01001110,$Xi,$T2 591 pxor $T3,$T1 # Karatsuba pre-processing 592 movdqu $T3,0x30($Htbl) # save H^3 593 pxor $Xi,$T2 # Karatsuba pre-processing 594 movdqu $Xi,0x40($Htbl) # save H^4 595 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 596 movdqu $T2,0x50($Htbl) # save Karatsuba "salt" 597___ 598} 599$code.=<<___ if ($win64); 600 movaps (%rsp),%xmm6 601 lea 0x18(%rsp),%rsp 602.LSEH_end_gcm_init_clmul: 603___ 604$code.=<<___; 605 ret 606.cfi_endproc 607.size gcm_init_clmul,.-gcm_init_clmul 608___ 609} 610 611{ my ($Xip,$Htbl)=@_4args; 612 613$code.=<<___; 614.globl gcm_gmult_clmul 615.type gcm_gmult_clmul,\@abi-omnipotent 616.align 16 617gcm_gmult_clmul: 618.cfi_startproc 619 endbranch 620.L_gmult_clmul: 621 movdqu ($Xip),$Xi 622 movdqa .Lbswap_mask(%rip),$T3 623 movdqu ($Htbl),$Hkey 624 movdqu 0x20($Htbl),$T2 625 pshufb $T3,$Xi 626___ 627 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); 628$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); 629 # experimental alternative. special thing about is that there 630 # no dependency between the two multiplications... 631 mov \$`0xE1<<1`,%eax 632 mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff 633 mov \$0x07,%r11d 634 movq %rax,$T1 635 movq %r10,$T2 636 movq %r11,$T3 # borrow $T3 637 pand $Xi,$T3 638 pshufb $T3,$T2 # ($Xi&7)·0xE0 639 movq %rax,$T3 640 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) 641 pxor $Xi,$T2 642 pslldq \$15,$T2 643 paddd $T2,$T2 # <<(64+56+1) 644 pxor $T2,$Xi 645 pclmulqdq \$0x01,$T3,$Xi 646 movdqa .Lbswap_mask(%rip),$T3 # reload $T3 647 psrldq \$1,$T1 648 pxor $T1,$Xhi 649 pslldq \$7,$Xi 650 pxor $Xhi,$Xi 651___ 652$code.=<<___; 653 pshufb $T3,$Xi 654 movdqu $Xi,($Xip) 655 ret 656.cfi_endproc 657.size gcm_gmult_clmul,.-gcm_gmult_clmul 658___ 659} 660 661{ my ($Xip,$Htbl,$inp,$len)=@_4args; 662 my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); 663 my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); 664 665$code.=<<___; 666.globl gcm_ghash_clmul 667.type gcm_ghash_clmul,\@abi-omnipotent 668.align 32 669gcm_ghash_clmul: 670.cfi_startproc 671 endbranch 672.L_ghash_clmul: 673___ 674$code.=<<___ if ($win64); 675 lea -0x88(%rsp),%rax 676.LSEH_begin_gcm_ghash_clmul: 677 # I can't trust assembler to use specific encoding:-( 678 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 679 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) 680 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) 681 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) 682 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) 683 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) 684 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) 685 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) 686 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) 687 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) 688 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) 689___ 690$code.=<<___; 691 movdqa .Lbswap_mask(%rip),$T3 692 693 movdqu ($Xip),$Xi 694 movdqu ($Htbl),$Hkey 695 movdqu 0x20($Htbl),$HK 696 pshufb $T3,$Xi 697 698 sub \$0x10,$len 699 jz .Lodd_tail 700 701 movdqu 0x10($Htbl),$Hkey2 702___ 703if ($do4xaggr) { 704my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); 705 706$code.=<<___; 707 mov OPENSSL_ia32cap_P+4(%rip),%eax 708 cmp \$0x30,$len 709 jb .Lskip4x 710 711 and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE 712 cmp \$`1<<22`,%eax # check for MOVBE without XSAVE 713 je .Lskip4x 714 715 sub \$0x30,$len 716 mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff 717 movdqu 0x30($Htbl),$Hkey3 718 movdqu 0x40($Htbl),$Hkey4 719 720 ####### 721 # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P 722 # 723 movdqu 0x30($inp),$Xln 724 movdqu 0x20($inp),$Xl 725 pshufb $T3,$Xln 726 pshufb $T3,$Xl 727 movdqa $Xln,$Xhn 728 pshufd \$0b01001110,$Xln,$Xmn 729 pxor $Xln,$Xmn 730 pclmulqdq \$0x00,$Hkey,$Xln 731 pclmulqdq \$0x11,$Hkey,$Xhn 732 pclmulqdq \$0x00,$HK,$Xmn 733 734 movdqa $Xl,$Xh 735 pshufd \$0b01001110,$Xl,$Xm 736 pxor $Xl,$Xm 737 pclmulqdq \$0x00,$Hkey2,$Xl 738 pclmulqdq \$0x11,$Hkey2,$Xh 739 pclmulqdq \$0x10,$HK,$Xm 740 xorps $Xl,$Xln 741 xorps $Xh,$Xhn 742 movups 0x50($Htbl),$HK 743 xorps $Xm,$Xmn 744 745 movdqu 0x10($inp),$Xl 746 movdqu 0($inp),$T1 747 pshufb $T3,$Xl 748 pshufb $T3,$T1 749 movdqa $Xl,$Xh 750 pshufd \$0b01001110,$Xl,$Xm 751 pxor $T1,$Xi 752 pxor $Xl,$Xm 753 pclmulqdq \$0x00,$Hkey3,$Xl 754 movdqa $Xi,$Xhi 755 pshufd \$0b01001110,$Xi,$T1 756 pxor $Xi,$T1 757 pclmulqdq \$0x11,$Hkey3,$Xh 758 pclmulqdq \$0x00,$HK,$Xm 759 xorps $Xl,$Xln 760 xorps $Xh,$Xhn 761 762 lea 0x40($inp),$inp 763 sub \$0x40,$len 764 jc .Ltail4x 765 766 jmp .Lmod4_loop 767.align 32 768.Lmod4_loop: 769 pclmulqdq \$0x00,$Hkey4,$Xi 770 xorps $Xm,$Xmn 771 movdqu 0x30($inp),$Xl 772 pshufb $T3,$Xl 773 pclmulqdq \$0x11,$Hkey4,$Xhi 774 xorps $Xln,$Xi 775 movdqu 0x20($inp),$Xln 776 movdqa $Xl,$Xh 777 pclmulqdq \$0x10,$HK,$T1 778 pshufd \$0b01001110,$Xl,$Xm 779 xorps $Xhn,$Xhi 780 pxor $Xl,$Xm 781 pshufb $T3,$Xln 782 movups 0x20($Htbl),$HK 783 xorps $Xmn,$T1 784 pclmulqdq \$0x00,$Hkey,$Xl 785 pshufd \$0b01001110,$Xln,$Xmn 786 787 pxor $Xi,$T1 # aggregated Karatsuba post-processing 788 movdqa $Xln,$Xhn 789 pxor $Xhi,$T1 # 790 pxor $Xln,$Xmn 791 movdqa $T1,$T2 # 792 pclmulqdq \$0x11,$Hkey,$Xh 793 pslldq \$8,$T1 794 psrldq \$8,$T2 # 795 pxor $T1,$Xi 796 movdqa .L7_mask(%rip),$T1 797 pxor $T2,$Xhi # 798 movq %rax,$T2 799 800 pand $Xi,$T1 # 1st phase 801 pshufb $T1,$T2 # 802 pxor $Xi,$T2 # 803 pclmulqdq \$0x00,$HK,$Xm 804 psllq \$57,$T2 # 805 movdqa $T2,$T1 # 806 pslldq \$8,$T2 807 pclmulqdq \$0x00,$Hkey2,$Xln 808 psrldq \$8,$T1 # 809 pxor $T2,$Xi 810 pxor $T1,$Xhi # 811 movdqu 0($inp),$T1 812 813 movdqa $Xi,$T2 # 2nd phase 814 psrlq \$1,$Xi 815 pclmulqdq \$0x11,$Hkey2,$Xhn 816 xorps $Xl,$Xln 817 movdqu 0x10($inp),$Xl 818 pshufb $T3,$Xl 819 pclmulqdq \$0x10,$HK,$Xmn 820 xorps $Xh,$Xhn 821 movups 0x50($Htbl),$HK 822 pshufb $T3,$T1 823 pxor $T2,$Xhi # 824 pxor $Xi,$T2 825 psrlq \$5,$Xi 826 827 movdqa $Xl,$Xh 828 pxor $Xm,$Xmn 829 pshufd \$0b01001110,$Xl,$Xm 830 pxor $T2,$Xi # 831 pxor $T1,$Xhi 832 pxor $Xl,$Xm 833 pclmulqdq \$0x00,$Hkey3,$Xl 834 psrlq \$1,$Xi # 835 pxor $Xhi,$Xi # 836 movdqa $Xi,$Xhi 837 pclmulqdq \$0x11,$Hkey3,$Xh 838 xorps $Xl,$Xln 839 pshufd \$0b01001110,$Xi,$T1 840 pxor $Xi,$T1 841 842 pclmulqdq \$0x00,$HK,$Xm 843 xorps $Xh,$Xhn 844 845 lea 0x40($inp),$inp 846 sub \$0x40,$len 847 jnc .Lmod4_loop 848 849.Ltail4x: 850 pclmulqdq \$0x00,$Hkey4,$Xi 851 pclmulqdq \$0x11,$Hkey4,$Xhi 852 pclmulqdq \$0x10,$HK,$T1 853 xorps $Xm,$Xmn 854 xorps $Xln,$Xi 855 xorps $Xhn,$Xhi 856 pxor $Xi,$Xhi # aggregated Karatsuba post-processing 857 pxor $Xmn,$T1 858 859 pxor $Xhi,$T1 # 860 pxor $Xi,$Xhi 861 862 movdqa $T1,$T2 # 863 psrldq \$8,$T1 864 pslldq \$8,$T2 # 865 pxor $T1,$Xhi 866 pxor $T2,$Xi # 867___ 868 &reduction_alg9($Xhi,$Xi); 869$code.=<<___; 870 add \$0x40,$len 871 jz .Ldone 872 movdqu 0x20($Htbl),$HK 873 sub \$0x10,$len 874 jz .Lodd_tail 875.Lskip4x: 876___ 877} 878$code.=<<___; 879 ####### 880 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = 881 # [(H*Ii+1) + (H*Xi+1)] mod P = 882 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P 883 # 884 movdqu ($inp),$T1 # Ii 885 movdqu 16($inp),$Xln # Ii+1 886 pshufb $T3,$T1 887 pshufb $T3,$Xln 888 pxor $T1,$Xi # Ii+Xi 889 890 movdqa $Xln,$Xhn 891 pshufd \$0b01001110,$Xln,$Xmn 892 pxor $Xln,$Xmn 893 pclmulqdq \$0x00,$Hkey,$Xln 894 pclmulqdq \$0x11,$Hkey,$Xhn 895 pclmulqdq \$0x00,$HK,$Xmn 896 897 lea 32($inp),$inp # i+=2 898 nop 899 sub \$0x20,$len 900 jbe .Leven_tail 901 nop 902 jmp .Lmod_loop 903 904.align 32 905.Lmod_loop: 906 movdqa $Xi,$Xhi 907 movdqa $Xmn,$T1 908 pshufd \$0b01001110,$Xi,$Xmn # 909 pxor $Xi,$Xmn # 910 911 pclmulqdq \$0x00,$Hkey2,$Xi 912 pclmulqdq \$0x11,$Hkey2,$Xhi 913 pclmulqdq \$0x10,$HK,$Xmn 914 915 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) 916 pxor $Xhn,$Xhi 917 movdqu ($inp),$T2 # Ii 918 pxor $Xi,$T1 # aggregated Karatsuba post-processing 919 pshufb $T3,$T2 920 movdqu 16($inp),$Xln # Ii+1 921 922 pxor $Xhi,$T1 923 pxor $T2,$Xhi # "Ii+Xi", consume early 924 pxor $T1,$Xmn 925 pshufb $T3,$Xln 926 movdqa $Xmn,$T1 # 927 psrldq \$8,$T1 928 pslldq \$8,$Xmn # 929 pxor $T1,$Xhi 930 pxor $Xmn,$Xi # 931 932 movdqa $Xln,$Xhn # 933 934 movdqa $Xi,$T2 # 1st phase 935 movdqa $Xi,$T1 936 psllq \$5,$Xi 937 pxor $Xi,$T1 # 938 pclmulqdq \$0x00,$Hkey,$Xln ####### 939 psllq \$1,$Xi 940 pxor $T1,$Xi # 941 psllq \$57,$Xi # 942 movdqa $Xi,$T1 # 943 pslldq \$8,$Xi 944 psrldq \$8,$T1 # 945 pxor $T2,$Xi 946 pshufd \$0b01001110,$Xhn,$Xmn 947 pxor $T1,$Xhi # 948 pxor $Xhn,$Xmn # 949 950 movdqa $Xi,$T2 # 2nd phase 951 psrlq \$1,$Xi 952 pclmulqdq \$0x11,$Hkey,$Xhn ####### 953 pxor $T2,$Xhi # 954 pxor $Xi,$T2 955 psrlq \$5,$Xi 956 pxor $T2,$Xi # 957 lea 32($inp),$inp 958 psrlq \$1,$Xi # 959 pclmulqdq \$0x00,$HK,$Xmn ####### 960 pxor $Xhi,$Xi # 961 962 sub \$0x20,$len 963 ja .Lmod_loop 964 965.Leven_tail: 966 movdqa $Xi,$Xhi 967 movdqa $Xmn,$T1 968 pshufd \$0b01001110,$Xi,$Xmn # 969 pxor $Xi,$Xmn # 970 971 pclmulqdq \$0x00,$Hkey2,$Xi 972 pclmulqdq \$0x11,$Hkey2,$Xhi 973 pclmulqdq \$0x10,$HK,$Xmn 974 975 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) 976 pxor $Xhn,$Xhi 977 pxor $Xi,$T1 978 pxor $Xhi,$T1 979 pxor $T1,$Xmn 980 movdqa $Xmn,$T1 # 981 psrldq \$8,$T1 982 pslldq \$8,$Xmn # 983 pxor $T1,$Xhi 984 pxor $Xmn,$Xi # 985___ 986 &reduction_alg9 ($Xhi,$Xi); 987$code.=<<___; 988 test $len,$len 989 jnz .Ldone 990 991.Lodd_tail: 992 movdqu ($inp),$T1 # Ii 993 pshufb $T3,$T1 994 pxor $T1,$Xi # Ii+Xi 995___ 996 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) 997 &reduction_alg9 ($Xhi,$Xi); 998$code.=<<___; 999.Ldone: 1000 pshufb $T3,$Xi 1001 movdqu $Xi,($Xip) 1002___ 1003$code.=<<___ if ($win64); 1004 movaps (%rsp),%xmm6 1005 movaps 0x10(%rsp),%xmm7 1006 movaps 0x20(%rsp),%xmm8 1007 movaps 0x30(%rsp),%xmm9 1008 movaps 0x40(%rsp),%xmm10 1009 movaps 0x50(%rsp),%xmm11 1010 movaps 0x60(%rsp),%xmm12 1011 movaps 0x70(%rsp),%xmm13 1012 movaps 0x80(%rsp),%xmm14 1013 movaps 0x90(%rsp),%xmm15 1014 lea 0xa8(%rsp),%rsp 1015.LSEH_end_gcm_ghash_clmul: 1016___ 1017$code.=<<___; 1018 ret 1019.cfi_endproc 1020.size gcm_ghash_clmul,.-gcm_ghash_clmul 1021___ 1022} 1023 1024$code.=<<___; 1025.globl gcm_init_avx 1026.type gcm_init_avx,\@abi-omnipotent 1027.align 32 1028gcm_init_avx: 1029.cfi_startproc 1030___ 1031if ($avx) { 1032my ($Htbl,$Xip)=@_4args; 1033my $HK="%xmm6"; 1034 1035$code.=<<___ if ($win64); 1036.LSEH_begin_gcm_init_avx: 1037 # I can't trust assembler to use specific encoding:-( 1038 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp 1039 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) 1040___ 1041$code.=<<___; 1042 vzeroupper 1043 1044 vmovdqu ($Xip),$Hkey 1045 vpshufd \$0b01001110,$Hkey,$Hkey # dword swap 1046 1047 # <<1 twist 1048 vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword 1049 vpsrlq \$63,$Hkey,$T1 1050 vpsllq \$1,$Hkey,$Hkey 1051 vpxor $T3,$T3,$T3 # 1052 vpcmpgtd $T2,$T3,$T3 # broadcast carry bit 1053 vpslldq \$8,$T1,$T1 1054 vpor $T1,$Hkey,$Hkey # H<<=1 1055 1056 # magic reduction 1057 vpand .L0x1c2_polynomial(%rip),$T3,$T3 1058 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial 1059 1060 vpunpckhqdq $Hkey,$Hkey,$HK 1061 vmovdqa $Hkey,$Xi 1062 vpxor $Hkey,$HK,$HK 1063 mov \$4,%r10 # up to H^8 1064 jmp .Linit_start_avx 1065___ 1066 1067sub clmul64x64_avx { 1068my ($Xhi,$Xi,$Hkey,$HK)=@_; 1069 1070if (!defined($HK)) { $HK = $T2; 1071$code.=<<___; 1072 vpunpckhqdq $Xi,$Xi,$T1 1073 vpunpckhqdq $Hkey,$Hkey,$T2 1074 vpxor $Xi,$T1,$T1 # 1075 vpxor $Hkey,$T2,$T2 1076___ 1077} else { 1078$code.=<<___; 1079 vpunpckhqdq $Xi,$Xi,$T1 1080 vpxor $Xi,$T1,$T1 # 1081___ 1082} 1083$code.=<<___; 1084 vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### 1085 vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### 1086 vpclmulqdq \$0x00,$HK,$T1,$T1 ####### 1087 vpxor $Xi,$Xhi,$T2 # 1088 vpxor $T2,$T1,$T1 # 1089 1090 vpslldq \$8,$T1,$T2 # 1091 vpsrldq \$8,$T1,$T1 1092 vpxor $T2,$Xi,$Xi # 1093 vpxor $T1,$Xhi,$Xhi 1094___ 1095} 1096 1097sub reduction_avx { 1098my ($Xhi,$Xi) = @_; 1099 1100$code.=<<___; 1101 vpsllq \$57,$Xi,$T1 # 1st phase 1102 vpsllq \$62,$Xi,$T2 1103 vpxor $T1,$T2,$T2 # 1104 vpsllq \$63,$Xi,$T1 1105 vpxor $T1,$T2,$T2 # 1106 vpslldq \$8,$T2,$T1 # 1107 vpsrldq \$8,$T2,$T2 1108 vpxor $T1,$Xi,$Xi # 1109 vpxor $T2,$Xhi,$Xhi 1110 1111 vpsrlq \$1,$Xi,$T2 # 2nd phase 1112 vpxor $Xi,$Xhi,$Xhi 1113 vpxor $T2,$Xi,$Xi # 1114 vpsrlq \$5,$T2,$T2 1115 vpxor $T2,$Xi,$Xi # 1116 vpsrlq \$1,$Xi,$Xi # 1117 vpxor $Xhi,$Xi,$Xi # 1118___ 1119} 1120 1121$code.=<<___; 1122.align 32 1123.Linit_loop_avx: 1124 vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 1125 vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" 1126___ 1127 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 1128 &reduction_avx ($Xhi,$Xi); 1129$code.=<<___; 1130.Linit_start_avx: 1131 vmovdqa $Xi,$T3 1132___ 1133 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 1134 &reduction_avx ($Xhi,$Xi); 1135$code.=<<___; 1136 vpshufd \$0b01001110,$T3,$T1 1137 vpshufd \$0b01001110,$Xi,$T2 1138 vpxor $T3,$T1,$T1 # Karatsuba pre-processing 1139 vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 1140 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing 1141 vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 1142 lea 0x30($Htbl),$Htbl 1143 sub \$1,%r10 1144 jnz .Linit_loop_avx 1145 1146 vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped 1147 vmovdqu $T3,-0x10($Htbl) 1148 1149 vzeroupper 1150___ 1151$code.=<<___ if ($win64); 1152 movaps (%rsp),%xmm6 1153 lea 0x18(%rsp),%rsp 1154.LSEH_end_gcm_init_avx: 1155___ 1156$code.=<<___; 1157 ret 1158.cfi_endproc 1159.size gcm_init_avx,.-gcm_init_avx 1160___ 1161} else { 1162$code.=<<___; 1163 jmp .L_init_clmul 1164.cfi_endproc 1165.size gcm_init_avx,.-gcm_init_avx 1166___ 1167} 1168 1169$code.=<<___; 1170.globl gcm_gmult_avx 1171.type gcm_gmult_avx,\@abi-omnipotent 1172.align 32 1173gcm_gmult_avx: 1174.cfi_startproc 1175 endbranch 1176 jmp .L_gmult_clmul 1177.cfi_endproc 1178.size gcm_gmult_avx,.-gcm_gmult_avx 1179___ 1180 1181$code.=<<___; 1182.globl gcm_ghash_avx 1183.type gcm_ghash_avx,\@abi-omnipotent 1184.align 32 1185gcm_ghash_avx: 1186.cfi_startproc 1187 endbranch 1188___ 1189if ($avx) { 1190my ($Xip,$Htbl,$inp,$len)=@_4args; 1191my ($Xlo,$Xhi,$Xmi, 1192 $Zlo,$Zhi,$Zmi, 1193 $Hkey,$HK,$T1,$T2, 1194 $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); 1195 1196$code.=<<___ if ($win64); 1197 lea -0x88(%rsp),%rax 1198.LSEH_begin_gcm_ghash_avx: 1199 # I can't trust assembler to use specific encoding:-( 1200 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 1201 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) 1202 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) 1203 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) 1204 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) 1205 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) 1206 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) 1207 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) 1208 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) 1209 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) 1210 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) 1211___ 1212$code.=<<___; 1213 vzeroupper 1214 1215 vmovdqu ($Xip),$Xi # load $Xi 1216 lea .L0x1c2_polynomial(%rip),%r10 1217 lea 0x40($Htbl),$Htbl # size optimization 1218 vmovdqu .Lbswap_mask(%rip),$bswap 1219 vpshufb $bswap,$Xi,$Xi 1220 cmp \$0x80,$len 1221 jb .Lshort_avx 1222 sub \$0x80,$len 1223 1224 vmovdqu 0x70($inp),$Ii # I[7] 1225 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1226 vpshufb $bswap,$Ii,$Ii 1227 vmovdqu 0x20-0x40($Htbl),$HK 1228 1229 vpunpckhqdq $Ii,$Ii,$T2 1230 vmovdqu 0x60($inp),$Ij # I[6] 1231 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1232 vpxor $Ii,$T2,$T2 1233 vpshufb $bswap,$Ij,$Ij 1234 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1235 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1236 vpunpckhqdq $Ij,$Ij,$T1 1237 vmovdqu 0x50($inp),$Ii # I[5] 1238 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1239 vpxor $Ij,$T1,$T1 1240 1241 vpshufb $bswap,$Ii,$Ii 1242 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1243 vpunpckhqdq $Ii,$Ii,$T2 1244 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1245 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1246 vpxor $Ii,$T2,$T2 1247 vmovdqu 0x40($inp),$Ij # I[4] 1248 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1249 vmovdqu 0x50-0x40($Htbl),$HK 1250 1251 vpshufb $bswap,$Ij,$Ij 1252 vpxor $Xlo,$Zlo,$Zlo 1253 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1254 vpxor $Xhi,$Zhi,$Zhi 1255 vpunpckhqdq $Ij,$Ij,$T1 1256 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1257 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1258 vpxor $Xmi,$Zmi,$Zmi 1259 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1260 vpxor $Ij,$T1,$T1 1261 1262 vmovdqu 0x30($inp),$Ii # I[3] 1263 vpxor $Zlo,$Xlo,$Xlo 1264 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1265 vpxor $Zhi,$Xhi,$Xhi 1266 vpshufb $bswap,$Ii,$Ii 1267 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1268 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1269 vpxor $Zmi,$Xmi,$Xmi 1270 vpunpckhqdq $Ii,$Ii,$T2 1271 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1272 vmovdqu 0x80-0x40($Htbl),$HK 1273 vpxor $Ii,$T2,$T2 1274 1275 vmovdqu 0x20($inp),$Ij # I[2] 1276 vpxor $Xlo,$Zlo,$Zlo 1277 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1278 vpxor $Xhi,$Zhi,$Zhi 1279 vpshufb $bswap,$Ij,$Ij 1280 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1281 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1282 vpxor $Xmi,$Zmi,$Zmi 1283 vpunpckhqdq $Ij,$Ij,$T1 1284 vpclmulqdq \$0x00,$HK,$T2,$Xmi 1285 vpxor $Ij,$T1,$T1 1286 1287 vmovdqu 0x10($inp),$Ii # I[1] 1288 vpxor $Zlo,$Xlo,$Xlo 1289 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1290 vpxor $Zhi,$Xhi,$Xhi 1291 vpshufb $bswap,$Ii,$Ii 1292 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1293 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1294 vpxor $Zmi,$Xmi,$Xmi 1295 vpunpckhqdq $Ii,$Ii,$T2 1296 vpclmulqdq \$0x10,$HK,$T1,$Zmi 1297 vmovdqu 0xb0-0x40($Htbl),$HK 1298 vpxor $Ii,$T2,$T2 1299 1300 vmovdqu ($inp),$Ij # I[0] 1301 vpxor $Xlo,$Zlo,$Zlo 1302 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1303 vpxor $Xhi,$Zhi,$Zhi 1304 vpshufb $bswap,$Ij,$Ij 1305 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1306 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 1307 vpxor $Xmi,$Zmi,$Zmi 1308 vpclmulqdq \$0x10,$HK,$T2,$Xmi 1309 1310 lea 0x80($inp),$inp 1311 cmp \$0x80,$len 1312 jb .Ltail_avx 1313 1314 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1315 sub \$0x80,$len 1316 jmp .Loop8x_avx 1317 1318.align 32 1319.Loop8x_avx: 1320 vpunpckhqdq $Ij,$Ij,$T1 1321 vmovdqu 0x70($inp),$Ii # I[7] 1322 vpxor $Xlo,$Zlo,$Zlo 1323 vpxor $Ij,$T1,$T1 1324 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi 1325 vpshufb $bswap,$Ii,$Ii 1326 vpxor $Xhi,$Zhi,$Zhi 1327 vpclmulqdq \$0x11,$Hkey,$Ij,$Xo 1328 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1329 vpunpckhqdq $Ii,$Ii,$T2 1330 vpxor $Xmi,$Zmi,$Zmi 1331 vpclmulqdq \$0x00,$HK,$T1,$Tred 1332 vmovdqu 0x20-0x40($Htbl),$HK 1333 vpxor $Ii,$T2,$T2 1334 1335 vmovdqu 0x60($inp),$Ij # I[6] 1336 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1337 vpxor $Zlo,$Xi,$Xi # collect result 1338 vpshufb $bswap,$Ij,$Ij 1339 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1340 vxorps $Zhi,$Xo,$Xo 1341 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1342 vpunpckhqdq $Ij,$Ij,$T1 1343 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1344 vpxor $Zmi,$Tred,$Tred 1345 vxorps $Ij,$T1,$T1 1346 1347 vmovdqu 0x50($inp),$Ii # I[5] 1348 vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing 1349 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1350 vpxor $Xo,$Tred,$Tred 1351 vpslldq \$8,$Tred,$T2 1352 vpxor $Xlo,$Zlo,$Zlo 1353 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1354 vpsrldq \$8,$Tred,$Tred 1355 vpxor $T2, $Xi, $Xi 1356 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1357 vpshufb $bswap,$Ii,$Ii 1358 vxorps $Tred,$Xo, $Xo 1359 vpxor $Xhi,$Zhi,$Zhi 1360 vpunpckhqdq $Ii,$Ii,$T2 1361 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1362 vmovdqu 0x50-0x40($Htbl),$HK 1363 vpxor $Ii,$T2,$T2 1364 vpxor $Xmi,$Zmi,$Zmi 1365 1366 vmovdqu 0x40($inp),$Ij # I[4] 1367 vpalignr \$8,$Xi,$Xi,$Tred # 1st phase 1368 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1369 vpshufb $bswap,$Ij,$Ij 1370 vpxor $Zlo,$Xlo,$Xlo 1371 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1372 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1373 vpunpckhqdq $Ij,$Ij,$T1 1374 vpxor $Zhi,$Xhi,$Xhi 1375 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1376 vxorps $Ij,$T1,$T1 1377 vpxor $Zmi,$Xmi,$Xmi 1378 1379 vmovdqu 0x30($inp),$Ii # I[3] 1380 vpclmulqdq \$0x10,(%r10),$Xi,$Xi 1381 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1382 vpshufb $bswap,$Ii,$Ii 1383 vpxor $Xlo,$Zlo,$Zlo 1384 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1385 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1386 vpunpckhqdq $Ii,$Ii,$T2 1387 vpxor $Xhi,$Zhi,$Zhi 1388 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1389 vmovdqu 0x80-0x40($Htbl),$HK 1390 vpxor $Ii,$T2,$T2 1391 vpxor $Xmi,$Zmi,$Zmi 1392 1393 vmovdqu 0x20($inp),$Ij # I[2] 1394 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1395 vpshufb $bswap,$Ij,$Ij 1396 vpxor $Zlo,$Xlo,$Xlo 1397 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1398 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1399 vpunpckhqdq $Ij,$Ij,$T1 1400 vpxor $Zhi,$Xhi,$Xhi 1401 vpclmulqdq \$0x00,$HK, $T2,$Xmi 1402 vpxor $Ij,$T1,$T1 1403 vpxor $Zmi,$Xmi,$Xmi 1404 vxorps $Tred,$Xi,$Xi 1405 1406 vmovdqu 0x10($inp),$Ii # I[1] 1407 vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase 1408 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo 1409 vpshufb $bswap,$Ii,$Ii 1410 vpxor $Xlo,$Zlo,$Zlo 1411 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi 1412 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1413 vpclmulqdq \$0x10,(%r10),$Xi,$Xi 1414 vxorps $Xo,$Tred,$Tred 1415 vpunpckhqdq $Ii,$Ii,$T2 1416 vpxor $Xhi,$Zhi,$Zhi 1417 vpclmulqdq \$0x10,$HK, $T1,$Zmi 1418 vmovdqu 0xb0-0x40($Htbl),$HK 1419 vpxor $Ii,$T2,$T2 1420 vpxor $Xmi,$Zmi,$Zmi 1421 1422 vmovdqu ($inp),$Ij # I[0] 1423 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo 1424 vpshufb $bswap,$Ij,$Ij 1425 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi 1426 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 1427 vpxor $Tred,$Ij,$Ij 1428 vpclmulqdq \$0x10,$HK, $T2,$Xmi 1429 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1430 1431 lea 0x80($inp),$inp 1432 sub \$0x80,$len 1433 jnc .Loop8x_avx 1434 1435 add \$0x80,$len 1436 jmp .Ltail_no_xor_avx 1437 1438.align 32 1439.Lshort_avx: 1440 vmovdqu -0x10($inp,$len),$Ii # very last word 1441 lea ($inp,$len),$inp 1442 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 1443 vmovdqu 0x20-0x40($Htbl),$HK 1444 vpshufb $bswap,$Ii,$Ij 1445 1446 vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, 1447 vmovdqa $Xhi,$Zhi # $Zhi and 1448 vmovdqa $Xmi,$Zmi # $Zmi 1449 sub \$0x10,$len 1450 jz .Ltail_avx 1451 1452 vpunpckhqdq $Ij,$Ij,$T1 1453 vpxor $Xlo,$Zlo,$Zlo 1454 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1455 vpxor $Ij,$T1,$T1 1456 vmovdqu -0x20($inp),$Ii 1457 vpxor $Xhi,$Zhi,$Zhi 1458 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1459 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 1460 vpshufb $bswap,$Ii,$Ij 1461 vpxor $Xmi,$Zmi,$Zmi 1462 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1463 vpsrldq \$8,$HK,$HK 1464 sub \$0x10,$len 1465 jz .Ltail_avx 1466 1467 vpunpckhqdq $Ij,$Ij,$T1 1468 vpxor $Xlo,$Zlo,$Zlo 1469 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1470 vpxor $Ij,$T1,$T1 1471 vmovdqu -0x30($inp),$Ii 1472 vpxor $Xhi,$Zhi,$Zhi 1473 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1474 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 1475 vpshufb $bswap,$Ii,$Ij 1476 vpxor $Xmi,$Zmi,$Zmi 1477 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1478 vmovdqu 0x50-0x40($Htbl),$HK 1479 sub \$0x10,$len 1480 jz .Ltail_avx 1481 1482 vpunpckhqdq $Ij,$Ij,$T1 1483 vpxor $Xlo,$Zlo,$Zlo 1484 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1485 vpxor $Ij,$T1,$T1 1486 vmovdqu -0x40($inp),$Ii 1487 vpxor $Xhi,$Zhi,$Zhi 1488 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1489 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 1490 vpshufb $bswap,$Ii,$Ij 1491 vpxor $Xmi,$Zmi,$Zmi 1492 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1493 vpsrldq \$8,$HK,$HK 1494 sub \$0x10,$len 1495 jz .Ltail_avx 1496 1497 vpunpckhqdq $Ij,$Ij,$T1 1498 vpxor $Xlo,$Zlo,$Zlo 1499 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1500 vpxor $Ij,$T1,$T1 1501 vmovdqu -0x50($inp),$Ii 1502 vpxor $Xhi,$Zhi,$Zhi 1503 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1504 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 1505 vpshufb $bswap,$Ii,$Ij 1506 vpxor $Xmi,$Zmi,$Zmi 1507 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1508 vmovdqu 0x80-0x40($Htbl),$HK 1509 sub \$0x10,$len 1510 jz .Ltail_avx 1511 1512 vpunpckhqdq $Ij,$Ij,$T1 1513 vpxor $Xlo,$Zlo,$Zlo 1514 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1515 vpxor $Ij,$T1,$T1 1516 vmovdqu -0x60($inp),$Ii 1517 vpxor $Xhi,$Zhi,$Zhi 1518 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1519 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 1520 vpshufb $bswap,$Ii,$Ij 1521 vpxor $Xmi,$Zmi,$Zmi 1522 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1523 vpsrldq \$8,$HK,$HK 1524 sub \$0x10,$len 1525 jz .Ltail_avx 1526 1527 vpunpckhqdq $Ij,$Ij,$T1 1528 vpxor $Xlo,$Zlo,$Zlo 1529 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1530 vpxor $Ij,$T1,$T1 1531 vmovdqu -0x70($inp),$Ii 1532 vpxor $Xhi,$Zhi,$Zhi 1533 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1534 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 1535 vpshufb $bswap,$Ii,$Ij 1536 vpxor $Xmi,$Zmi,$Zmi 1537 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1538 vmovq 0xb8-0x40($Htbl),$HK 1539 sub \$0x10,$len 1540 jmp .Ltail_avx 1541 1542.align 32 1543.Ltail_avx: 1544 vpxor $Xi,$Ij,$Ij # accumulate $Xi 1545.Ltail_no_xor_avx: 1546 vpunpckhqdq $Ij,$Ij,$T1 1547 vpxor $Xlo,$Zlo,$Zlo 1548 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo 1549 vpxor $Ij,$T1,$T1 1550 vpxor $Xhi,$Zhi,$Zhi 1551 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi 1552 vpxor $Xmi,$Zmi,$Zmi 1553 vpclmulqdq \$0x00,$HK,$T1,$Xmi 1554 1555 vmovdqu (%r10),$Tred 1556 1557 vpxor $Xlo,$Zlo,$Xi 1558 vpxor $Xhi,$Zhi,$Xo 1559 vpxor $Xmi,$Zmi,$Zmi 1560 1561 vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing 1562 vpxor $Xo, $Zmi,$Zmi 1563 vpslldq \$8, $Zmi,$T2 1564 vpsrldq \$8, $Zmi,$Zmi 1565 vpxor $T2, $Xi, $Xi 1566 vpxor $Zmi,$Xo, $Xo 1567 1568 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase 1569 vpalignr \$8,$Xi,$Xi,$Xi 1570 vpxor $T2,$Xi,$Xi 1571 1572 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase 1573 vpalignr \$8,$Xi,$Xi,$Xi 1574 vpxor $Xo,$Xi,$Xi 1575 vpxor $T2,$Xi,$Xi 1576 1577 cmp \$0,$len 1578 jne .Lshort_avx 1579 1580 vpshufb $bswap,$Xi,$Xi 1581 vmovdqu $Xi,($Xip) 1582 vzeroupper 1583___ 1584$code.=<<___ if ($win64); 1585 movaps (%rsp),%xmm6 1586 movaps 0x10(%rsp),%xmm7 1587 movaps 0x20(%rsp),%xmm8 1588 movaps 0x30(%rsp),%xmm9 1589 movaps 0x40(%rsp),%xmm10 1590 movaps 0x50(%rsp),%xmm11 1591 movaps 0x60(%rsp),%xmm12 1592 movaps 0x70(%rsp),%xmm13 1593 movaps 0x80(%rsp),%xmm14 1594 movaps 0x90(%rsp),%xmm15 1595 lea 0xa8(%rsp),%rsp 1596.LSEH_end_gcm_ghash_avx: 1597___ 1598$code.=<<___; 1599 ret 1600.cfi_endproc 1601.size gcm_ghash_avx,.-gcm_ghash_avx 1602___ 1603} else { 1604$code.=<<___; 1605 jmp .L_ghash_clmul 1606.cfi_endproc 1607.size gcm_ghash_avx,.-gcm_ghash_avx 1608___ 1609} 1610 1611$code.=<<___; 1612.align 64 1613.Lbswap_mask: 1614 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 1615.L0x1c2_polynomial: 1616 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 1617.L7_mask: 1618 .long 7,0,7,0 1619.L7_mask_poly: 1620 .long 7,0,`0xE1<<1`,0 1621.align 64 1622.type .Lrem_4bit,\@object 1623.Lrem_4bit: 1624 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` 1625 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` 1626 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` 1627 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` 1628.type .Lrem_8bit,\@object 1629.Lrem_8bit: 1630 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E 1631 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E 1632 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E 1633 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E 1634 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E 1635 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E 1636 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E 1637 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E 1638 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE 1639 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE 1640 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE 1641 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE 1642 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E 1643 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E 1644 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE 1645 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE 1646 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E 1647 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E 1648 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E 1649 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E 1650 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E 1651 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E 1652 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E 1653 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E 1654 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE 1655 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE 1656 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE 1657 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE 1658 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E 1659 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E 1660 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE 1661 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE 1662 1663.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1664.align 64 1665___ 1666 1667# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 1668# CONTEXT *context,DISPATCHER_CONTEXT *disp) 1669if ($win64) { 1670$rec="%rcx"; 1671$frame="%rdx"; 1672$context="%r8"; 1673$disp="%r9"; 1674 1675$code.=<<___; 1676.extern __imp_RtlVirtualUnwind 1677.type se_handler,\@abi-omnipotent 1678.align 16 1679se_handler: 1680 push %rsi 1681 push %rdi 1682 push %rbx 1683 push %rbp 1684 push %r12 1685 push %r13 1686 push %r14 1687 push %r15 1688 pushfq 1689 sub \$64,%rsp 1690 1691 mov 120($context),%rax # pull context->Rax 1692 mov 248($context),%rbx # pull context->Rip 1693 1694 mov 8($disp),%rsi # disp->ImageBase 1695 mov 56($disp),%r11 # disp->HandlerData 1696 1697 mov 0(%r11),%r10d # HandlerData[0] 1698 lea (%rsi,%r10),%r10 # prologue label 1699 cmp %r10,%rbx # context->Rip<prologue label 1700 jb .Lin_prologue 1701 1702 mov 152($context),%rax # pull context->Rsp 1703 1704 mov 4(%r11),%r10d # HandlerData[1] 1705 lea (%rsi,%r10),%r10 # epilogue label 1706 cmp %r10,%rbx # context->Rip>=epilogue label 1707 jae .Lin_prologue 1708 1709 lea 48+280(%rax),%rax # adjust "rsp" 1710 1711 mov -8(%rax),%rbx 1712 mov -16(%rax),%rbp 1713 mov -24(%rax),%r12 1714 mov -32(%rax),%r13 1715 mov -40(%rax),%r14 1716 mov -48(%rax),%r15 1717 mov %rbx,144($context) # restore context->Rbx 1718 mov %rbp,160($context) # restore context->Rbp 1719 mov %r12,216($context) # restore context->R12 1720 mov %r13,224($context) # restore context->R13 1721 mov %r14,232($context) # restore context->R14 1722 mov %r15,240($context) # restore context->R15 1723 1724.Lin_prologue: 1725 mov 8(%rax),%rdi 1726 mov 16(%rax),%rsi 1727 mov %rax,152($context) # restore context->Rsp 1728 mov %rsi,168($context) # restore context->Rsi 1729 mov %rdi,176($context) # restore context->Rdi 1730 1731 mov 40($disp),%rdi # disp->ContextRecord 1732 mov $context,%rsi # context 1733 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 1734 .long 0xa548f3fc # cld; rep movsq 1735 1736 mov $disp,%rsi 1737 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1738 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1739 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1740 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1741 mov 40(%rsi),%r10 # disp->ContextRecord 1742 lea 56(%rsi),%r11 # &disp->HandlerData 1743 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1744 mov %r10,32(%rsp) # arg5 1745 mov %r11,40(%rsp) # arg6 1746 mov %r12,48(%rsp) # arg7 1747 mov %rcx,56(%rsp) # arg8, (NULL) 1748 call *__imp_RtlVirtualUnwind(%rip) 1749 1750 mov \$1,%eax # ExceptionContinueSearch 1751 add \$64,%rsp 1752 popfq 1753 pop %r15 1754 pop %r14 1755 pop %r13 1756 pop %r12 1757 pop %rbp 1758 pop %rbx 1759 pop %rdi 1760 pop %rsi 1761 ret 1762.size se_handler,.-se_handler 1763 1764.section .pdata 1765.align 4 1766 .rva .LSEH_begin_gcm_gmult_4bit 1767 .rva .LSEH_end_gcm_gmult_4bit 1768 .rva .LSEH_info_gcm_gmult_4bit 1769 1770 .rva .LSEH_begin_gcm_ghash_4bit 1771 .rva .LSEH_end_gcm_ghash_4bit 1772 .rva .LSEH_info_gcm_ghash_4bit 1773 1774 .rva .LSEH_begin_gcm_init_clmul 1775 .rva .LSEH_end_gcm_init_clmul 1776 .rva .LSEH_info_gcm_init_clmul 1777 1778 .rva .LSEH_begin_gcm_ghash_clmul 1779 .rva .LSEH_end_gcm_ghash_clmul 1780 .rva .LSEH_info_gcm_ghash_clmul 1781___ 1782$code.=<<___ if ($avx); 1783 .rva .LSEH_begin_gcm_init_avx 1784 .rva .LSEH_end_gcm_init_avx 1785 .rva .LSEH_info_gcm_init_clmul 1786 1787 .rva .LSEH_begin_gcm_ghash_avx 1788 .rva .LSEH_end_gcm_ghash_avx 1789 .rva .LSEH_info_gcm_ghash_clmul 1790___ 1791$code.=<<___; 1792.section .xdata 1793.align 8 1794.LSEH_info_gcm_gmult_4bit: 1795 .byte 9,0,0,0 1796 .rva se_handler 1797 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData 1798.LSEH_info_gcm_ghash_4bit: 1799 .byte 9,0,0,0 1800 .rva se_handler 1801 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData 1802.LSEH_info_gcm_init_clmul: 1803 .byte 0x01,0x08,0x03,0x00 1804 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 1805 .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 1806.LSEH_info_gcm_ghash_clmul: 1807 .byte 0x01,0x33,0x16,0x00 1808 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 1809 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 1810 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 1811 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 1812 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 1813 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 1814 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 1815 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 1816 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 1817 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 1818 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 1819___ 1820} 1821 1822$code =~ s/\`([^\`]*)\`/eval($1)/gem; 1823 1824print $code; 1825 1826close STDOUT or die "error closing STDOUT: $!"; 1827