#! /usr/bin/env perl
# Copyright 2013-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm utilizes processor
# resources better and achieves better performance. The SHA256 instruction
# sequences(*) are taken from sha512-x86_64.pl and the AESNI code is woven
# into them. As SHA256 dominates execution time, stitch performance does
# not depend on AES key length. Below are performance numbers in cycles
# per processed byte, less is better, for standalone AESNI-CBC encrypt,
# standalone SHA256, and the stitched subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	    5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	    5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	    4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	    2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	    5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# Ryzen(***)	    2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
# Goldmont(***)	    3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop because the estimated gain was not high enough
#	to justify the effort;
# (**)	these are EVP-free results, results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
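# The script follows the usual perlasm convention, e.g. (an illustrative
# sketch; flavour names are those understood by x86_64-xlate.pl):
#
#	perl aesni-sha256-x86_64.pl elf  aesni-sha256-x86_64.s
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm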
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
$framesz=16*$SZ+8*8;
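# A hedged sketch of the calling convention (names are illustrative; the
# real caller is the EVP aes-cbc-hmac-sha256 glue, and `length' counts
# 64-byte blocks -- note the `shl \$6,$len' in the prologues below):
#
#	aesni_cbc_sha256_enc(aes_in, aes_out, blocks, &aes_key,
#			     iv, &sha_ctx, sha_in);
#
# encrypts `blocks' 64-byte units from aes_in to aes_out in CBC mode
# while hashing the same amount of data from sha_in into sha_ctx.
# With $SZ==4 the scratch frame above works out to
# $framesz = 16*4+8*8 = 128 bytes: 64 bytes of message schedule X[]
# followed by the seven saved pointers and the saved %rsp.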
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@abi-omnipotent
.align	16
$func:
.cfi_startproc
___
						if ($avx) {
$code.=<<___;
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	\$1,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10
___
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext
___
$code.=<<___;
	mov	%r10,%r11
	shr	\$32,%r11

	test	\$`1<<11`,%r10d			# check for XOP
	jnz	${func}_xop
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	${func}_avx2
___
$code.=<<___;
	and	\$`1<<28`,%r10d			# check for AVX
	jnz	${func}_avx
	ud2
___
						}
$code.=<<___;
	xor	%eax,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	ud2
.Lprobe:
	ret
.cfi_endproc
.size	$func,.-$func

.section .rodata align=64
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.previous
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
);
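# The sixteen elements above are matched one-to-one against sixteen
# consecutive SHA256 rounds via $aesni_cbc_idx, so one 16-byte AES-CBC
# block is encrypted per sixteen rounds, i.e. four blocks (64 bytes)
# per SHA256 block -- the two algorithms advance in lockstep. Note that
# the full 14-round AES sequence is scheduled regardless of key length:
# vaesenclast results are captured at the 10-, 12- and 14-round points,
# and the $mask10/$mask12/$mask14 constants, selected at setup from the
# key's round count, blend the result matching the actual round count
# into $iv.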
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&xor	($a0,$e)',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
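# body_00_15() returns one SHA256 round as a list of strings, each of
# which generates a single instruction when eval-ed; the Xupdate
# helpers below drain a queue of four such rounds a few instructions
# at a time between vector ops, e.g. (a minimal sketch of the idiom
# used throughout this file):
#
#	my @insns = (&body_00_15,&body_00_15);	# queue scalar rounds
#	&vpalignr	($t0,@X[1],@X[0],$SZ);	# one vector instruction
#	eval(shift(@insns));			# followed by a couple of
#	eval(shift(@insns));			# interleaved scalar steps
#
# This interleaving is what hides the AES and message-schedule latency
# behind the scalar SHA256 rounds.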

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type	${func}_xop,\@function,6
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax			# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame

	shl	\$6,$len
	sub	$inp,$out			# re-bias
	sub	$inp,$in0
	add	$inp,$len			# end of input

	#mov	$inp,$_inp			# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key			# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
	vzeroall

	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
	jmp	.Lloop_xop
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3				# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
	mov	%r12,$_inp			# $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
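# XOP provides a genuine vector rotate, vprotd, so each rotation by r
# is expressed directly as a left-rotate by 32-r (e.g. for sigma0's
# "ror 18" above, 8*$SZ-$sigma0[1] = 32-18 = 14); the AVX path below
# has to emulate every rotate with a shift pair and an xor instead.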

	$aesni_cbc_idx=0;
	for ($i=0,$j=0; $j<4; $j++) {
	    &XOP_256_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));			# rotate(@X)
	}
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu ($inout,"(%r12)");
	&mov	($_inp,"%r12");

	$aesni_cbc_idx=0;
	for ($i=0; $i<16; ) {
	    foreach(body_00_15()) { eval; }
	}
					}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jb	.Lloop_xop

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
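# The AVX path reuses body_00_15 unchanged: &ror is simply rebound so
# that every scalar rotate is emitted as shrd with identical
# destination and source operands (shrd \$n,%reg,%reg), which computes
# the same right-rotate.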

$code.=<<___;
.type	${func}_avx,\@function,6
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax			# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame

	shl	\$6,$len
	sub	$inp,$out			# re-bias
	sub	$inp,$in0
	add	$inp,$len			# end of input

	#mov	$inp,$_inp			# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key			# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
	vzeroall

	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3				# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
	mov	%r12,$_inp			# $a4
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[16..17])
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
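# AVX has no vector rotate, so Xupdate_256_AVX assembles each one from
# shifts, e.g. for sigma0 (with @sigma0=(7,18,3)):
#
#	sigma0(x) = (x ror 7) ^ (x ror 18) ^ (x shr 3)
#	(x ror n) = (x shr n) | (x shl 32-n)
#
# which is why every rotate above costs a shift pair plus a vpxor,
# whereas the XOP path needs a single vprotd.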

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

	$aesni_cbc_idx=0;
	for ($i=0,$j=0; $j<4; $j++) {
	    &AVX_256_00_47($j,\&body_00_15,@X);
	    push(@X,shift(@X));			# rotate(@X)
	}
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu ($inout,"(%r12)");
	&mov	($_inp,"%r12");

	$aesni_cbc_idx=0;
	for ($i=0; $i<16; ) {
	    foreach(body_00_15()) { eval; }
	}

					}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	mov	$a1,$A
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)
	jb	.Lloop_avx

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;
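# bodyx_00_15 below is the BMI2 variant of the round: rorx replaces ror
# (it writes a separate destination and leaves flags untouched), and
# Ch(e,f,g) is computed with andn as (e&f)+(~e&g), which equals the
# canonical (e&f)^(~e&g) because the two terms never have a bit set in
# the same position. The additions via lea also leave flags alone, so
# rounds schedule more freely. As the comments note, $a1 must be zero
# on entry and the caller owes a final $a+=$a1.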

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}

$code.=<<___;
.type	${func}_avx2,\@function,6
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax			# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp

	shl	\$6,$len
	sub	$inp,$out			# re-bias
	sub	$inp,$in0
	add	$inp,$len			# end of input

	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	mov	$len,$_end
	#mov	$key,$_key			# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
	vzeroall

	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	lea	-9(%r14),%r14

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13			# inp++, size optimization
	mov	$SZ*0(%r15),$A
	lea	(%rsi,%r13),%r12		# borrow $a0
	mov	$SZ*1(%r15),$B
	cmp	$len,%r13			# $_end
	mov	$SZ*2(%r15),$C
	cmove	%rsp,%r12			# next block or random data
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H
	vmovdqu	0x00-0x80($inp),$roundkey
___
					if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
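# The AVX2 path keeps two 64-byte blocks in flight per %ymm register:
# the low 128-bit lanes hold block n and the high lanes, filled by the
# vinserti128 below, hold block n+1. The cmove above (and its twin in
# the finishing code) substitutes in-frame data for the block past the
# end of the input, so the final iteration can harmlessly schedule
# "random data" in the upper lanes; the .Lower_avx2 tail then replays
# the stored schedule for the second block, which is why the main loop
# advances two blocks at a time.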

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rsi as frame pointer
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rsi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3				# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
	$aesni_cbc_idx=0;
	for ($i=0,$j=0; $j<4; $j++) {
	    &AVX2_256_00_47($j,\&bodyx_00_15,@X);
	    push(@X,shift(@X));			# rotate(@X)
	}
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq ("%r15",$offload,1);		# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r13)",$iv);		# write output
	&lea	("%r13","16(%r13)");		# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu ($inout,"(%r13)");
	&vpinsrq ($offload,$offload,"%r13",0);

	$aesni_cbc_idx=0;
	for ($i=0; $i<16; ) {
	    my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	    foreach(bodyx_00_15()) { eval; }
	}
					}
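# Because X[]+K[] material for two blocks does not fit in a fixed
# frame, the AVX2 path keeps extending the stack: every other Xupdate
# group moves %rsp down by $PUSH8 (= 8*2*$SZ = 64) bytes and stores the
# fresh schedule there, while the .Lower_avx2 tail below walks $Tbl
# back up through the same stack area to replay the second block's
# rounds. This moving frame is what the .cfi gymnastics above are
# compensating for.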
$code.=<<___;
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r12,%r13)			# write output
	lea	16(%r13),%r13

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	mov	$F,$a4
	xor	$C,$a3				# magic
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___
	$aesni_cbc_idx=0;
	for ($i=0; $i<16; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	    &lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
	}
$code.=<<___;
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	lea	(%rsi,%r13),%r12
	add	$SZ*7(%r15),$H

	cmp	$_end,%r13

	mov	$A,$SZ*0(%r15)
	cmove	%rsp,%r12			# next block or stale data
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+7*8`,deref,+8

.Ldone_avx2:
	mov	16*$SZ+4*8($Tbl),$ivp
	mov	16*$SZ+7*8($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)			# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`($Tbl),%xmm6
	movaps	`$framesz+16*1`($Tbl),%xmm7
	movaps	`$framesz+16*2`($Tbl),%xmm8
	movaps	`$framesz+16*3`($Tbl),%xmm9
	movaps	`$framesz+16*4`($Tbl),%xmm10
	movaps	`$framesz+16*5`($Tbl),%xmm11
	movaps	`$framesz+16*6`($Tbl),%xmm12
	movaps	`$framesz+16*7`($Tbl),%xmm13
	movaps	`$framesz+16*8`($Tbl),%xmm14
	movaps	`$framesz+16*9`($Tbl),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
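# $aesenc below paces AES through the SHA rounds: $r counts single
# aesenc steps, ten per 16-byte block. At $k==0 it additionally loads
# and whitens the next input block and stores the previous ciphertext,
# while $k==9 finishes the block, branching over the two or four extra
# rounds that AES-192/-256 require. One 64-byte SHA block therefore
# absorbs four AES blocks' worth of calls, just like the SIMD paths.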

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
	nop
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH

	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
___
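# Each sha256rnds2 performs two SHA256 rounds using the round-constant-
# plus-message words waiting in the low doublewords of $Wi; the
# pshufd \$0x0e that follows each one moves the high qwords down so
# the next sha256rnds2 can consume the other two rounds' worth.
# sha256msg1/sha256msg2, assisted by one palignr/paddd pair, meanwhile
# expand the next chunk of the message schedule.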
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	pshufb		$TMP,@MSG[2]
	lea		0x40($inp),$inp
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	pshufb		$TMP,@MSG[3]
	sha256msg1	@MSG[1],@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[3],$TMP
	palignr		\$4,@MSG[2],$TMP
	paddd		$TMP,@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	paddd		@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	movdqa		@MSG[0],$TMP
	palignr		\$4,@MSG[3],$TMP
	paddd		$TMP,@MSG[1]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
	&$aesenc()			if (($r%10)==0);
$code.=<<___;
	movdqa		$i*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc()			if ($r==19);
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa		13*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	paddd		$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		14*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa		$BSWAP,$TMP
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd		\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
	#pxor		$CDGH,$rndkey0	# black magic
___
	while ($r<40)	{ &$aesenc(); }	# remaining aesenc's
$code.=<<___;
	#xorps		$CDGH,$rndkey0	# black magic
	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF

	dec		$len
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movups		$iv,($ivp)		# write IV
	movdqu		$ABEF,($ctx)
	movdqu		$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	${func}_shaext,.-${func}_shaext
___
}
}}}}}
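# What follows is Win64-only structured-exception-handling support:
# se_handler compares the faulting RIP against the prologue/epilogue
# labels recorded in HandlerData[] and, when inside the function body,
# recovers the original %rsp (saved at $_rsp) so the non-volatile
# registers can be restored before RtlVirtualUnwind continues.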

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
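# sha256op38() below emits hand-assembled SHA opcodes for assemblers
# that predate the SHA extensions: rex() above builds an optional REX
# prefix (0x40 base, +0x04 for REX.R when the ModR/M reg field needs
# xmm8-15, +0x01 for REX.B when the r/m field does), after which the
# 0x0f 0x38 map, the opcode and a register-to-register ModR/M byte are
# appended. A worked example: "sha256rnds2 %xmm4,%xmm5" becomes
# .byte 0x0f,0x38,0xcb,0xec, since 0xc0|(4&7)|((5&7)<<3) = 0xec.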

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
  }
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";