#! /usr/bin/env perl
# Copyright 2005-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
# same instruction sequence used for both SHA-256 and SHA-512. In
# former case the instructions operate on 32-bit operands, while in
# latter - on 64-bit ones. All I had to do is to get one flavor right,
# the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to IA-64 implementation, which maintains
# X[16] in register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
# issue Opteron pipeline and X[16] maintained in memory. So that *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# apparently are not atomic instructions, but implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. SSSE3
# code path was not attempted for SHA512, because improvement is not
# estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with exclusion
# for VIA Nano, but it has SHA512 instruction that is faster and
# should be used instead.] For reference, corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from first block to least
# significant 128-bit halves and data from second to most significant.
# The data is then processed with same SIMD instruction sequence as
# for AVX, but with %ymm as operands. Side effect is increased stack
# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
# code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)	    7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)	    7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)	    12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
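
# This generator is normally driven by the build system rather than by hand,
# roughly as "perl sha512-x86_64.pl <flavour> <output.s>" (flavour names such
# as elf, macosx, mingw64, nasm or masm are interpreted by x86_64-xlate.pl;
# the exact invocation depends on the target, so treat the example as
# illustrative only).  The assembler/compiler probes above merely decide how
# capable the toolchain is and therefore which SIMD paths ($avx level,
# $shaext) may be emitted.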

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";


sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}
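
# For reference, ROUND_00_15/ROUND_16_XX above schedule the standard
# FIPS 180-4 compression round.  A plain-Perl sketch of one SHA-256 round
# (32-bit case; $K/$W stand for K[i] and W[i]; not used by the generator,
# shown only to document what the register juggling computes):
#
#   sub rotr32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
#   # Sigma1=(6,11,25), Sigma0=(2,13,22), as set up for SHA-256 above
#   my $S1  = rotr32($e,6) ^ rotr32($e,11) ^ rotr32($e,25);
#   my $ch  = ($e & $f) ^ (~$e & $g);		# == ((f^g)&e)^g
#   my $T1  = ($h + $S1 + $ch + $K + $W) & 0xffffffff;
#   my $S0  = rotr32($a,2) ^ rotr32($a,13) ^ rotr32($a,22);
#   my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);	# == Ch(a^b,c,b), see May 2012 note
#   ($h,$g,$f,$e,$d,$c,$b,$a) =
#	($g,$f,$e,($d+$T1)&0xffffffff,$c,$b,$a,($T1+$S0+$maj)&0xffffffff);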

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	$func,.-$func
___

if ($SZ==4) {
$code.=<<___;
.section .rodata align=64
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.previous
___
} else {
$code.=<<___;
.section .rodata align=64
.align	64
.type	$TABLE,\@object
$TABLE:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.previous
___
}

######################################################################
# SIMD code paths
#
if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
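# A note on the SHA-NI calling convention used below: sha256rnds2 keeps the
# eight state words packed across two XMM registers, roughly {A,B,E,F} and
# {C,D,G,H}, and consumes the pre-added K[i]+W[i] values two rounds at a
# time through the implicit %xmm0 operand.  The pshufd/palignr/punpcklqdq
# sequences on entry (and their mirror image on exit) only translate between
# the SHA256_CTX memory layout and that packed register layout.
#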
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	sha256_block_data_order_shaext,\@function,3
.align	64
sha256_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	nop
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	lea	0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	nop
	paddd	$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	nop
	paddd	$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$i*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	nop
	paddd	$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa	13*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd	$TMP,@MSG[2]

	movdqa	14*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa	$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	dec	$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}
{{{

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a0,$e)',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}

######################################################################
# SSSE3 code path
#
if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
.type	${func}_ssse3,\@function,3
.align	64
${func}_ssse3:
.cfi_startproc
.Lssse3_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___;
.Lprologue_ssse3:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___

$code.=<<___;
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_ssse3
.align	16
.Lloop_ssse3:
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	pshufb	$t3,@X[0]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	pshufb	$t3,@X[1]
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	pshufb	$t3,@X[2]
	paddd	@X[0],$t0
	movdqa	0x40($Tbl),$t2
	pshufb	$t3,@X[3]
	movdqa	0x60($Tbl),$t3
	paddd	@X[1],$t1
	paddd	@X[2],$t2
	paddd	@X[3],$t3
	movdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	movdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	movdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	movdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lssse3_00_47

.align	16
.Lssse3_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub Xupdate_256_SSSE3 () {
	(
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t1,$t0)',
	'&movdqa	($t2,$t0);',
	'&psrld		($t0,$sigma0[2])',
	'&paddd		(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld		($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
	'&pxor		($t0,$t2)',
	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t1)',
	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t2);',
	'&movdqa	($t2,$t3)',
	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
	'&psrld		($t3,$sigma1[2])',
	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq		($t2,$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pxor		($t3,$t2)',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld		($t3,$sigma1[2])',
	'&psrlq		($t2,$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pxor		($t3,$t2);',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	'&pshufb	($t3,$t5)',
	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
	);
}
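
# Xupdate_256_SSSE3 vectorizes the SHA-256 message schedule four words at a
# time.  In scalar terms (reference only, with sigma0=(7,18,3) and
# sigma1=(17,19,10) as set up above):
#
#   sigma0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x >> 3)
#   sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
#   W[i]      = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
#
# The extra pshufd/psrlq/pshufb shuffling is there because sigma1 of the
# last two lanes depends on words produced within the same step, so
# X[14..15] and then the freshly formed X[16..17] are handled as two
# half-width passes.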

sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    if (0) {
	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@
	&movdqa		($t0,@X[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t3,@X[3]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t1,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,$t0);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t2,$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	&pxor		($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t1);	# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	 #&pshufb	($t3,$t4);	# sigma1(X[14..15])
	&pshufd		($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrldq		($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pshufd		($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&movdqa		($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 #&pshufb	($t3,$t5);
	&pshufd		($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&pslldq		($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
    }
	&paddd		($t2,@X[0]);
	foreach (@insns) { eval; }		# remaining instructions
	&movdqa		(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_ssse3

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	${func}_ssse3,.-${func}_ssse3
___
}

if ($avx) {{
######################################################################
# XOP code path
#
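# XOP (AMD Bulldozer-family) provides genuine SIMD rotate instructions,
# vprotd/vprotq, so each Sigma/sigma rotation is a single instruction here
# instead of the shift+shift+xor emulation used in the SSSE3 and AVX paths;
# that is essentially the only difference relative to those paths.
#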
if ($SZ==8) {	# SHA512 only
$code.=<<___;
.type	${func}_xop,\@function,3
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_xop:

	vzeroupper
	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop_xop
___
	if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	} else {	# SHA512
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

$code.=<<___;
.align	16
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x50(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x60(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x70(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	add	\$`16*2*$SZ`,$Tbl
___
sub XOP_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 52 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t3,@X[7],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t2,@X[7],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_xop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
}
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	${func}_avx,\@function,3
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx:

	vzeroupper
	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
	if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	} else {	# SHA512
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x50(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x60(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x70(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	add	\$`16*2*$SZ`,$Tbl
___
sub Xupdate_512_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor		($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor		($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor		($t3,$t3,$t2)',
	'&vpxor		($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
	);
}

sub AVX_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 52 instructions

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_avx

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	'&xor	($a1,$a4)',

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
	# and at the finish one has to $a+=$a1
}
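
# bodyx_00_15 leans on BMI1/BMI2: rorx (BMI2) rotates into a separate
# destination without touching flags, and andn (BMI1) yields ~e&g in one
# instruction, which is why the dispatcher only takes .Lavx2_shortcut when
# the BMI2+AVX2+BMI1 bits are all set.  Note Ch here is computed as
# (e&f)+(~e&g) with two lea additions; since the two terms can never both
# be 1 in the same bit, this equals the usual xor form.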

$code.=<<___;
.type	${func}_avx2,\@function,3
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:

	vzeroupper
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$SZ*0($ctx),$A
	mov	$inp,%r12		# borrow $T1
	mov	$SZ*1($ctx),$B
	cmp	%rdx,$inp		# $_end
	mov	$SZ*2($ctx),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
	if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	mov	$_rsp,%rdi
.cfi_def_cfa	%rdi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rdi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
    &lea	($Tbl,16*2*$SZ."($Tbl)");
    &cmpb	(($SZ-1)."($Tbl)",0);
    &jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
    } else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp		# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	mov	$_rsp,%rdi
.cfi_def_cfa	%rdi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rdi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
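# Reading aid (an assumption about how the perlasm xlate layer encodes the
# directive, not authoritative): ".cfi_cfa_expression %rsp-8,deref,+8" stands
# for the DWARF expression
#	DW_OP_breg7(%rsp) -8; DW_OP_deref; DW_OP_plus_uconst 8
# i.e. "load the saved entry-time %rsp from -8(%rsp) and add 8" to recover
# the CFA, which is why the copy of the original %rsp is re-stored just
# below the current stack pointer (in the red zone) whenever %rsp moves.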
$code.=<<___;
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	if (($j%4)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
    &lea	($Tbl,16*2*$SZ."($Tbl)");
    &cmpb	(($SZ-1-0x80)."($Tbl)",0);
    &jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
    }
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
# restore frame pointer to original location at $_rsp
.cfi_cfa_expression	$_rsp,deref,+8

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8

.Ldone_avx2:
	mov	`16*$SZ+3*8`($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32($Tbl),%xmm6
	movaps	16*$SZ+48($Tbl),%xmm7
	movaps	16*$SZ+64($Tbl),%xmm8
	movaps	16*$SZ+80($Tbl),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96($Tbl),%xmm10
	movaps	16*$SZ+112($Tbl),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#	CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1" => 0xcc,
		"sha256msg2" => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
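# Worked example, derived from the sub above and illustrative only: a line
# such as "sha256rnds2 %xmm0,%xmm1" matches the two-register form, so it is
# rewritten as the hand-assembled encoding
#	.byte	15,56,203,200
# that is 0x0f,0x38,0xcb followed by ModR/M 0xc0|0|(1<<3)=0xc8, printed in
# decimal because join(',',...) stringifies the numeric opcode bytes.
# Operand forms that do not match the regex (e.g. a memory operand) fall
# through to the else branch and are passed to the assembler unchanged.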
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
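# Note on the output pass above (illustrative, using values that appear in
# this file): the first substitution evaluates backquoted Perl arithmetic
# embedded in the assembly text, so an operand such as `16*$SZ+3*8`($Tbl)
# is emitted as 88($Tbl) in the SHA-256 instantiation ($SZ==4) and as
# 152($Tbl) in the SHA-512 one ($SZ==8). The second substitution rewrites
# sha256* mnemonics through sha256op38(), presumably so the output still
# assembles with toolchains that lack SHA extension mnemonics.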