#! /usr/bin/env perl
# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# November 2014
#
# ChaCha20 for x86_64.
#
# December 2016
#
# Add AVX512F code path.
#
# December 2017
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# P4		9.48/+99%	-		-
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42	    1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31	    1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40	    2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	2xSSSE3 is code path optimized specifically for 128 bytes used
#	by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes 4xXOP code path that delivers 2.20
#	and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in single thread, the corresponding capability is suppressed;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.section .rodata align=64
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.previous
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are a dying breed and old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...

	(
	"&add	(@x[$a0],@x[$b0])",	# Q1
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],16)",
	"&add	(@x[$a1],@x[$b1])",	# Q2
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],16)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],12)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],12)",

	"&add	(@x[$a0],@x[$b0])",
	"&xor	(@x[$d0],@x[$a0])",
	"&rol	(@x[$d0],8)",
	"&add	(@x[$a1],@x[$b1])",
	"&xor	(@x[$d1],@x[$a1])",
	"&rol	(@x[$d1],8)",

	"&add	($xc,@x[$d0])",
	"&xor	(@x[$b0],$xc)",
	"&rol	(@x[$b0],7)",
	"&add	($xc_,@x[$d1])",
	"&xor	(@x[$b1],$xc_)",
	"&rol	(@x[$b1],7)",

	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
	"&mov	(\"4*$c1(%rsp)\",$xc_)",
	"&mov	($xc,\"4*$c2(%rsp)\")",
	"&mov	($xc_,\"4*$c3(%rsp)\")",

	"&add	(@x[$a2],@x[$b2])",	# Q3
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],16)",
	"&add	(@x[$a3],@x[$b3])",	# Q4
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],16)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],12)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],12)",

	"&add	(@x[$a2],@x[$b2])",
	"&xor	(@x[$d2],@x[$a2])",
	"&rol	(@x[$d2],8)",
	"&add	(@x[$a3],@x[$b3])",
	"&xor	(@x[$d3],@x[$a3])",
	"&rol	(@x[$d3],8)",

	"&add	($xc,@x[$d2])",
	"&xor	(@x[$b2],$xc)",
	"&rol	(@x[$b2],7)",
	"&add	($xc_,@x[$d3])",
	"&xor	(@x[$b3],$xc_)",
	"&rol	(@x[$b3],7)"
	);
}

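# For reference only: a minimal pure-Perl sketch of the ChaCha20 quarter-round
# that every code path in this module implements (the scalar ROUND above and
# the SIMD lane rounds below). The sub name and the $rotl helper exist only
# for this sketch; nothing in the generated assembly refers to them and the
# sub is never called.
sub __chacha_quarter_round_ref {
my ($a,$b,$c,$d)=@_;			# four 32-bit state words
my $rotl = sub { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n))) & 0xffffffff; };
	$a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,16);	# a+=b; d^=a; d<<<=16
	$c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,12);	# c+=d; b^=c; b<<<=12
	$a=($a+$b)&0xffffffff; $d=$rotl->($d^$a,8);	# a+=b; d^=a; d<<<=8
	$c=($c+$d)&0xffffffff; $b=$rotl->($b^$c,7);	# c+=d; b^=c; b<<<=7
	($a,$b,$c,$d);
}
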
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
$code.=<<___;
.globl ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmp	\$0,$len
	je	.Lno_data
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
___
$code.=<<___	if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
___
$code.=<<___;
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

	#movdqa	.Lsigma(%rip),%xmm0
	movdqu	($key),%xmm1
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4

	#movdqa	%xmm0,4*0(%rsp)		# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	jmp	.Loop_outer

.align	32
.Loop_outer:
	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
	mov	4*4(%rsp),@x[4]
	mov	4*5(%rsp),@x[5]
	mov	4*6(%rsp),@x[6]
	mov	4*7(%rsp),@x[7]
	movd	%xmm3,@x[12]
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]

	mov	%rbp,64+0(%rsp)		# save len
	mov	\$10,%ebp
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	mov	%rsi,%rdi
	shr	\$32,%rdi		# "@x[9]"
	jmp	.Loop

.align	32
.Loop:
___
	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }
	&dec	("%ebp");
	&jnz	(".Loop");

$code.=<<___;
	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	@t[0],4*8(%rsp)
	mov	64(%rsp),%rbp		# load len
	movdqa	%xmm2,%xmm1
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out

	add	\$0x61707865,@x[0]	# 'expa'
	add	\$0x3320646e,@x[1]	# 'nd 3'
	add	\$0x79622d32,@x[2]	# '2-by'
	add	\$0x6b206574,@x[3]	# 'te k'
	add	4*4(%rsp),@x[4]
	add	4*5(%rsp),@x[5]
	add	4*6(%rsp),@x[6]
	add	4*7(%rsp),@x[7]
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1

	cmp	\$64,%rbp
	jb	.Ltail

	xor	4*0($inp),@x[0]		# xor with input
	xor	4*1($inp),@x[1]
	xor	4*2($inp),@x[2]
	xor	4*3($inp),@x[3]
	xor	4*4($inp),@x[4]
	xor	4*5($inp),@x[5]
	xor	4*6($inp),@x[6]
	xor	4*7($inp),@x[7]
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)

	mov	@x[0],4*0($out)		# write output
	mov	@x[1],4*1($out)
	mov	@x[2],4*2($out)
	mov	@x[3],4*3($out)
	mov	@x[4],4*4($out)
	mov	@x[5],4*5($out)
	mov	@x[6],4*6($out)
	mov	@x[7],4*7($out)
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64

	sub	\$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	mov	@x[0],4*0(%rsp)
	mov	@x[1],4*1(%rsp)
	xor	%rbx,%rbx
	mov	@x[2],4*2(%rsp)
	mov	@x[3],4*3(%rsp)
	mov	@x[4],4*4(%rsp)
	mov	@x[5],4*5(%rsp)
	mov	@x[6],4*6(%rsp)
	mov	@x[7],4*7(%rsp)
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)

.Loop_tail:
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
	dec	%rbp
	jnz	.Loop_tail

.Ldone:
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

########################################################################
# SSSE3 code path that handles shorter lengths
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot16);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,20);
	&pslld	($t,12);
	&por	($b,$t);

	&paddd	($a,$b);
	&pxor	($d,$a);
	&pshufb	($d,$rot24);

	&paddd	($c,$d);
	&pxor	($b,$c);
	&movdqa	($t,$b);
	&psrld	($b,25);
	&pslld	($t,7);
	&por	($b,$t);
}

my $xframe = $win64 ? 160+8 : 8;

$code.=<<___;
.type	ChaCha20_ssse3,\@function,5
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
___
$code.=<<___	if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
___
$code.=<<___;
	cmp	\$128,$len		# we might throw away some data,
	je	.LChaCha20_128
	ja	.LChaCha20_4x		# but overall it won't be slower

.Ldo_sse3_after_all:
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),$d
	movdqa	0x00(%rsp),$a
	movdqa	0x10(%rsp),$b
	movdqa	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	mov	\$10,$counter
	movdqa	$d,0x30(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
___
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_ssse3");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d

	cmp	\$64,$len
	jb	.Ltail_ssse3

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	lea	0x40($inp),$inp		# inp+=64
	pxor	$t,$c
	pxor	$t1,$d

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	lea	0x40($out),$out		# out+=64

	sub	\$64,$len
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	$a,0x00(%rsp)
	movdqa	$b,0x10(%rsp)
	movdqa	$c,0x20(%rsp)
	movdqa	$d,0x30(%rsp)
	xor	$counter,$counter

.Loop_tail_ssse3:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
___
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
___
}

########################################################################
# SSSE3 code path that handles 128-byte inputs
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

sub SSSE3ROUND_2x {
	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot16);
	 &pshufb($d1,$rot16);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,20);
	 &movdqa($t1,$b1);
	&pslld	($t,12);
	 &psrld	($b1,20);
	&por	($b,$t);
	 &pslld	($t1,12);
	 &por	($b1,$t1);

	&paddd	($a,$b);
	&pxor	($d,$a);
	 &paddd	($a1,$b1);
	 &pxor	($d1,$a1);
	&pshufb	($d,$rot24);
	 &pshufb($d1,$rot24);

	&paddd	($c,$d);
	 &paddd	($c1,$d1);
	&pxor	($b,$c);
	 &pxor	($b1,$c1);
	&movdqa	($t,$b);
	&psrld	($b,25);
	 &movdqa($t1,$b1);
	&pslld	($t,7);
	 &psrld	($b1,25);
	&por	($b,$t);
	 &pslld	($t1,7);
	 &por	($b1,$t1);
}

my $xframe = $win64 ? 0x68 : 8;

$code.=<<___;
.type	ChaCha20_128,\@function,5
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
.L128_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$a
	movdqu	($key),$b
	movdqu	16($key),$c
	movdqu	($counter),$d
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24

	movdqa	$a,$a1
	movdqa	$a,0x00(%rsp)
	movdqa	$b,$b1
	movdqa	$b,0x10(%rsp)
	movdqa	$c,$c1
	movdqa	$c,0x20(%rsp)
	paddd	$d,$d1
	movdqa	$d,0x30(%rsp)
	mov	\$10,$counter		# reuse $counter
	jmp	.Loop_128

.align	32
.Loop_128:
___
	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);

	&SSSE3ROUND_2x();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_128");

$code.=<<___;
	paddd	0x00(%rsp),$a
	paddd	0x10(%rsp),$b
	paddd	0x20(%rsp),$c
	paddd	0x30(%rsp),$d
	paddd	.Lone(%rip),$d1
	paddd	0x00(%rsp),$a1
	paddd	0x10(%rsp),$b1
	paddd	0x20(%rsp),$c1
	paddd	0x30(%rsp),$d1

	movdqu	0x00($inp),$t
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x20($inp),$t
	pxor	$t1,$b
	movdqu	0x30($inp),$t1
	pxor	$t,$c
	movdqu	0x40($inp),$t
	pxor	$t1,$d
	movdqu	0x50($inp),$t1
	pxor	$t,$a1
	movdqu	0x60($inp),$t
	pxor	$t1,$b1
	movdqu	0x70($inp),$t1
	pxor	$t,$c1
	pxor	$t1,$d1

	movdqu	$a,0x00($out)		# write output
	movdqu	$b,0x10($out)
	movdqu	$c,0x20($out)
	movdqu	$d,0x30($out)
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
___
$code.=<<___	if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
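# The 4x path below keeps four independent 64-byte blocks in flight: each XMM
# register holds the same state word from four consecutive blocks, one block
# per 32-bit lane ("smashed by lanes"), with .Linc providing the per-lane
# counter offsets 0..3. After the rounds, the four keystream blocks are
# transposed ("de-interlaced") back to their natural layout before being
# XORed with the input.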
780{ 781# assign variables to favor Atom front-end 782my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, 783 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); 784my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 785 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 786 787sub SSSE3_lane_ROUND { 788my ($a0,$b0,$c0,$d0)=@_; 789my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 790my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 791my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 792my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 793my @x=map("\"$_\"",@xx); 794 795 # Consider order in which variables are addressed by their 796 # index: 797 # 798 # a b c d 799 # 800 # 0 4 8 12 < even round 801 # 1 5 9 13 802 # 2 6 10 14 803 # 3 7 11 15 804 # 0 5 10 15 < odd round 805 # 1 6 11 12 806 # 2 7 8 13 807 # 3 4 9 14 808 # 809 # 'a', 'b' and 'd's are permanently allocated in registers, 810 # @x[0..7,12..15], while 'c's are maintained in memory. If 811 # you observe 'c' column, you'll notice that pair of 'c's is 812 # invariant between rounds. This means that we have to reload 813 # them once per round, in the middle. This is why you'll see 814 # bunch of 'c' stores and loads in the middle, but none in 815 # the beginning or end. 816 817 ( 818 "&paddd (@x[$a0],@x[$b0])", # Q1 819 "&paddd (@x[$a1],@x[$b1])", # Q2 820 "&pxor (@x[$d0],@x[$a0])", 821 "&pxor (@x[$d1],@x[$a1])", 822 "&pshufb (@x[$d0],$t1)", 823 "&pshufb (@x[$d1],$t1)", 824 825 "&paddd ($xc,@x[$d0])", 826 "&paddd ($xc_,@x[$d1])", 827 "&pxor (@x[$b0],$xc)", 828 "&pxor (@x[$b1],$xc_)", 829 "&movdqa ($t0,@x[$b0])", 830 "&pslld (@x[$b0],12)", 831 "&psrld ($t0,20)", 832 "&movdqa ($t1,@x[$b1])", 833 "&pslld (@x[$b1],12)", 834 "&por (@x[$b0],$t0)", 835 "&psrld ($t1,20)", 836 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 837 "&por (@x[$b1],$t1)", 838 839 "&paddd (@x[$a0],@x[$b0])", 840 "&paddd (@x[$a1],@x[$b1])", 841 "&pxor (@x[$d0],@x[$a0])", 842 "&pxor (@x[$d1],@x[$a1])", 843 "&pshufb (@x[$d0],$t0)", 844 "&pshufb (@x[$d1],$t0)", 845 846 "&paddd ($xc,@x[$d0])", 847 "&paddd ($xc_,@x[$d1])", 848 "&pxor (@x[$b0],$xc)", 849 "&pxor (@x[$b1],$xc_)", 850 "&movdqa ($t1,@x[$b0])", 851 "&pslld (@x[$b0],7)", 852 "&psrld ($t1,25)", 853 "&movdqa ($t0,@x[$b1])", 854 "&pslld (@x[$b1],7)", 855 "&por (@x[$b0],$t1)", 856 "&psrld ($t0,25)", 857 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 858 "&por (@x[$b1],$t0)", 859 860 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 861 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", 862 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", 863 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", 864 865 "&paddd (@x[$a2],@x[$b2])", # Q3 866 "&paddd (@x[$a3],@x[$b3])", # Q4 867 "&pxor (@x[$d2],@x[$a2])", 868 "&pxor (@x[$d3],@x[$a3])", 869 "&pshufb (@x[$d2],$t1)", 870 "&pshufb (@x[$d3],$t1)", 871 872 "&paddd ($xc,@x[$d2])", 873 "&paddd ($xc_,@x[$d3])", 874 "&pxor (@x[$b2],$xc)", 875 "&pxor (@x[$b3],$xc_)", 876 "&movdqa ($t0,@x[$b2])", 877 "&pslld (@x[$b2],12)", 878 "&psrld ($t0,20)", 879 "&movdqa ($t1,@x[$b3])", 880 "&pslld (@x[$b3],12)", 881 "&por (@x[$b2],$t0)", 882 "&psrld ($t1,20)", 883 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) 884 "&por (@x[$b3],$t1)", 885 886 "&paddd (@x[$a2],@x[$b2])", 887 "&paddd (@x[$a3],@x[$b3])", 888 "&pxor (@x[$d2],@x[$a2])", 889 "&pxor (@x[$d3],@x[$a3])", 890 "&pshufb (@x[$d2],$t0)", 891 "&pshufb (@x[$d3],$t0)", 892 893 "&paddd ($xc,@x[$d2])", 894 "&paddd ($xc_,@x[$d3])", 895 "&pxor (@x[$b2],$xc)", 896 "&pxor (@x[$b3],$xc_)", 897 "&movdqa ($t1,@x[$b2])", 898 "&pslld 
(@x[$b2],7)", 899 "&psrld ($t1,25)", 900 "&movdqa ($t0,@x[$b3])", 901 "&pslld (@x[$b3],7)", 902 "&por (@x[$b2],$t1)", 903 "&psrld ($t0,25)", 904 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) 905 "&por (@x[$b3],$t0)" 906 ); 907} 908 909my $xframe = $win64 ? 0xa8 : 8; 910 911$code.=<<___; 912.type ChaCha20_4x,\@function,5 913.align 32 914ChaCha20_4x: 915.cfi_startproc 916.LChaCha20_4x: 917 mov %rsp,%r9 # frame pointer 918.cfi_def_cfa_register %r9 919 mov %r10,%r11 920___ 921$code.=<<___ if ($avx>1); 922 shr \$32,%r10 # OPENSSL_ia32cap_P+8 923 test \$`1<<5`,%r10 # test AVX2 924 jnz .LChaCha20_8x 925___ 926$code.=<<___; 927 cmp \$192,$len 928 ja .Lproceed4x 929 930 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE 931 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE 932 je .Ldo_sse3_after_all # to detect Atom 933 934.Lproceed4x: 935 sub \$0x140+$xframe,%rsp 936___ 937 ################ stack layout 938 # +0x00 SIMD equivalent of @x[8-12] 939 # ... 940 # +0x40 constant copy of key[0-2] smashed by lanes 941 # ... 942 # +0x100 SIMD counters (with nonce smashed by lanes) 943 # ... 944 # +0x140 945$code.=<<___ if ($win64); 946 movaps %xmm6,-0xa8(%r9) 947 movaps %xmm7,-0x98(%r9) 948 movaps %xmm8,-0x88(%r9) 949 movaps %xmm9,-0x78(%r9) 950 movaps %xmm10,-0x68(%r9) 951 movaps %xmm11,-0x58(%r9) 952 movaps %xmm12,-0x48(%r9) 953 movaps %xmm13,-0x38(%r9) 954 movaps %xmm14,-0x28(%r9) 955 movaps %xmm15,-0x18(%r9) 956.L4x_body: 957___ 958$code.=<<___; 959 movdqa .Lsigma(%rip),$xa3 # key[0] 960 movdqu ($key),$xb3 # key[1] 961 movdqu 16($key),$xt3 # key[2] 962 movdqu ($counter),$xd3 # key[3] 963 lea 0x100(%rsp),%rcx # size optimization 964 lea .Lrot16(%rip),%r10 965 lea .Lrot24(%rip),%r11 966 967 pshufd \$0x00,$xa3,$xa0 # smash key by lanes... 968 pshufd \$0x55,$xa3,$xa1 969 movdqa $xa0,0x40(%rsp) # ... 
and offload 970 pshufd \$0xaa,$xa3,$xa2 971 movdqa $xa1,0x50(%rsp) 972 pshufd \$0xff,$xa3,$xa3 973 movdqa $xa2,0x60(%rsp) 974 movdqa $xa3,0x70(%rsp) 975 976 pshufd \$0x00,$xb3,$xb0 977 pshufd \$0x55,$xb3,$xb1 978 movdqa $xb0,0x80-0x100(%rcx) 979 pshufd \$0xaa,$xb3,$xb2 980 movdqa $xb1,0x90-0x100(%rcx) 981 pshufd \$0xff,$xb3,$xb3 982 movdqa $xb2,0xa0-0x100(%rcx) 983 movdqa $xb3,0xb0-0x100(%rcx) 984 985 pshufd \$0x00,$xt3,$xt0 # "$xc0" 986 pshufd \$0x55,$xt3,$xt1 # "$xc1" 987 movdqa $xt0,0xc0-0x100(%rcx) 988 pshufd \$0xaa,$xt3,$xt2 # "$xc2" 989 movdqa $xt1,0xd0-0x100(%rcx) 990 pshufd \$0xff,$xt3,$xt3 # "$xc3" 991 movdqa $xt2,0xe0-0x100(%rcx) 992 movdqa $xt3,0xf0-0x100(%rcx) 993 994 pshufd \$0x00,$xd3,$xd0 995 pshufd \$0x55,$xd3,$xd1 996 paddd .Linc(%rip),$xd0 # don't save counters yet 997 pshufd \$0xaa,$xd3,$xd2 998 movdqa $xd1,0x110-0x100(%rcx) 999 pshufd \$0xff,$xd3,$xd3 1000 movdqa $xd2,0x120-0x100(%rcx) 1001 movdqa $xd3,0x130-0x100(%rcx) 1002 1003 jmp .Loop_enter4x 1004 1005.align 32 1006.Loop_outer4x: 1007 movdqa 0x40(%rsp),$xa0 # re-load smashed key 1008 movdqa 0x50(%rsp),$xa1 1009 movdqa 0x60(%rsp),$xa2 1010 movdqa 0x70(%rsp),$xa3 1011 movdqa 0x80-0x100(%rcx),$xb0 1012 movdqa 0x90-0x100(%rcx),$xb1 1013 movdqa 0xa0-0x100(%rcx),$xb2 1014 movdqa 0xb0-0x100(%rcx),$xb3 1015 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1016 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1017 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1018 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1019 movdqa 0x100-0x100(%rcx),$xd0 1020 movdqa 0x110-0x100(%rcx),$xd1 1021 movdqa 0x120-0x100(%rcx),$xd2 1022 movdqa 0x130-0x100(%rcx),$xd3 1023 paddd .Lfour(%rip),$xd0 # next SIMD counters 1024 1025.Loop_enter4x: 1026 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" 1027 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" 1028 movdqa (%r10),$xt3 # .Lrot16(%rip) 1029 mov \$10,%eax 1030 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1031 jmp .Loop4x 1032 1033.align 32 1034.Loop4x: 1035___ 1036 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } 1037 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } 1038$code.=<<___; 1039 dec %eax 1040 jnz .Loop4x 1041 1042 paddd 0x40(%rsp),$xa0 # accumulate key material 1043 paddd 0x50(%rsp),$xa1 1044 paddd 0x60(%rsp),$xa2 1045 paddd 0x70(%rsp),$xa3 1046 1047 movdqa $xa0,$xt2 # "de-interlace" data 1048 punpckldq $xa1,$xa0 1049 movdqa $xa2,$xt3 1050 punpckldq $xa3,$xa2 1051 punpckhdq $xa1,$xt2 1052 punpckhdq $xa3,$xt3 1053 movdqa $xa0,$xa1 1054 punpcklqdq $xa2,$xa0 # "a0" 1055 movdqa $xt2,$xa3 1056 punpcklqdq $xt3,$xt2 # "a2" 1057 punpckhqdq $xa2,$xa1 # "a1" 1058 punpckhqdq $xt3,$xa3 # "a3" 1059___ 1060 ($xa2,$xt2)=($xt2,$xa2); 1061$code.=<<___; 1062 paddd 0x80-0x100(%rcx),$xb0 1063 paddd 0x90-0x100(%rcx),$xb1 1064 paddd 0xa0-0x100(%rcx),$xb2 1065 paddd 0xb0-0x100(%rcx),$xb3 1066 1067 movdqa $xa0,0x00(%rsp) # offload $xaN 1068 movdqa $xa1,0x10(%rsp) 1069 movdqa 0x20(%rsp),$xa0 # "xc2" 1070 movdqa 0x30(%rsp),$xa1 # "xc3" 1071 1072 movdqa $xb0,$xt2 1073 punpckldq $xb1,$xb0 1074 movdqa $xb2,$xt3 1075 punpckldq $xb3,$xb2 1076 punpckhdq $xb1,$xt2 1077 punpckhdq $xb3,$xt3 1078 movdqa $xb0,$xb1 1079 punpcklqdq $xb2,$xb0 # "b0" 1080 movdqa $xt2,$xb3 1081 punpcklqdq $xt3,$xt2 # "b2" 1082 punpckhqdq $xb2,$xb1 # "b1" 1083 punpckhqdq $xt3,$xb3 # "b3" 1084___ 1085 ($xb2,$xt2)=($xt2,$xb2); 1086 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1087$code.=<<___; 1088 paddd 0xc0-0x100(%rcx),$xc0 1089 paddd 0xd0-0x100(%rcx),$xc1 1090 paddd 0xe0-0x100(%rcx),$xc2 1091 paddd 0xf0-0x100(%rcx),$xc3 1092 1093 movdqa $xa2,0x20(%rsp) # keep 
offloading $xaN 1094 movdqa $xa3,0x30(%rsp) 1095 1096 movdqa $xc0,$xt2 1097 punpckldq $xc1,$xc0 1098 movdqa $xc2,$xt3 1099 punpckldq $xc3,$xc2 1100 punpckhdq $xc1,$xt2 1101 punpckhdq $xc3,$xt3 1102 movdqa $xc0,$xc1 1103 punpcklqdq $xc2,$xc0 # "c0" 1104 movdqa $xt2,$xc3 1105 punpcklqdq $xt3,$xt2 # "c2" 1106 punpckhqdq $xc2,$xc1 # "c1" 1107 punpckhqdq $xt3,$xc3 # "c3" 1108___ 1109 ($xc2,$xt2)=($xt2,$xc2); 1110 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary 1111$code.=<<___; 1112 paddd 0x100-0x100(%rcx),$xd0 1113 paddd 0x110-0x100(%rcx),$xd1 1114 paddd 0x120-0x100(%rcx),$xd2 1115 paddd 0x130-0x100(%rcx),$xd3 1116 1117 movdqa $xd0,$xt2 1118 punpckldq $xd1,$xd0 1119 movdqa $xd2,$xt3 1120 punpckldq $xd3,$xd2 1121 punpckhdq $xd1,$xt2 1122 punpckhdq $xd3,$xt3 1123 movdqa $xd0,$xd1 1124 punpcklqdq $xd2,$xd0 # "d0" 1125 movdqa $xt2,$xd3 1126 punpcklqdq $xt3,$xt2 # "d2" 1127 punpckhqdq $xd2,$xd1 # "d1" 1128 punpckhqdq $xt3,$xd3 # "d3" 1129___ 1130 ($xd2,$xt2)=($xt2,$xd2); 1131$code.=<<___; 1132 cmp \$64*4,$len 1133 jb .Ltail4x 1134 1135 movdqu 0x00($inp),$xt0 # xor with input 1136 movdqu 0x10($inp),$xt1 1137 movdqu 0x20($inp),$xt2 1138 movdqu 0x30($inp),$xt3 1139 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1140 pxor $xb0,$xt1 1141 pxor $xc0,$xt2 1142 pxor $xd0,$xt3 1143 1144 movdqu $xt0,0x00($out) 1145 movdqu 0x40($inp),$xt0 1146 movdqu $xt1,0x10($out) 1147 movdqu 0x50($inp),$xt1 1148 movdqu $xt2,0x20($out) 1149 movdqu 0x60($inp),$xt2 1150 movdqu $xt3,0x30($out) 1151 movdqu 0x70($inp),$xt3 1152 lea 0x80($inp),$inp # size optimization 1153 pxor 0x10(%rsp),$xt0 1154 pxor $xb1,$xt1 1155 pxor $xc1,$xt2 1156 pxor $xd1,$xt3 1157 1158 movdqu $xt0,0x40($out) 1159 movdqu 0x00($inp),$xt0 1160 movdqu $xt1,0x50($out) 1161 movdqu 0x10($inp),$xt1 1162 movdqu $xt2,0x60($out) 1163 movdqu 0x20($inp),$xt2 1164 movdqu $xt3,0x70($out) 1165 lea 0x80($out),$out # size optimization 1166 movdqu 0x30($inp),$xt3 1167 pxor 0x20(%rsp),$xt0 1168 pxor $xb2,$xt1 1169 pxor $xc2,$xt2 1170 pxor $xd2,$xt3 1171 1172 movdqu $xt0,0x00($out) 1173 movdqu 0x40($inp),$xt0 1174 movdqu $xt1,0x10($out) 1175 movdqu 0x50($inp),$xt1 1176 movdqu $xt2,0x20($out) 1177 movdqu 0x60($inp),$xt2 1178 movdqu $xt3,0x30($out) 1179 movdqu 0x70($inp),$xt3 1180 lea 0x80($inp),$inp # inp+=64*4 1181 pxor 0x30(%rsp),$xt0 1182 pxor $xb3,$xt1 1183 pxor $xc3,$xt2 1184 pxor $xd3,$xt3 1185 movdqu $xt0,0x40($out) 1186 movdqu $xt1,0x50($out) 1187 movdqu $xt2,0x60($out) 1188 movdqu $xt3,0x70($out) 1189 lea 0x80($out),$out # out+=64*4 1190 1191 sub \$64*4,$len 1192 jnz .Loop_outer4x 1193 1194 jmp .Ldone4x 1195 1196.Ltail4x: 1197 cmp \$192,$len 1198 jae .L192_or_more4x 1199 cmp \$128,$len 1200 jae .L128_or_more4x 1201 cmp \$64,$len 1202 jae .L64_or_more4x 1203 1204 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1205 xor %r10,%r10 1206 #movdqa $xt0,0x00(%rsp) 1207 movdqa $xb0,0x10(%rsp) 1208 movdqa $xc0,0x20(%rsp) 1209 movdqa $xd0,0x30(%rsp) 1210 jmp .Loop_tail4x 1211 1212.align 32 1213.L64_or_more4x: 1214 movdqu 0x00($inp),$xt0 # xor with input 1215 movdqu 0x10($inp),$xt1 1216 movdqu 0x20($inp),$xt2 1217 movdqu 0x30($inp),$xt3 1218 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? 1219 pxor $xb0,$xt1 1220 pxor $xc0,$xt2 1221 pxor $xd0,$xt3 1222 movdqu $xt0,0x00($out) 1223 movdqu $xt1,0x10($out) 1224 movdqu $xt2,0x20($out) 1225 movdqu $xt3,0x30($out) 1226 je .Ldone4x 1227 1228 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? 
1229 lea 0x40($inp),$inp # inp+=64*1 1230 xor %r10,%r10 1231 movdqa $xt0,0x00(%rsp) 1232 movdqa $xb1,0x10(%rsp) 1233 lea 0x40($out),$out # out+=64*1 1234 movdqa $xc1,0x20(%rsp) 1235 sub \$64,$len # len-=64*1 1236 movdqa $xd1,0x30(%rsp) 1237 jmp .Loop_tail4x 1238 1239.align 32 1240.L128_or_more4x: 1241 movdqu 0x00($inp),$xt0 # xor with input 1242 movdqu 0x10($inp),$xt1 1243 movdqu 0x20($inp),$xt2 1244 movdqu 0x30($inp),$xt3 1245 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1246 pxor $xb0,$xt1 1247 pxor $xc0,$xt2 1248 pxor $xd0,$xt3 1249 1250 movdqu $xt0,0x00($out) 1251 movdqu 0x40($inp),$xt0 1252 movdqu $xt1,0x10($out) 1253 movdqu 0x50($inp),$xt1 1254 movdqu $xt2,0x20($out) 1255 movdqu 0x60($inp),$xt2 1256 movdqu $xt3,0x30($out) 1257 movdqu 0x70($inp),$xt3 1258 pxor 0x10(%rsp),$xt0 1259 pxor $xb1,$xt1 1260 pxor $xc1,$xt2 1261 pxor $xd1,$xt3 1262 movdqu $xt0,0x40($out) 1263 movdqu $xt1,0x50($out) 1264 movdqu $xt2,0x60($out) 1265 movdqu $xt3,0x70($out) 1266 je .Ldone4x 1267 1268 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? 1269 lea 0x80($inp),$inp # inp+=64*2 1270 xor %r10,%r10 1271 movdqa $xt0,0x00(%rsp) 1272 movdqa $xb2,0x10(%rsp) 1273 lea 0x80($out),$out # out+=64*2 1274 movdqa $xc2,0x20(%rsp) 1275 sub \$128,$len # len-=64*2 1276 movdqa $xd2,0x30(%rsp) 1277 jmp .Loop_tail4x 1278 1279.align 32 1280.L192_or_more4x: 1281 movdqu 0x00($inp),$xt0 # xor with input 1282 movdqu 0x10($inp),$xt1 1283 movdqu 0x20($inp),$xt2 1284 movdqu 0x30($inp),$xt3 1285 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 1286 pxor $xb0,$xt1 1287 pxor $xc0,$xt2 1288 pxor $xd0,$xt3 1289 1290 movdqu $xt0,0x00($out) 1291 movdqu 0x40($inp),$xt0 1292 movdqu $xt1,0x10($out) 1293 movdqu 0x50($inp),$xt1 1294 movdqu $xt2,0x20($out) 1295 movdqu 0x60($inp),$xt2 1296 movdqu $xt3,0x30($out) 1297 movdqu 0x70($inp),$xt3 1298 lea 0x80($inp),$inp # size optimization 1299 pxor 0x10(%rsp),$xt0 1300 pxor $xb1,$xt1 1301 pxor $xc1,$xt2 1302 pxor $xd1,$xt3 1303 1304 movdqu $xt0,0x40($out) 1305 movdqu 0x00($inp),$xt0 1306 movdqu $xt1,0x50($out) 1307 movdqu 0x10($inp),$xt1 1308 movdqu $xt2,0x60($out) 1309 movdqu 0x20($inp),$xt2 1310 movdqu $xt3,0x70($out) 1311 lea 0x80($out),$out # size optimization 1312 movdqu 0x30($inp),$xt3 1313 pxor 0x20(%rsp),$xt0 1314 pxor $xb2,$xt1 1315 pxor $xc2,$xt2 1316 pxor $xd2,$xt3 1317 movdqu $xt0,0x00($out) 1318 movdqu $xt1,0x10($out) 1319 movdqu $xt2,0x20($out) 1320 movdqu $xt3,0x30($out) 1321 je .Ldone4x 1322 1323 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? 
1324 lea 0x40($inp),$inp # inp+=64*3 1325 xor %r10,%r10 1326 movdqa $xt0,0x00(%rsp) 1327 movdqa $xb3,0x10(%rsp) 1328 lea 0x40($out),$out # out+=64*3 1329 movdqa $xc3,0x20(%rsp) 1330 sub \$192,$len # len-=64*3 1331 movdqa $xd3,0x30(%rsp) 1332 1333.Loop_tail4x: 1334 movzb ($inp,%r10),%eax 1335 movzb (%rsp,%r10),%ecx 1336 lea 1(%r10),%r10 1337 xor %ecx,%eax 1338 mov %al,-1($out,%r10) 1339 dec $len 1340 jnz .Loop_tail4x 1341 1342.Ldone4x: 1343___ 1344$code.=<<___ if ($win64); 1345 movaps -0xa8(%r9),%xmm6 1346 movaps -0x98(%r9),%xmm7 1347 movaps -0x88(%r9),%xmm8 1348 movaps -0x78(%r9),%xmm9 1349 movaps -0x68(%r9),%xmm10 1350 movaps -0x58(%r9),%xmm11 1351 movaps -0x48(%r9),%xmm12 1352 movaps -0x38(%r9),%xmm13 1353 movaps -0x28(%r9),%xmm14 1354 movaps -0x18(%r9),%xmm15 1355___ 1356$code.=<<___; 1357 lea (%r9),%rsp 1358.cfi_def_cfa_register %rsp 1359.L4x_epilogue: 1360 ret 1361.cfi_endproc 1362.size ChaCha20_4x,.-ChaCha20_4x 1363___ 1364} 1365 1366######################################################################## 1367# XOP code path that handles all lengths. 1368if ($avx) { 1369# There is some "anomaly" observed depending on instructions' size or 1370# alignment. If you look closely at below code you'll notice that 1371# sometimes argument order varies. The order affects instruction 1372# encoding by making it larger, and such fiddling gives 5% performance 1373# improvement. This is on FX-4100... 1374 1375my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, 1376 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); 1377my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 1378 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); 1379 1380sub XOP_lane_ROUND { 1381my ($a0,$b0,$c0,$d0)=@_; 1382my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 1383my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 1384my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 1385my @x=map("\"$_\"",@xx); 1386 1387 ( 1388 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 1389 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 1390 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 1391 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 1392 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1393 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1394 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1395 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1396 "&vprotd (@x[$d0],@x[$d0],16)", 1397 "&vprotd (@x[$d1],@x[$d1],16)", 1398 "&vprotd (@x[$d2],@x[$d2],16)", 1399 "&vprotd (@x[$d3],@x[$d3],16)", 1400 1401 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 1402 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 1403 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 1404 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 1405 "&vpxor (@x[$b0],@x[$c0],@x[$b0])", 1406 "&vpxor (@x[$b1],@x[$c1],@x[$b1])", 1407 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip 1408 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip 1409 "&vprotd (@x[$b0],@x[$b0],12)", 1410 "&vprotd (@x[$b1],@x[$b1],12)", 1411 "&vprotd (@x[$b2],@x[$b2],12)", 1412 "&vprotd (@x[$b3],@x[$b3],12)", 1413 1414 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip 1415 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip 1416 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 1417 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 1418 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1419 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1420 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1421 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1422 "&vprotd (@x[$d0],@x[$d0],8)", 1423 "&vprotd (@x[$d1],@x[$d1],8)", 1424 "&vprotd (@x[$d2],@x[$d2],8)", 1425 "&vprotd (@x[$d3],@x[$d3],8)", 1426 1427 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 1428 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 1429 "&vpaddd 
(@x[$c2],@x[$c2],@x[$d2])", 1430 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 1431 "&vpxor (@x[$b0],@x[$c0],@x[$b0])", 1432 "&vpxor (@x[$b1],@x[$c1],@x[$b1])", 1433 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip 1434 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip 1435 "&vprotd (@x[$b0],@x[$b0],7)", 1436 "&vprotd (@x[$b1],@x[$b1],7)", 1437 "&vprotd (@x[$b2],@x[$b2],7)", 1438 "&vprotd (@x[$b3],@x[$b3],7)" 1439 ); 1440} 1441 1442my $xframe = $win64 ? 0xa8 : 8; 1443 1444$code.=<<___; 1445.type ChaCha20_4xop,\@function,5 1446.align 32 1447ChaCha20_4xop: 1448.cfi_startproc 1449.LChaCha20_4xop: 1450 mov %rsp,%r9 # frame pointer 1451.cfi_def_cfa_register %r9 1452 sub \$0x140+$xframe,%rsp 1453___ 1454 ################ stack layout 1455 # +0x00 SIMD equivalent of @x[8-12] 1456 # ... 1457 # +0x40 constant copy of key[0-2] smashed by lanes 1458 # ... 1459 # +0x100 SIMD counters (with nonce smashed by lanes) 1460 # ... 1461 # +0x140 1462$code.=<<___ if ($win64); 1463 movaps %xmm6,-0xa8(%r9) 1464 movaps %xmm7,-0x98(%r9) 1465 movaps %xmm8,-0x88(%r9) 1466 movaps %xmm9,-0x78(%r9) 1467 movaps %xmm10,-0x68(%r9) 1468 movaps %xmm11,-0x58(%r9) 1469 movaps %xmm12,-0x48(%r9) 1470 movaps %xmm13,-0x38(%r9) 1471 movaps %xmm14,-0x28(%r9) 1472 movaps %xmm15,-0x18(%r9) 1473.L4xop_body: 1474___ 1475$code.=<<___; 1476 vzeroupper 1477 1478 vmovdqa .Lsigma(%rip),$xa3 # key[0] 1479 vmovdqu ($key),$xb3 # key[1] 1480 vmovdqu 16($key),$xt3 # key[2] 1481 vmovdqu ($counter),$xd3 # key[3] 1482 lea 0x100(%rsp),%rcx # size optimization 1483 1484 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1485 vpshufd \$0x55,$xa3,$xa1 1486 vmovdqa $xa0,0x40(%rsp) # ... and offload 1487 vpshufd \$0xaa,$xa3,$xa2 1488 vmovdqa $xa1,0x50(%rsp) 1489 vpshufd \$0xff,$xa3,$xa3 1490 vmovdqa $xa2,0x60(%rsp) 1491 vmovdqa $xa3,0x70(%rsp) 1492 1493 vpshufd \$0x00,$xb3,$xb0 1494 vpshufd \$0x55,$xb3,$xb1 1495 vmovdqa $xb0,0x80-0x100(%rcx) 1496 vpshufd \$0xaa,$xb3,$xb2 1497 vmovdqa $xb1,0x90-0x100(%rcx) 1498 vpshufd \$0xff,$xb3,$xb3 1499 vmovdqa $xb2,0xa0-0x100(%rcx) 1500 vmovdqa $xb3,0xb0-0x100(%rcx) 1501 1502 vpshufd \$0x00,$xt3,$xt0 # "$xc0" 1503 vpshufd \$0x55,$xt3,$xt1 # "$xc1" 1504 vmovdqa $xt0,0xc0-0x100(%rcx) 1505 vpshufd \$0xaa,$xt3,$xt2 # "$xc2" 1506 vmovdqa $xt1,0xd0-0x100(%rcx) 1507 vpshufd \$0xff,$xt3,$xt3 # "$xc3" 1508 vmovdqa $xt2,0xe0-0x100(%rcx) 1509 vmovdqa $xt3,0xf0-0x100(%rcx) 1510 1511 vpshufd \$0x00,$xd3,$xd0 1512 vpshufd \$0x55,$xd3,$xd1 1513 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet 1514 vpshufd \$0xaa,$xd3,$xd2 1515 vmovdqa $xd1,0x110-0x100(%rcx) 1516 vpshufd \$0xff,$xd3,$xd3 1517 vmovdqa $xd2,0x120-0x100(%rcx) 1518 vmovdqa $xd3,0x130-0x100(%rcx) 1519 1520 jmp .Loop_enter4xop 1521 1522.align 32 1523.Loop_outer4xop: 1524 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key 1525 vmovdqa 0x50(%rsp),$xa1 1526 vmovdqa 0x60(%rsp),$xa2 1527 vmovdqa 0x70(%rsp),$xa3 1528 vmovdqa 0x80-0x100(%rcx),$xb0 1529 vmovdqa 0x90-0x100(%rcx),$xb1 1530 vmovdqa 0xa0-0x100(%rcx),$xb2 1531 vmovdqa 0xb0-0x100(%rcx),$xb3 1532 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" 1533 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" 1534 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" 1535 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" 1536 vmovdqa 0x100-0x100(%rcx),$xd0 1537 vmovdqa 0x110-0x100(%rcx),$xd1 1538 vmovdqa 0x120-0x100(%rcx),$xd2 1539 vmovdqa 0x130-0x100(%rcx),$xd3 1540 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters 1541 1542.Loop_enter4xop: 1543 mov \$10,%eax 1544 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters 1545 jmp .Loop4xop 1546 1547.align 32 1548.Loop4xop: 1549___ 1550 foreach 
(&XOP_lane_ROUND(0, 4, 8,12)) { eval; } 1551 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } 1552$code.=<<___; 1553 dec %eax 1554 jnz .Loop4xop 1555 1556 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material 1557 vpaddd 0x50(%rsp),$xa1,$xa1 1558 vpaddd 0x60(%rsp),$xa2,$xa2 1559 vpaddd 0x70(%rsp),$xa3,$xa3 1560 1561 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 1562 vmovdqa $xt3,0x30(%rsp) 1563 1564 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 1565 vpunpckldq $xa3,$xa2,$xt3 1566 vpunpckhdq $xa1,$xa0,$xa0 1567 vpunpckhdq $xa3,$xa2,$xa2 1568 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 1569 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 1570 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 1571 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 1572___ 1573 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 1574$code.=<<___; 1575 vpaddd 0x80-0x100(%rcx),$xb0,$xb0 1576 vpaddd 0x90-0x100(%rcx),$xb1,$xb1 1577 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 1578 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 1579 1580 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 1581 vmovdqa $xa1,0x10(%rsp) 1582 vmovdqa 0x20(%rsp),$xa0 # "xc2" 1583 vmovdqa 0x30(%rsp),$xa1 # "xc3" 1584 1585 vpunpckldq $xb1,$xb0,$xt2 1586 vpunpckldq $xb3,$xb2,$xt3 1587 vpunpckhdq $xb1,$xb0,$xb0 1588 vpunpckhdq $xb3,$xb2,$xb2 1589 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 1590 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 1591 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 1592 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 1593___ 1594 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 1595 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 1596$code.=<<___; 1597 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 1598 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 1599 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 1600 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 1601 1602 vpunpckldq $xc1,$xc0,$xt2 1603 vpunpckldq $xc3,$xc2,$xt3 1604 vpunpckhdq $xc1,$xc0,$xc0 1605 vpunpckhdq $xc3,$xc2,$xc2 1606 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 1607 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 1608 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 1609 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 1610___ 1611 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 1612$code.=<<___; 1613 vpaddd 0x100-0x100(%rcx),$xd0,$xd0 1614 vpaddd 0x110-0x100(%rcx),$xd1,$xd1 1615 vpaddd 0x120-0x100(%rcx),$xd2,$xd2 1616 vpaddd 0x130-0x100(%rcx),$xd3,$xd3 1617 1618 vpunpckldq $xd1,$xd0,$xt2 1619 vpunpckldq $xd3,$xd2,$xt3 1620 vpunpckhdq $xd1,$xd0,$xd0 1621 vpunpckhdq $xd3,$xd2,$xd2 1622 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 1623 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 1624 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 1625 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 1626___ 1627 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 1628 ($xa0,$xa1)=($xt2,$xt3); 1629$code.=<<___; 1630 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 1631 vmovdqa 0x10(%rsp),$xa1 1632 1633 cmp \$64*4,$len 1634 jb .Ltail4xop 1635 1636 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1637 vpxor 0x10($inp),$xb0,$xb0 1638 vpxor 0x20($inp),$xc0,$xc0 1639 vpxor 0x30($inp),$xd0,$xd0 1640 vpxor 0x40($inp),$xa1,$xa1 1641 vpxor 0x50($inp),$xb1,$xb1 1642 vpxor 0x60($inp),$xc1,$xc1 1643 vpxor 0x70($inp),$xd1,$xd1 1644 lea 0x80($inp),$inp # size optimization 1645 vpxor 0x00($inp),$xa2,$xa2 1646 vpxor 0x10($inp),$xb2,$xb2 1647 vpxor 0x20($inp),$xc2,$xc2 1648 vpxor 0x30($inp),$xd2,$xd2 1649 vpxor 0x40($inp),$xa3,$xa3 1650 vpxor 0x50($inp),$xb3,$xb3 1651 vpxor 0x60($inp),$xc3,$xc3 1652 vpxor 0x70($inp),$xd3,$xd3 1653 lea 0x80($inp),$inp # inp+=64*4 1654 1655 vmovdqu $xa0,0x00($out) 1656 vmovdqu $xb0,0x10($out) 1657 vmovdqu $xc0,0x20($out) 1658 vmovdqu $xd0,0x30($out) 1659 vmovdqu $xa1,0x40($out) 1660 vmovdqu $xb1,0x50($out) 1661 vmovdqu $xc1,0x60($out) 1662 
vmovdqu $xd1,0x70($out) 1663 lea 0x80($out),$out # size optimization 1664 vmovdqu $xa2,0x00($out) 1665 vmovdqu $xb2,0x10($out) 1666 vmovdqu $xc2,0x20($out) 1667 vmovdqu $xd2,0x30($out) 1668 vmovdqu $xa3,0x40($out) 1669 vmovdqu $xb3,0x50($out) 1670 vmovdqu $xc3,0x60($out) 1671 vmovdqu $xd3,0x70($out) 1672 lea 0x80($out),$out # out+=64*4 1673 1674 sub \$64*4,$len 1675 jnz .Loop_outer4xop 1676 1677 jmp .Ldone4xop 1678 1679.align 32 1680.Ltail4xop: 1681 cmp \$192,$len 1682 jae .L192_or_more4xop 1683 cmp \$128,$len 1684 jae .L128_or_more4xop 1685 cmp \$64,$len 1686 jae .L64_or_more4xop 1687 1688 xor %r10,%r10 1689 vmovdqa $xa0,0x00(%rsp) 1690 vmovdqa $xb0,0x10(%rsp) 1691 vmovdqa $xc0,0x20(%rsp) 1692 vmovdqa $xd0,0x30(%rsp) 1693 jmp .Loop_tail4xop 1694 1695.align 32 1696.L64_or_more4xop: 1697 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1698 vpxor 0x10($inp),$xb0,$xb0 1699 vpxor 0x20($inp),$xc0,$xc0 1700 vpxor 0x30($inp),$xd0,$xd0 1701 vmovdqu $xa0,0x00($out) 1702 vmovdqu $xb0,0x10($out) 1703 vmovdqu $xc0,0x20($out) 1704 vmovdqu $xd0,0x30($out) 1705 je .Ldone4xop 1706 1707 lea 0x40($inp),$inp # inp+=64*1 1708 vmovdqa $xa1,0x00(%rsp) 1709 xor %r10,%r10 1710 vmovdqa $xb1,0x10(%rsp) 1711 lea 0x40($out),$out # out+=64*1 1712 vmovdqa $xc1,0x20(%rsp) 1713 sub \$64,$len # len-=64*1 1714 vmovdqa $xd1,0x30(%rsp) 1715 jmp .Loop_tail4xop 1716 1717.align 32 1718.L128_or_more4xop: 1719 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1720 vpxor 0x10($inp),$xb0,$xb0 1721 vpxor 0x20($inp),$xc0,$xc0 1722 vpxor 0x30($inp),$xd0,$xd0 1723 vpxor 0x40($inp),$xa1,$xa1 1724 vpxor 0x50($inp),$xb1,$xb1 1725 vpxor 0x60($inp),$xc1,$xc1 1726 vpxor 0x70($inp),$xd1,$xd1 1727 1728 vmovdqu $xa0,0x00($out) 1729 vmovdqu $xb0,0x10($out) 1730 vmovdqu $xc0,0x20($out) 1731 vmovdqu $xd0,0x30($out) 1732 vmovdqu $xa1,0x40($out) 1733 vmovdqu $xb1,0x50($out) 1734 vmovdqu $xc1,0x60($out) 1735 vmovdqu $xd1,0x70($out) 1736 je .Ldone4xop 1737 1738 lea 0x80($inp),$inp # inp+=64*2 1739 vmovdqa $xa2,0x00(%rsp) 1740 xor %r10,%r10 1741 vmovdqa $xb2,0x10(%rsp) 1742 lea 0x80($out),$out # out+=64*2 1743 vmovdqa $xc2,0x20(%rsp) 1744 sub \$128,$len # len-=64*2 1745 vmovdqa $xd2,0x30(%rsp) 1746 jmp .Loop_tail4xop 1747 1748.align 32 1749.L192_or_more4xop: 1750 vpxor 0x00($inp),$xa0,$xa0 # xor with input 1751 vpxor 0x10($inp),$xb0,$xb0 1752 vpxor 0x20($inp),$xc0,$xc0 1753 vpxor 0x30($inp),$xd0,$xd0 1754 vpxor 0x40($inp),$xa1,$xa1 1755 vpxor 0x50($inp),$xb1,$xb1 1756 vpxor 0x60($inp),$xc1,$xc1 1757 vpxor 0x70($inp),$xd1,$xd1 1758 lea 0x80($inp),$inp # size optimization 1759 vpxor 0x00($inp),$xa2,$xa2 1760 vpxor 0x10($inp),$xb2,$xb2 1761 vpxor 0x20($inp),$xc2,$xc2 1762 vpxor 0x30($inp),$xd2,$xd2 1763 1764 vmovdqu $xa0,0x00($out) 1765 vmovdqu $xb0,0x10($out) 1766 vmovdqu $xc0,0x20($out) 1767 vmovdqu $xd0,0x30($out) 1768 vmovdqu $xa1,0x40($out) 1769 vmovdqu $xb1,0x50($out) 1770 vmovdqu $xc1,0x60($out) 1771 vmovdqu $xd1,0x70($out) 1772 lea 0x80($out),$out # size optimization 1773 vmovdqu $xa2,0x00($out) 1774 vmovdqu $xb2,0x10($out) 1775 vmovdqu $xc2,0x20($out) 1776 vmovdqu $xd2,0x30($out) 1777 je .Ldone4xop 1778 1779 lea 0x40($inp),$inp # inp+=64*3 1780 vmovdqa $xa3,0x00(%rsp) 1781 xor %r10,%r10 1782 vmovdqa $xb3,0x10(%rsp) 1783 lea 0x40($out),$out # out+=64*3 1784 vmovdqa $xc3,0x20(%rsp) 1785 sub \$192,$len # len-=64*3 1786 vmovdqa $xd3,0x30(%rsp) 1787 1788.Loop_tail4xop: 1789 movzb ($inp,%r10),%eax 1790 movzb (%rsp,%r10),%ecx 1791 lea 1(%r10),%r10 1792 xor %ecx,%eax 1793 mov %al,-1($out,%r10) 1794 dec $len 1795 jnz .Loop_tail4xop 1796 1797.Ldone4xop: 1798 
vzeroupper 1799___ 1800$code.=<<___ if ($win64); 1801 movaps -0xa8(%r9),%xmm6 1802 movaps -0x98(%r9),%xmm7 1803 movaps -0x88(%r9),%xmm8 1804 movaps -0x78(%r9),%xmm9 1805 movaps -0x68(%r9),%xmm10 1806 movaps -0x58(%r9),%xmm11 1807 movaps -0x48(%r9),%xmm12 1808 movaps -0x38(%r9),%xmm13 1809 movaps -0x28(%r9),%xmm14 1810 movaps -0x18(%r9),%xmm15 1811___ 1812$code.=<<___; 1813 lea (%r9),%rsp 1814.cfi_def_cfa_register %rsp 1815.L4xop_epilogue: 1816 ret 1817.cfi_endproc 1818.size ChaCha20_4xop,.-ChaCha20_4xop 1819___ 1820} 1821 1822######################################################################## 1823# AVX2 code path 1824if ($avx>1) { 1825my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, 1826 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); 1827my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 1828 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); 1829 1830sub AVX2_lane_ROUND { 1831my ($a0,$b0,$c0,$d0)=@_; 1832my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 1833my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 1834my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 1835my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); 1836my @x=map("\"$_\"",@xx); 1837 1838 # Consider order in which variables are addressed by their 1839 # index: 1840 # 1841 # a b c d 1842 # 1843 # 0 4 8 12 < even round 1844 # 1 5 9 13 1845 # 2 6 10 14 1846 # 3 7 11 15 1847 # 0 5 10 15 < odd round 1848 # 1 6 11 12 1849 # 2 7 8 13 1850 # 3 4 9 14 1851 # 1852 # 'a', 'b' and 'd's are permanently allocated in registers, 1853 # @x[0..7,12..15], while 'c's are maintained in memory. If 1854 # you observe 'c' column, you'll notice that pair of 'c's is 1855 # invariant between rounds. This means that we have to reload 1856 # them once per round, in the middle. This is why you'll see 1857 # bunch of 'c' stores and loads in the middle, but none in 1858 # the beginning or end. 
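	#
	# Rotations by 16 and 8 are done with a single vpshufb against the
	# .Lrot16/.Lrot24 byte-permutation masks (broadcast from %r10/%r11),
	# while the 12- and 7-bit rotations have no byte-aligned shuffle and
	# fall back to a vpslld/vpsrld/vpor triplet.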
1859 1860 ( 1861 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 1862 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1863 "&vpshufb (@x[$d0],@x[$d0],$t1)", 1864 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 1865 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1866 "&vpshufb (@x[$d1],@x[$d1],$t1)", 1867 1868 "&vpaddd ($xc,$xc,@x[$d0])", 1869 "&vpxor (@x[$b0],$xc,@x[$b0])", 1870 "&vpslld ($t0,@x[$b0],12)", 1871 "&vpsrld (@x[$b0],@x[$b0],20)", 1872 "&vpor (@x[$b0],$t0,@x[$b0])", 1873 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1874 "&vpaddd ($xc_,$xc_,@x[$d1])", 1875 "&vpxor (@x[$b1],$xc_,@x[$b1])", 1876 "&vpslld ($t1,@x[$b1],12)", 1877 "&vpsrld (@x[$b1],@x[$b1],20)", 1878 "&vpor (@x[$b1],$t1,@x[$b1])", 1879 1880 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 1881 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", 1882 "&vpshufb (@x[$d0],@x[$d0],$t0)", 1883 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 1884 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", 1885 "&vpshufb (@x[$d1],@x[$d1],$t0)", 1886 1887 "&vpaddd ($xc,$xc,@x[$d0])", 1888 "&vpxor (@x[$b0],$xc,@x[$b0])", 1889 "&vpslld ($t1,@x[$b0],7)", 1890 "&vpsrld (@x[$b0],@x[$b0],25)", 1891 "&vpor (@x[$b0],$t1,@x[$b0])", 1892 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1893 "&vpaddd ($xc_,$xc_,@x[$d1])", 1894 "&vpxor (@x[$b1],$xc_,@x[$b1])", 1895 "&vpslld ($t0,@x[$b1],7)", 1896 "&vpsrld (@x[$b1],@x[$b1],25)", 1897 "&vpor (@x[$b1],$t0,@x[$b1])", 1898 1899 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's 1900 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", 1901 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", 1902 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", 1903 1904 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 1905 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1906 "&vpshufb (@x[$d2],@x[$d2],$t1)", 1907 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 1908 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1909 "&vpshufb (@x[$d3],@x[$d3],$t1)", 1910 1911 "&vpaddd ($xc,$xc,@x[$d2])", 1912 "&vpxor (@x[$b2],$xc,@x[$b2])", 1913 "&vpslld ($t0,@x[$b2],12)", 1914 "&vpsrld (@x[$b2],@x[$b2],20)", 1915 "&vpor (@x[$b2],$t0,@x[$b2])", 1916 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) 1917 "&vpaddd ($xc_,$xc_,@x[$d3])", 1918 "&vpxor (@x[$b3],$xc_,@x[$b3])", 1919 "&vpslld ($t1,@x[$b3],12)", 1920 "&vpsrld (@x[$b3],@x[$b3],20)", 1921 "&vpor (@x[$b3],$t1,@x[$b3])", 1922 1923 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 1924 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", 1925 "&vpshufb (@x[$d2],@x[$d2],$t0)", 1926 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 1927 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", 1928 "&vpshufb (@x[$d3],@x[$d3],$t0)", 1929 1930 "&vpaddd ($xc,$xc,@x[$d2])", 1931 "&vpxor (@x[$b2],$xc,@x[$b2])", 1932 "&vpslld ($t1,@x[$b2],7)", 1933 "&vpsrld (@x[$b2],@x[$b2],25)", 1934 "&vpor (@x[$b2],$t1,@x[$b2])", 1935 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) 1936 "&vpaddd ($xc_,$xc_,@x[$d3])", 1937 "&vpxor (@x[$b3],$xc_,@x[$b3])", 1938 "&vpslld ($t0,@x[$b3],7)", 1939 "&vpsrld (@x[$b3],@x[$b3],25)", 1940 "&vpor (@x[$b3],$t0,@x[$b3])" 1941 ); 1942} 1943 1944my $xframe = $win64 ? 
0xa8 : 8; 1945 1946$code.=<<___; 1947.type ChaCha20_8x,\@function,5 1948.align 32 1949ChaCha20_8x: 1950.cfi_startproc 1951.LChaCha20_8x: 1952 mov %rsp,%r9 # frame register 1953.cfi_def_cfa_register %r9 1954 sub \$0x280+$xframe,%rsp 1955 and \$-32,%rsp 1956___ 1957$code.=<<___ if ($win64); 1958 movaps %xmm6,-0xa8(%r9) 1959 movaps %xmm7,-0x98(%r9) 1960 movaps %xmm8,-0x88(%r9) 1961 movaps %xmm9,-0x78(%r9) 1962 movaps %xmm10,-0x68(%r9) 1963 movaps %xmm11,-0x58(%r9) 1964 movaps %xmm12,-0x48(%r9) 1965 movaps %xmm13,-0x38(%r9) 1966 movaps %xmm14,-0x28(%r9) 1967 movaps %xmm15,-0x18(%r9) 1968.L8x_body: 1969___ 1970$code.=<<___; 1971 vzeroupper 1972 1973 ################ stack layout 1974 # +0x00 SIMD equivalent of @x[8-12] 1975 # ... 1976 # +0x80 constant copy of key[0-2] smashed by lanes 1977 # ... 1978 # +0x200 SIMD counters (with nonce smashed by lanes) 1979 # ... 1980 # +0x280 1981 1982 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] 1983 vbroadcasti128 ($key),$xb3 # key[1] 1984 vbroadcasti128 16($key),$xt3 # key[2] 1985 vbroadcasti128 ($counter),$xd3 # key[3] 1986 lea 0x100(%rsp),%rcx # size optimization 1987 lea 0x200(%rsp),%rax # size optimization 1988 lea .Lrot16(%rip),%r10 1989 lea .Lrot24(%rip),%r11 1990 1991 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 1992 vpshufd \$0x55,$xa3,$xa1 1993 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload 1994 vpshufd \$0xaa,$xa3,$xa2 1995 vmovdqa $xa1,0xa0-0x100(%rcx) 1996 vpshufd \$0xff,$xa3,$xa3 1997 vmovdqa $xa2,0xc0-0x100(%rcx) 1998 vmovdqa $xa3,0xe0-0x100(%rcx) 1999 2000 vpshufd \$0x00,$xb3,$xb0 2001 vpshufd \$0x55,$xb3,$xb1 2002 vmovdqa $xb0,0x100-0x100(%rcx) 2003 vpshufd \$0xaa,$xb3,$xb2 2004 vmovdqa $xb1,0x120-0x100(%rcx) 2005 vpshufd \$0xff,$xb3,$xb3 2006 vmovdqa $xb2,0x140-0x100(%rcx) 2007 vmovdqa $xb3,0x160-0x100(%rcx) 2008 2009 vpshufd \$0x00,$xt3,$xt0 # "xc0" 2010 vpshufd \$0x55,$xt3,$xt1 # "xc1" 2011 vmovdqa $xt0,0x180-0x200(%rax) 2012 vpshufd \$0xaa,$xt3,$xt2 # "xc2" 2013 vmovdqa $xt1,0x1a0-0x200(%rax) 2014 vpshufd \$0xff,$xt3,$xt3 # "xc3" 2015 vmovdqa $xt2,0x1c0-0x200(%rax) 2016 vmovdqa $xt3,0x1e0-0x200(%rax) 2017 2018 vpshufd \$0x00,$xd3,$xd0 2019 vpshufd \$0x55,$xd3,$xd1 2020 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 2021 vpshufd \$0xaa,$xd3,$xd2 2022 vmovdqa $xd1,0x220-0x200(%rax) 2023 vpshufd \$0xff,$xd3,$xd3 2024 vmovdqa $xd2,0x240-0x200(%rax) 2025 vmovdqa $xd3,0x260-0x200(%rax) 2026 2027 jmp .Loop_enter8x 2028 2029.align 32 2030.Loop_outer8x: 2031 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key 2032 vmovdqa 0xa0-0x100(%rcx),$xa1 2033 vmovdqa 0xc0-0x100(%rcx),$xa2 2034 vmovdqa 0xe0-0x100(%rcx),$xa3 2035 vmovdqa 0x100-0x100(%rcx),$xb0 2036 vmovdqa 0x120-0x100(%rcx),$xb1 2037 vmovdqa 0x140-0x100(%rcx),$xb2 2038 vmovdqa 0x160-0x100(%rcx),$xb3 2039 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" 2040 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" 2041 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" 2042 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" 2043 vmovdqa 0x200-0x200(%rax),$xd0 2044 vmovdqa 0x220-0x200(%rax),$xd1 2045 vmovdqa 0x240-0x200(%rax),$xd2 2046 vmovdqa 0x260-0x200(%rax),$xd3 2047 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters 2048 2049.Loop_enter8x: 2050 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" 2051 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" 2052 vbroadcasti128 (%r10),$xt3 2053 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters 2054 mov \$10,%eax 2055 jmp .Loop8x 2056 2057.align 32 2058.Loop8x: 2059___ 2060 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } 2061 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } 
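# Each pass through the pair of AVX2_lane_ROUND expansions above is one
# ChaCha20 double round (a column round followed by a diagonal round), so the
# ten iterations of .Loop8x below yield the full 20 rounds.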
2062$code.=<<___; 2063 dec %eax 2064 jnz .Loop8x 2065 2066 lea 0x200(%rsp),%rax # size optimization 2067 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 2068 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 2069 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 2070 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 2071 2072 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 2073 vpunpckldq $xa3,$xa2,$xt3 2074 vpunpckhdq $xa1,$xa0,$xa0 2075 vpunpckhdq $xa3,$xa2,$xa2 2076 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 2077 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 2078 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 2079 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 2080___ 2081 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 2082$code.=<<___; 2083 vpaddd 0x100-0x100(%rcx),$xb0,$xb0 2084 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 2085 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 2086 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 2087 2088 vpunpckldq $xb1,$xb0,$xt2 2089 vpunpckldq $xb3,$xb2,$xt3 2090 vpunpckhdq $xb1,$xb0,$xb0 2091 vpunpckhdq $xb3,$xb2,$xb2 2092 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 2093 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 2094 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 2095 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 2096___ 2097 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 2098$code.=<<___; 2099 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further 2100 vperm2i128 \$0x31,$xb0,$xa0,$xb0 2101 vperm2i128 \$0x20,$xb1,$xa1,$xa0 2102 vperm2i128 \$0x31,$xb1,$xa1,$xb1 2103 vperm2i128 \$0x20,$xb2,$xa2,$xa1 2104 vperm2i128 \$0x31,$xb2,$xa2,$xb2 2105 vperm2i128 \$0x20,$xb3,$xa3,$xa2 2106 vperm2i128 \$0x31,$xb3,$xa3,$xb3 2107___ 2108 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 2109 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); 2110$code.=<<___; 2111 vmovdqa $xa0,0x00(%rsp) # offload $xaN 2112 vmovdqa $xa1,0x20(%rsp) 2113 vmovdqa 0x40(%rsp),$xc2 # $xa0 2114 vmovdqa 0x60(%rsp),$xc3 # $xa1 2115 2116 vpaddd 0x180-0x200(%rax),$xc0,$xc0 2117 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 2118 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 2119 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 2120 2121 vpunpckldq $xc1,$xc0,$xt2 2122 vpunpckldq $xc3,$xc2,$xt3 2123 vpunpckhdq $xc1,$xc0,$xc0 2124 vpunpckhdq $xc3,$xc2,$xc2 2125 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 2126 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 2127 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 2128 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 2129___ 2130 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 2131$code.=<<___; 2132 vpaddd 0x200-0x200(%rax),$xd0,$xd0 2133 vpaddd 0x220-0x200(%rax),$xd1,$xd1 2134 vpaddd 0x240-0x200(%rax),$xd2,$xd2 2135 vpaddd 0x260-0x200(%rax),$xd3,$xd3 2136 2137 vpunpckldq $xd1,$xd0,$xt2 2138 vpunpckldq $xd3,$xd2,$xt3 2139 vpunpckhdq $xd1,$xd0,$xd0 2140 vpunpckhdq $xd3,$xd2,$xd2 2141 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 2142 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 2143 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 2144 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 2145___ 2146 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 2147$code.=<<___; 2148 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 2149 vperm2i128 \$0x31,$xd0,$xc0,$xd0 2150 vperm2i128 \$0x20,$xd1,$xc1,$xc0 2151 vperm2i128 \$0x31,$xd1,$xc1,$xd1 2152 vperm2i128 \$0x20,$xd2,$xc2,$xc1 2153 vperm2i128 \$0x31,$xd2,$xc2,$xd2 2154 vperm2i128 \$0x20,$xd3,$xc3,$xc2 2155 vperm2i128 \$0x31,$xd3,$xc3,$xd3 2156___ 2157 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 2158 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 2159 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 2160 ($xa0,$xa1)=($xt2,$xt3); 2161$code.=<<___; 2162 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
2163 vmovdqa 0x20(%rsp),$xa1 2164 2165 cmp \$64*8,$len 2166 jb .Ltail8x 2167 2168 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2169 vpxor 0x20($inp),$xb0,$xb0 2170 vpxor 0x40($inp),$xc0,$xc0 2171 vpxor 0x60($inp),$xd0,$xd0 2172 lea 0x80($inp),$inp # size optimization 2173 vmovdqu $xa0,0x00($out) 2174 vmovdqu $xb0,0x20($out) 2175 vmovdqu $xc0,0x40($out) 2176 vmovdqu $xd0,0x60($out) 2177 lea 0x80($out),$out # size optimization 2178 2179 vpxor 0x00($inp),$xa1,$xa1 2180 vpxor 0x20($inp),$xb1,$xb1 2181 vpxor 0x40($inp),$xc1,$xc1 2182 vpxor 0x60($inp),$xd1,$xd1 2183 lea 0x80($inp),$inp # size optimization 2184 vmovdqu $xa1,0x00($out) 2185 vmovdqu $xb1,0x20($out) 2186 vmovdqu $xc1,0x40($out) 2187 vmovdqu $xd1,0x60($out) 2188 lea 0x80($out),$out # size optimization 2189 2190 vpxor 0x00($inp),$xa2,$xa2 2191 vpxor 0x20($inp),$xb2,$xb2 2192 vpxor 0x40($inp),$xc2,$xc2 2193 vpxor 0x60($inp),$xd2,$xd2 2194 lea 0x80($inp),$inp # size optimization 2195 vmovdqu $xa2,0x00($out) 2196 vmovdqu $xb2,0x20($out) 2197 vmovdqu $xc2,0x40($out) 2198 vmovdqu $xd2,0x60($out) 2199 lea 0x80($out),$out # size optimization 2200 2201 vpxor 0x00($inp),$xa3,$xa3 2202 vpxor 0x20($inp),$xb3,$xb3 2203 vpxor 0x40($inp),$xc3,$xc3 2204 vpxor 0x60($inp),$xd3,$xd3 2205 lea 0x80($inp),$inp # size optimization 2206 vmovdqu $xa3,0x00($out) 2207 vmovdqu $xb3,0x20($out) 2208 vmovdqu $xc3,0x40($out) 2209 vmovdqu $xd3,0x60($out) 2210 lea 0x80($out),$out # size optimization 2211 2212 sub \$64*8,$len 2213 jnz .Loop_outer8x 2214 2215 jmp .Ldone8x 2216 2217.Ltail8x: 2218 cmp \$448,$len 2219 jae .L448_or_more8x 2220 cmp \$384,$len 2221 jae .L384_or_more8x 2222 cmp \$320,$len 2223 jae .L320_or_more8x 2224 cmp \$256,$len 2225 jae .L256_or_more8x 2226 cmp \$192,$len 2227 jae .L192_or_more8x 2228 cmp \$128,$len 2229 jae .L128_or_more8x 2230 cmp \$64,$len 2231 jae .L64_or_more8x 2232 2233 xor %r10,%r10 2234 vmovdqa $xa0,0x00(%rsp) 2235 vmovdqa $xb0,0x20(%rsp) 2236 jmp .Loop_tail8x 2237 2238.align 32 2239.L64_or_more8x: 2240 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2241 vpxor 0x20($inp),$xb0,$xb0 2242 vmovdqu $xa0,0x00($out) 2243 vmovdqu $xb0,0x20($out) 2244 je .Ldone8x 2245 2246 lea 0x40($inp),$inp # inp+=64*1 2247 xor %r10,%r10 2248 vmovdqa $xc0,0x00(%rsp) 2249 lea 0x40($out),$out # out+=64*1 2250 sub \$64,$len # len-=64*1 2251 vmovdqa $xd0,0x20(%rsp) 2252 jmp .Loop_tail8x 2253 2254.align 32 2255.L128_or_more8x: 2256 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2257 vpxor 0x20($inp),$xb0,$xb0 2258 vpxor 0x40($inp),$xc0,$xc0 2259 vpxor 0x60($inp),$xd0,$xd0 2260 vmovdqu $xa0,0x00($out) 2261 vmovdqu $xb0,0x20($out) 2262 vmovdqu $xc0,0x40($out) 2263 vmovdqu $xd0,0x60($out) 2264 je .Ldone8x 2265 2266 lea 0x80($inp),$inp # inp+=64*2 2267 xor %r10,%r10 2268 vmovdqa $xa1,0x00(%rsp) 2269 lea 0x80($out),$out # out+=64*2 2270 sub \$128,$len # len-=64*2 2271 vmovdqa $xb1,0x20(%rsp) 2272 jmp .Loop_tail8x 2273 2274.align 32 2275.L192_or_more8x: 2276 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2277 vpxor 0x20($inp),$xb0,$xb0 2278 vpxor 0x40($inp),$xc0,$xc0 2279 vpxor 0x60($inp),$xd0,$xd0 2280 vpxor 0x80($inp),$xa1,$xa1 2281 vpxor 0xa0($inp),$xb1,$xb1 2282 vmovdqu $xa0,0x00($out) 2283 vmovdqu $xb0,0x20($out) 2284 vmovdqu $xc0,0x40($out) 2285 vmovdqu $xd0,0x60($out) 2286 vmovdqu $xa1,0x80($out) 2287 vmovdqu $xb1,0xa0($out) 2288 je .Ldone8x 2289 2290 lea 0xc0($inp),$inp # inp+=64*3 2291 xor %r10,%r10 2292 vmovdqa $xc1,0x00(%rsp) 2293 lea 0xc0($out),$out # out+=64*3 2294 sub \$192,$len # len-=64*3 2295 vmovdqa $xd1,0x20(%rsp) 2296 jmp .Loop_tail8x 2297 2298.align 32 
2299.L256_or_more8x: 2300 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2301 vpxor 0x20($inp),$xb0,$xb0 2302 vpxor 0x40($inp),$xc0,$xc0 2303 vpxor 0x60($inp),$xd0,$xd0 2304 vpxor 0x80($inp),$xa1,$xa1 2305 vpxor 0xa0($inp),$xb1,$xb1 2306 vpxor 0xc0($inp),$xc1,$xc1 2307 vpxor 0xe0($inp),$xd1,$xd1 2308 vmovdqu $xa0,0x00($out) 2309 vmovdqu $xb0,0x20($out) 2310 vmovdqu $xc0,0x40($out) 2311 vmovdqu $xd0,0x60($out) 2312 vmovdqu $xa1,0x80($out) 2313 vmovdqu $xb1,0xa0($out) 2314 vmovdqu $xc1,0xc0($out) 2315 vmovdqu $xd1,0xe0($out) 2316 je .Ldone8x 2317 2318 lea 0x100($inp),$inp # inp+=64*4 2319 xor %r10,%r10 2320 vmovdqa $xa2,0x00(%rsp) 2321 lea 0x100($out),$out # out+=64*4 2322 sub \$256,$len # len-=64*4 2323 vmovdqa $xb2,0x20(%rsp) 2324 jmp .Loop_tail8x 2325 2326.align 32 2327.L320_or_more8x: 2328 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2329 vpxor 0x20($inp),$xb0,$xb0 2330 vpxor 0x40($inp),$xc0,$xc0 2331 vpxor 0x60($inp),$xd0,$xd0 2332 vpxor 0x80($inp),$xa1,$xa1 2333 vpxor 0xa0($inp),$xb1,$xb1 2334 vpxor 0xc0($inp),$xc1,$xc1 2335 vpxor 0xe0($inp),$xd1,$xd1 2336 vpxor 0x100($inp),$xa2,$xa2 2337 vpxor 0x120($inp),$xb2,$xb2 2338 vmovdqu $xa0,0x00($out) 2339 vmovdqu $xb0,0x20($out) 2340 vmovdqu $xc0,0x40($out) 2341 vmovdqu $xd0,0x60($out) 2342 vmovdqu $xa1,0x80($out) 2343 vmovdqu $xb1,0xa0($out) 2344 vmovdqu $xc1,0xc0($out) 2345 vmovdqu $xd1,0xe0($out) 2346 vmovdqu $xa2,0x100($out) 2347 vmovdqu $xb2,0x120($out) 2348 je .Ldone8x 2349 2350 lea 0x140($inp),$inp # inp+=64*5 2351 xor %r10,%r10 2352 vmovdqa $xc2,0x00(%rsp) 2353 lea 0x140($out),$out # out+=64*5 2354 sub \$320,$len # len-=64*5 2355 vmovdqa $xd2,0x20(%rsp) 2356 jmp .Loop_tail8x 2357 2358.align 32 2359.L384_or_more8x: 2360 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2361 vpxor 0x20($inp),$xb0,$xb0 2362 vpxor 0x40($inp),$xc0,$xc0 2363 vpxor 0x60($inp),$xd0,$xd0 2364 vpxor 0x80($inp),$xa1,$xa1 2365 vpxor 0xa0($inp),$xb1,$xb1 2366 vpxor 0xc0($inp),$xc1,$xc1 2367 vpxor 0xe0($inp),$xd1,$xd1 2368 vpxor 0x100($inp),$xa2,$xa2 2369 vpxor 0x120($inp),$xb2,$xb2 2370 vpxor 0x140($inp),$xc2,$xc2 2371 vpxor 0x160($inp),$xd2,$xd2 2372 vmovdqu $xa0,0x00($out) 2373 vmovdqu $xb0,0x20($out) 2374 vmovdqu $xc0,0x40($out) 2375 vmovdqu $xd0,0x60($out) 2376 vmovdqu $xa1,0x80($out) 2377 vmovdqu $xb1,0xa0($out) 2378 vmovdqu $xc1,0xc0($out) 2379 vmovdqu $xd1,0xe0($out) 2380 vmovdqu $xa2,0x100($out) 2381 vmovdqu $xb2,0x120($out) 2382 vmovdqu $xc2,0x140($out) 2383 vmovdqu $xd2,0x160($out) 2384 je .Ldone8x 2385 2386 lea 0x180($inp),$inp # inp+=64*6 2387 xor %r10,%r10 2388 vmovdqa $xa3,0x00(%rsp) 2389 lea 0x180($out),$out # out+=64*6 2390 sub \$384,$len # len-=64*6 2391 vmovdqa $xb3,0x20(%rsp) 2392 jmp .Loop_tail8x 2393 2394.align 32 2395.L448_or_more8x: 2396 vpxor 0x00($inp),$xa0,$xa0 # xor with input 2397 vpxor 0x20($inp),$xb0,$xb0 2398 vpxor 0x40($inp),$xc0,$xc0 2399 vpxor 0x60($inp),$xd0,$xd0 2400 vpxor 0x80($inp),$xa1,$xa1 2401 vpxor 0xa0($inp),$xb1,$xb1 2402 vpxor 0xc0($inp),$xc1,$xc1 2403 vpxor 0xe0($inp),$xd1,$xd1 2404 vpxor 0x100($inp),$xa2,$xa2 2405 vpxor 0x120($inp),$xb2,$xb2 2406 vpxor 0x140($inp),$xc2,$xc2 2407 vpxor 0x160($inp),$xd2,$xd2 2408 vpxor 0x180($inp),$xa3,$xa3 2409 vpxor 0x1a0($inp),$xb3,$xb3 2410 vmovdqu $xa0,0x00($out) 2411 vmovdqu $xb0,0x20($out) 2412 vmovdqu $xc0,0x40($out) 2413 vmovdqu $xd0,0x60($out) 2414 vmovdqu $xa1,0x80($out) 2415 vmovdqu $xb1,0xa0($out) 2416 vmovdqu $xc1,0xc0($out) 2417 vmovdqu $xd1,0xe0($out) 2418 vmovdqu $xa2,0x100($out) 2419 vmovdqu $xb2,0x120($out) 2420 vmovdqu $xc2,0x140($out) 2421 vmovdqu $xd2,0x160($out) 2422 
vmovdqu $xa3,0x180($out) 2423 vmovdqu $xb3,0x1a0($out) 2424 je .Ldone8x 2425 2426 lea 0x1c0($inp),$inp # inp+=64*7 2427 xor %r10,%r10 2428 vmovdqa $xc3,0x00(%rsp) 2429 lea 0x1c0($out),$out # out+=64*7 2430 sub \$448,$len # len-=64*7 2431 vmovdqa $xd3,0x20(%rsp) 2432 2433.Loop_tail8x: 2434 movzb ($inp,%r10),%eax 2435 movzb (%rsp,%r10),%ecx 2436 lea 1(%r10),%r10 2437 xor %ecx,%eax 2438 mov %al,-1($out,%r10) 2439 dec $len 2440 jnz .Loop_tail8x 2441 2442.Ldone8x: 2443 vzeroall 2444___ 2445$code.=<<___ if ($win64); 2446 movaps -0xa8(%r9),%xmm6 2447 movaps -0x98(%r9),%xmm7 2448 movaps -0x88(%r9),%xmm8 2449 movaps -0x78(%r9),%xmm9 2450 movaps -0x68(%r9),%xmm10 2451 movaps -0x58(%r9),%xmm11 2452 movaps -0x48(%r9),%xmm12 2453 movaps -0x38(%r9),%xmm13 2454 movaps -0x28(%r9),%xmm14 2455 movaps -0x18(%r9),%xmm15 2456___ 2457$code.=<<___; 2458 lea (%r9),%rsp 2459.cfi_def_cfa_register %rsp 2460.L8x_epilogue: 2461 ret 2462.cfi_endproc 2463.size ChaCha20_8x,.-ChaCha20_8x 2464___ 2465} 2466 2467######################################################################## 2468# AVX512 code paths 2469if ($avx>2) { 2470# This one handles shorter inputs... 2471 2472my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); 2473my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 2474 2475sub vpxord() # size optimization 2476{ my $opcode = "vpxor"; # adhere to vpxor when possible 2477 2478 foreach (@_) { 2479 if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { 2480 $opcode = "vpxord"; 2481 last; 2482 } 2483 } 2484 2485 $code .= "\t$opcode\t".join(',',reverse @_)."\n"; 2486} 2487 2488sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round 2489 &vpaddd ($a,$a,$b); 2490 &vpxord ($d,$d,$a); 2491 &vprold ($d,$d,16); 2492 2493 &vpaddd ($c,$c,$d); 2494 &vpxord ($b,$b,$c); 2495 &vprold ($b,$b,12); 2496 2497 &vpaddd ($a,$a,$b); 2498 &vpxord ($d,$d,$a); 2499 &vprold ($d,$d,8); 2500 2501 &vpaddd ($c,$c,$d); 2502 &vpxord ($b,$b,$c); 2503 &vprold ($b,$b,7); 2504} 2505 2506my $xframe = $win64 ? 
160+8 : 8; 2507 2508$code.=<<___; 2509.type ChaCha20_avx512,\@function,5 2510.align 32 2511ChaCha20_avx512: 2512.cfi_startproc 2513.LChaCha20_avx512: 2514 mov %rsp,%r9 # frame pointer 2515.cfi_def_cfa_register %r9 2516 cmp \$512,$len 2517 ja .LChaCha20_16x 2518 2519 sub \$64+$xframe,%rsp 2520___ 2521$code.=<<___ if ($win64); 2522 movaps %xmm6,-0xa8(%r9) 2523 movaps %xmm7,-0x98(%r9) 2524 movaps %xmm8,-0x88(%r9) 2525 movaps %xmm9,-0x78(%r9) 2526 movaps %xmm10,-0x68(%r9) 2527 movaps %xmm11,-0x58(%r9) 2528 movaps %xmm12,-0x48(%r9) 2529 movaps %xmm13,-0x38(%r9) 2530 movaps %xmm14,-0x28(%r9) 2531 movaps %xmm15,-0x18(%r9) 2532.Lavx512_body: 2533___ 2534$code.=<<___; 2535 vbroadcasti32x4 .Lsigma(%rip),$a 2536 vbroadcasti32x4 ($key),$b 2537 vbroadcasti32x4 16($key),$c 2538 vbroadcasti32x4 ($counter),$d 2539 2540 vmovdqa32 $a,$a_ 2541 vmovdqa32 $b,$b_ 2542 vmovdqa32 $c,$c_ 2543 vpaddd .Lzeroz(%rip),$d,$d 2544 vmovdqa32 .Lfourz(%rip),$fourz 2545 mov \$10,$counter # reuse $counter 2546 vmovdqa32 $d,$d_ 2547 jmp .Loop_avx512 2548 2549.align 16 2550.Loop_outer_avx512: 2551 vmovdqa32 $a_,$a 2552 vmovdqa32 $b_,$b 2553 vmovdqa32 $c_,$c 2554 vpaddd $fourz,$d_,$d 2555 mov \$10,$counter 2556 vmovdqa32 $d,$d_ 2557 jmp .Loop_avx512 2558 2559.align 32 2560.Loop_avx512: 2561___ 2562 &AVX512ROUND(); 2563 &vpshufd ($c,$c,0b01001110); 2564 &vpshufd ($b,$b,0b00111001); 2565 &vpshufd ($d,$d,0b10010011); 2566 2567 &AVX512ROUND(); 2568 &vpshufd ($c,$c,0b01001110); 2569 &vpshufd ($b,$b,0b10010011); 2570 &vpshufd ($d,$d,0b00111001); 2571 2572 &dec ($counter); 2573 &jnz (".Loop_avx512"); 2574 2575$code.=<<___; 2576 vpaddd $a_,$a,$a 2577 vpaddd $b_,$b,$b 2578 vpaddd $c_,$c,$c 2579 vpaddd $d_,$d,$d 2580 2581 sub \$64,$len 2582 jb .Ltail64_avx512 2583 2584 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2585 vpxor 0x10($inp),%x#$b,$t1 2586 vpxor 0x20($inp),%x#$c,$t2 2587 vpxor 0x30($inp),%x#$d,$t3 2588 lea 0x40($inp),$inp # inp+=64 2589 2590 vmovdqu $t0,0x00($out) # write output 2591 vmovdqu $t1,0x10($out) 2592 vmovdqu $t2,0x20($out) 2593 vmovdqu $t3,0x30($out) 2594 lea 0x40($out),$out # out+=64 2595 2596 jz .Ldone_avx512 2597 2598 vextracti32x4 \$1,$a,$t0 2599 vextracti32x4 \$1,$b,$t1 2600 vextracti32x4 \$1,$c,$t2 2601 vextracti32x4 \$1,$d,$t3 2602 2603 sub \$64,$len 2604 jb .Ltail_avx512 2605 2606 vpxor 0x00($inp),$t0,$t0 # xor with input 2607 vpxor 0x10($inp),$t1,$t1 2608 vpxor 0x20($inp),$t2,$t2 2609 vpxor 0x30($inp),$t3,$t3 2610 lea 0x40($inp),$inp # inp+=64 2611 2612 vmovdqu $t0,0x00($out) # write output 2613 vmovdqu $t1,0x10($out) 2614 vmovdqu $t2,0x20($out) 2615 vmovdqu $t3,0x30($out) 2616 lea 0x40($out),$out # out+=64 2617 2618 jz .Ldone_avx512 2619 2620 vextracti32x4 \$2,$a,$t0 2621 vextracti32x4 \$2,$b,$t1 2622 vextracti32x4 \$2,$c,$t2 2623 vextracti32x4 \$2,$d,$t3 2624 2625 sub \$64,$len 2626 jb .Ltail_avx512 2627 2628 vpxor 0x00($inp),$t0,$t0 # xor with input 2629 vpxor 0x10($inp),$t1,$t1 2630 vpxor 0x20($inp),$t2,$t2 2631 vpxor 0x30($inp),$t3,$t3 2632 lea 0x40($inp),$inp # inp+=64 2633 2634 vmovdqu $t0,0x00($out) # write output 2635 vmovdqu $t1,0x10($out) 2636 vmovdqu $t2,0x20($out) 2637 vmovdqu $t3,0x30($out) 2638 lea 0x40($out),$out # out+=64 2639 2640 jz .Ldone_avx512 2641 2642 vextracti32x4 \$3,$a,$t0 2643 vextracti32x4 \$3,$b,$t1 2644 vextracti32x4 \$3,$c,$t2 2645 vextracti32x4 \$3,$d,$t3 2646 2647 sub \$64,$len 2648 jb .Ltail_avx512 2649 2650 vpxor 0x00($inp),$t0,$t0 # xor with input 2651 vpxor 0x10($inp),$t1,$t1 2652 vpxor 0x20($inp),$t2,$t2 2653 vpxor 0x30($inp),$t3,$t3 2654 lea 0x40($inp),$inp # inp+=64 
2655 2656 vmovdqu $t0,0x00($out) # write output 2657 vmovdqu $t1,0x10($out) 2658 vmovdqu $t2,0x20($out) 2659 vmovdqu $t3,0x30($out) 2660 lea 0x40($out),$out # out+=64 2661 2662 jnz .Loop_outer_avx512 2663 2664 jmp .Ldone_avx512 2665 2666.align 16 2667.Ltail64_avx512: 2668 vmovdqa %x#$a,0x00(%rsp) 2669 vmovdqa %x#$b,0x10(%rsp) 2670 vmovdqa %x#$c,0x20(%rsp) 2671 vmovdqa %x#$d,0x30(%rsp) 2672 add \$64,$len 2673 jmp .Loop_tail_avx512 2674 2675.align 16 2676.Ltail_avx512: 2677 vmovdqa $t0,0x00(%rsp) 2678 vmovdqa $t1,0x10(%rsp) 2679 vmovdqa $t2,0x20(%rsp) 2680 vmovdqa $t3,0x30(%rsp) 2681 add \$64,$len 2682 2683.Loop_tail_avx512: 2684 movzb ($inp,$counter),%eax 2685 movzb (%rsp,$counter),%ecx 2686 lea 1($counter),$counter 2687 xor %ecx,%eax 2688 mov %al,-1($out,$counter) 2689 dec $len 2690 jnz .Loop_tail_avx512 2691 2692 vmovdqu32 $a_,0x00(%rsp) 2693 2694.Ldone_avx512: 2695 vzeroall 2696___ 2697$code.=<<___ if ($win64); 2698 movaps -0xa8(%r9),%xmm6 2699 movaps -0x98(%r9),%xmm7 2700 movaps -0x88(%r9),%xmm8 2701 movaps -0x78(%r9),%xmm9 2702 movaps -0x68(%r9),%xmm10 2703 movaps -0x58(%r9),%xmm11 2704 movaps -0x48(%r9),%xmm12 2705 movaps -0x38(%r9),%xmm13 2706 movaps -0x28(%r9),%xmm14 2707 movaps -0x18(%r9),%xmm15 2708___ 2709$code.=<<___; 2710 lea (%r9),%rsp 2711.cfi_def_cfa_register %rsp 2712.Lavx512_epilogue: 2713 ret 2714.cfi_endproc 2715.size ChaCha20_avx512,.-ChaCha20_avx512 2716___ 2717 2718map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); 2719 2720$code.=<<___; 2721.type ChaCha20_avx512vl,\@function,5 2722.align 32 2723ChaCha20_avx512vl: 2724.cfi_startproc 2725.LChaCha20_avx512vl: 2726 mov %rsp,%r9 # frame pointer 2727.cfi_def_cfa_register %r9 2728 cmp \$128,$len 2729 ja .LChaCha20_8xvl 2730 2731 sub \$64+$xframe,%rsp 2732___ 2733$code.=<<___ if ($win64); 2734 movaps %xmm6,-0xa8(%r9) 2735 movaps %xmm7,-0x98(%r9) 2736 movaps %xmm8,-0x88(%r9) 2737 movaps %xmm9,-0x78(%r9) 2738 movaps %xmm10,-0x68(%r9) 2739 movaps %xmm11,-0x58(%r9) 2740 movaps %xmm12,-0x48(%r9) 2741 movaps %xmm13,-0x38(%r9) 2742 movaps %xmm14,-0x28(%r9) 2743 movaps %xmm15,-0x18(%r9) 2744.Lavx512vl_body: 2745___ 2746$code.=<<___; 2747 vbroadcasti128 .Lsigma(%rip),$a 2748 vbroadcasti128 ($key),$b 2749 vbroadcasti128 16($key),$c 2750 vbroadcasti128 ($counter),$d 2751 2752 vmovdqa32 $a,$a_ 2753 vmovdqa32 $b,$b_ 2754 vmovdqa32 $c,$c_ 2755 vpaddd .Lzeroz(%rip),$d,$d 2756 vmovdqa32 .Ltwoy(%rip),$fourz 2757 mov \$10,$counter # reuse $counter 2758 vmovdqa32 $d,$d_ 2759 jmp .Loop_avx512vl 2760 2761.align 16 2762.Loop_outer_avx512vl: 2763 vmovdqa32 $c_,$c 2764 vpaddd $fourz,$d_,$d 2765 mov \$10,$counter 2766 vmovdqa32 $d,$d_ 2767 jmp .Loop_avx512vl 2768 2769.align 32 2770.Loop_avx512vl: 2771___ 2772 &AVX512ROUND(); 2773 &vpshufd ($c,$c,0b01001110); 2774 &vpshufd ($b,$b,0b00111001); 2775 &vpshufd ($d,$d,0b10010011); 2776 2777 &AVX512ROUND(); 2778 &vpshufd ($c,$c,0b01001110); 2779 &vpshufd ($b,$b,0b10010011); 2780 &vpshufd ($d,$d,0b00111001); 2781 2782 &dec ($counter); 2783 &jnz (".Loop_avx512vl"); 2784 2785$code.=<<___; 2786 vpaddd $a_,$a,$a 2787 vpaddd $b_,$b,$b 2788 vpaddd $c_,$c,$c 2789 vpaddd $d_,$d,$d 2790 2791 sub \$64,$len 2792 jb .Ltail64_avx512vl 2793 2794 vpxor 0x00($inp),%x#$a,$t0 # xor with input 2795 vpxor 0x10($inp),%x#$b,$t1 2796 vpxor 0x20($inp),%x#$c,$t2 2797 vpxor 0x30($inp),%x#$d,$t3 2798 lea 0x40($inp),$inp # inp+=64 2799 2800 vmovdqu $t0,0x00($out) # write output 2801 vmovdqu $t1,0x10($out) 2802 vmovdqu $t2,0x20($out) 2803 vmovdqu $t3,0x30($out) 2804 lea 0x40($out),$out # out+=64 2805 2806 jz .Ldone_avx512vl 2807 2808 
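	# more input left after the first 64-byte block: its keystream
	# sits in the upper 128-bit lane of each %ymm register (lane 1),
	# so extract that lane before xoring with the input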
vextracti128 \$1,$a,$t0 2809 vextracti128 \$1,$b,$t1 2810 vextracti128 \$1,$c,$t2 2811 vextracti128 \$1,$d,$t3 2812 2813 sub \$64,$len 2814 jb .Ltail_avx512vl 2815 2816 vpxor 0x00($inp),$t0,$t0 # xor with input 2817 vpxor 0x10($inp),$t1,$t1 2818 vpxor 0x20($inp),$t2,$t2 2819 vpxor 0x30($inp),$t3,$t3 2820 lea 0x40($inp),$inp # inp+=64 2821 2822 vmovdqu $t0,0x00($out) # write output 2823 vmovdqu $t1,0x10($out) 2824 vmovdqu $t2,0x20($out) 2825 vmovdqu $t3,0x30($out) 2826 lea 0x40($out),$out # out+=64 2827 2828 vmovdqa32 $a_,$a 2829 vmovdqa32 $b_,$b 2830 jnz .Loop_outer_avx512vl 2831 2832 jmp .Ldone_avx512vl 2833 2834.align 16 2835.Ltail64_avx512vl: 2836 vmovdqa %x#$a,0x00(%rsp) 2837 vmovdqa %x#$b,0x10(%rsp) 2838 vmovdqa %x#$c,0x20(%rsp) 2839 vmovdqa %x#$d,0x30(%rsp) 2840 add \$64,$len 2841 jmp .Loop_tail_avx512vl 2842 2843.align 16 2844.Ltail_avx512vl: 2845 vmovdqa $t0,0x00(%rsp) 2846 vmovdqa $t1,0x10(%rsp) 2847 vmovdqa $t2,0x20(%rsp) 2848 vmovdqa $t3,0x30(%rsp) 2849 add \$64,$len 2850 2851.Loop_tail_avx512vl: 2852 movzb ($inp,$counter),%eax 2853 movzb (%rsp,$counter),%ecx 2854 lea 1($counter),$counter 2855 xor %ecx,%eax 2856 mov %al,-1($out,$counter) 2857 dec $len 2858 jnz .Loop_tail_avx512vl 2859 2860 vmovdqu32 $a_,0x00(%rsp) 2861 vmovdqu32 $a_,0x20(%rsp) 2862 2863.Ldone_avx512vl: 2864 vzeroall 2865___ 2866$code.=<<___ if ($win64); 2867 movaps -0xa8(%r9),%xmm6 2868 movaps -0x98(%r9),%xmm7 2869 movaps -0x88(%r9),%xmm8 2870 movaps -0x78(%r9),%xmm9 2871 movaps -0x68(%r9),%xmm10 2872 movaps -0x58(%r9),%xmm11 2873 movaps -0x48(%r9),%xmm12 2874 movaps -0x38(%r9),%xmm13 2875 movaps -0x28(%r9),%xmm14 2876 movaps -0x18(%r9),%xmm15 2877___ 2878$code.=<<___; 2879 lea (%r9),%rsp 2880.cfi_def_cfa_register %rsp 2881.Lavx512vl_epilogue: 2882 ret 2883.cfi_endproc 2884.size ChaCha20_avx512vl,.-ChaCha20_avx512vl 2885___ 2886} 2887if ($avx>2) { 2888# This one handles longer inputs... 
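# Register layout for the wide code paths below: the sixteen ChaCha state
# words occupy %zmm0-15 (re-mapped to %ymm0-15 for the 8xvl variant), one
# word per register with the independent blocks spread across the lanes,
# while %zmm16-31 (@key) keep the lane-smashed constants, key and SIMD
# counters.  Rotations are done with vprold, so the .Lrot16/.Lrot24
# shuffle masks are not needed here.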
2889 2890my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2891 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); 2892my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 2893 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 2894my @key=map("%zmm$_",(16..31)); 2895my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 2896 2897sub AVX512_lane_ROUND { 2898my ($a0,$b0,$c0,$d0)=@_; 2899my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 2900my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 2901my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 2902my @x=map("\"$_\"",@xx); 2903 2904 ( 2905 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 2906 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 2907 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 2908 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 2909 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2910 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2911 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2912 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2913 "&vprold (@x[$d0],@x[$d0],16)", 2914 "&vprold (@x[$d1],@x[$d1],16)", 2915 "&vprold (@x[$d2],@x[$d2],16)", 2916 "&vprold (@x[$d3],@x[$d3],16)", 2917 2918 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2919 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2920 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2921 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2922 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2923 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2924 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2925 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2926 "&vprold (@x[$b0],@x[$b0],12)", 2927 "&vprold (@x[$b1],@x[$b1],12)", 2928 "&vprold (@x[$b2],@x[$b2],12)", 2929 "&vprold (@x[$b3],@x[$b3],12)", 2930 2931 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", 2932 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", 2933 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", 2934 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", 2935 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", 2936 "&vpxord (@x[$d1],@x[$d1],@x[$a1])", 2937 "&vpxord (@x[$d2],@x[$d2],@x[$a2])", 2938 "&vpxord (@x[$d3],@x[$d3],@x[$a3])", 2939 "&vprold (@x[$d0],@x[$d0],8)", 2940 "&vprold (@x[$d1],@x[$d1],8)", 2941 "&vprold (@x[$d2],@x[$d2],8)", 2942 "&vprold (@x[$d3],@x[$d3],8)", 2943 2944 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", 2945 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", 2946 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", 2947 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", 2948 "&vpxord (@x[$b0],@x[$b0],@x[$c0])", 2949 "&vpxord (@x[$b1],@x[$b1],@x[$c1])", 2950 "&vpxord (@x[$b2],@x[$b2],@x[$c2])", 2951 "&vpxord (@x[$b3],@x[$b3],@x[$c3])", 2952 "&vprold (@x[$b0],@x[$b0],7)", 2953 "&vprold (@x[$b1],@x[$b1],7)", 2954 "&vprold (@x[$b2],@x[$b2],7)", 2955 "&vprold (@x[$b3],@x[$b3],7)" 2956 ); 2957} 2958 2959my $xframe = $win64 ? 0xa8 : 8; 2960 2961$code.=<<___; 2962.type ChaCha20_16x,\@function,5 2963.align 32 2964ChaCha20_16x: 2965.cfi_startproc 2966.LChaCha20_16x: 2967 mov %rsp,%r9 # frame register 2968.cfi_def_cfa_register %r9 2969 sub \$64+$xframe,%rsp 2970 and \$-64,%rsp 2971___ 2972$code.=<<___ if ($win64); 2973 movaps %xmm6,-0xa8(%r9) 2974 movaps %xmm7,-0x98(%r9) 2975 movaps %xmm8,-0x88(%r9) 2976 movaps %xmm9,-0x78(%r9) 2977 movaps %xmm10,-0x68(%r9) 2978 movaps %xmm11,-0x58(%r9) 2979 movaps %xmm12,-0x48(%r9) 2980 movaps %xmm13,-0x38(%r9) 2981 movaps %xmm14,-0x28(%r9) 2982 movaps %xmm15,-0x18(%r9) 2983.L16x_body: 2984___ 2985$code.=<<___; 2986 vzeroupper 2987 2988 lea .Lsigma(%rip),%r10 2989 vbroadcasti32x4 (%r10),$xa3 # key[0] 2990 vbroadcasti32x4 ($key),$xb3 # key[1] 2991 vbroadcasti32x4 16($key),$xc3 # key[2] 2992 vbroadcasti32x4 ($counter),$xd3 # key[3] 2993 2994 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
2995 vpshufd \$0x55,$xa3,$xa1 2996 vpshufd \$0xaa,$xa3,$xa2 2997 vpshufd \$0xff,$xa3,$xa3 2998 vmovdqa64 $xa0,@key[0] 2999 vmovdqa64 $xa1,@key[1] 3000 vmovdqa64 $xa2,@key[2] 3001 vmovdqa64 $xa3,@key[3] 3002 3003 vpshufd \$0x00,$xb3,$xb0 3004 vpshufd \$0x55,$xb3,$xb1 3005 vpshufd \$0xaa,$xb3,$xb2 3006 vpshufd \$0xff,$xb3,$xb3 3007 vmovdqa64 $xb0,@key[4] 3008 vmovdqa64 $xb1,@key[5] 3009 vmovdqa64 $xb2,@key[6] 3010 vmovdqa64 $xb3,@key[7] 3011 3012 vpshufd \$0x00,$xc3,$xc0 3013 vpshufd \$0x55,$xc3,$xc1 3014 vpshufd \$0xaa,$xc3,$xc2 3015 vpshufd \$0xff,$xc3,$xc3 3016 vmovdqa64 $xc0,@key[8] 3017 vmovdqa64 $xc1,@key[9] 3018 vmovdqa64 $xc2,@key[10] 3019 vmovdqa64 $xc3,@key[11] 3020 3021 vpshufd \$0x00,$xd3,$xd0 3022 vpshufd \$0x55,$xd3,$xd1 3023 vpshufd \$0xaa,$xd3,$xd2 3024 vpshufd \$0xff,$xd3,$xd3 3025 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet 3026 vmovdqa64 $xd0,@key[12] 3027 vmovdqa64 $xd1,@key[13] 3028 vmovdqa64 $xd2,@key[14] 3029 vmovdqa64 $xd3,@key[15] 3030 3031 mov \$10,%eax 3032 jmp .Loop16x 3033 3034.align 32 3035.Loop_outer16x: 3036 vpbroadcastd 0(%r10),$xa0 # reload key 3037 vpbroadcastd 4(%r10),$xa1 3038 vpbroadcastd 8(%r10),$xa2 3039 vpbroadcastd 12(%r10),$xa3 3040 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters 3041 vmovdqa64 @key[4],$xb0 3042 vmovdqa64 @key[5],$xb1 3043 vmovdqa64 @key[6],$xb2 3044 vmovdqa64 @key[7],$xb3 3045 vmovdqa64 @key[8],$xc0 3046 vmovdqa64 @key[9],$xc1 3047 vmovdqa64 @key[10],$xc2 3048 vmovdqa64 @key[11],$xc3 3049 vmovdqa64 @key[12],$xd0 3050 vmovdqa64 @key[13],$xd1 3051 vmovdqa64 @key[14],$xd2 3052 vmovdqa64 @key[15],$xd3 3053 3054 vmovdqa64 $xa0,@key[0] 3055 vmovdqa64 $xa1,@key[1] 3056 vmovdqa64 $xa2,@key[2] 3057 vmovdqa64 $xa3,@key[3] 3058 3059 mov \$10,%eax 3060 jmp .Loop16x 3061 3062.align 32 3063.Loop16x: 3064___ 3065 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3066 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3067$code.=<<___; 3068 dec %eax 3069 jnz .Loop16x 3070 3071 vpaddd @key[0],$xa0,$xa0 # accumulate key 3072 vpaddd @key[1],$xa1,$xa1 3073 vpaddd @key[2],$xa2,$xa2 3074 vpaddd @key[3],$xa3,$xa3 3075 3076 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3077 vpunpckldq $xa3,$xa2,$xt3 3078 vpunpckhdq $xa1,$xa0,$xa0 3079 vpunpckhdq $xa3,$xa2,$xa2 3080 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" 3081 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3082 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3083 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3084___ 3085 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3086$code.=<<___; 3087 vpaddd @key[4],$xb0,$xb0 3088 vpaddd @key[5],$xb1,$xb1 3089 vpaddd @key[6],$xb2,$xb2 3090 vpaddd @key[7],$xb3,$xb3 3091 3092 vpunpckldq $xb1,$xb0,$xt2 3093 vpunpckldq $xb3,$xb2,$xt3 3094 vpunpckhdq $xb1,$xb0,$xb0 3095 vpunpckhdq $xb3,$xb2,$xb2 3096 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3097 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3098 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3099 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3100___ 3101 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3102$code.=<<___; 3103 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further 3104 vshufi32x4 \$0xee,$xb0,$xa0,$xb0 3105 vshufi32x4 \$0x44,$xb1,$xa1,$xa0 3106 vshufi32x4 \$0xee,$xb1,$xa1,$xb1 3107 vshufi32x4 \$0x44,$xb2,$xa2,$xa1 3108 vshufi32x4 \$0xee,$xb2,$xa2,$xb2 3109 vshufi32x4 \$0x44,$xb3,$xa3,$xa2 3110 vshufi32x4 \$0xee,$xb3,$xa3,$xb3 3111___ 3112 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3113$code.=<<___; 3114 vpaddd @key[8],$xc0,$xc0 3115 vpaddd @key[9],$xc1,$xc1 3116 vpaddd @key[10],$xc2,$xc2 3117 vpaddd @key[11],$xc3,$xc3 3118 3119 vpunpckldq 
$xc1,$xc0,$xt2 3120 vpunpckldq $xc3,$xc2,$xt3 3121 vpunpckhdq $xc1,$xc0,$xc0 3122 vpunpckhdq $xc3,$xc2,$xc2 3123 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3124 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3125 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3126 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3127___ 3128 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3129$code.=<<___; 3130 vpaddd @key[12],$xd0,$xd0 3131 vpaddd @key[13],$xd1,$xd1 3132 vpaddd @key[14],$xd2,$xd2 3133 vpaddd @key[15],$xd3,$xd3 3134 3135 vpunpckldq $xd1,$xd0,$xt2 3136 vpunpckldq $xd3,$xd2,$xt3 3137 vpunpckhdq $xd1,$xd0,$xd0 3138 vpunpckhdq $xd3,$xd2,$xd2 3139 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3140 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3141 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3142 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3143___ 3144 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3145$code.=<<___; 3146 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further 3147 vshufi32x4 \$0xee,$xd0,$xc0,$xd0 3148 vshufi32x4 \$0x44,$xd1,$xc1,$xc0 3149 vshufi32x4 \$0xee,$xd1,$xc1,$xd1 3150 vshufi32x4 \$0x44,$xd2,$xc2,$xc1 3151 vshufi32x4 \$0xee,$xd2,$xc2,$xd2 3152 vshufi32x4 \$0x44,$xd3,$xc3,$xc2 3153 vshufi32x4 \$0xee,$xd3,$xc3,$xd3 3154___ 3155 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3156$code.=<<___; 3157 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further 3158 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 3159 vshufi32x4 \$0x88,$xd0,$xb0,$xc0 3160 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 3161 vshufi32x4 \$0x88,$xc1,$xa1,$xt1 3162 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 3163 vshufi32x4 \$0x88,$xd1,$xb1,$xc1 3164 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 3165 vshufi32x4 \$0x88,$xc2,$xa2,$xt2 3166 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 3167 vshufi32x4 \$0x88,$xd2,$xb2,$xc2 3168 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 3169 vshufi32x4 \$0x88,$xc3,$xa3,$xt3 3170 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 3171 vshufi32x4 \$0x88,$xd3,$xb3,$xc3 3172 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 3173___ 3174 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= 3175 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); 3176 3177 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, 3178 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = 3179 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3180 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3181$code.=<<___; 3182 cmp \$64*16,$len 3183 jb .Ltail16x 3184 3185 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3186 vpxord 0x40($inp),$xb0,$xb0 3187 vpxord 0x80($inp),$xc0,$xc0 3188 vpxord 0xc0($inp),$xd0,$xd0 3189 vmovdqu32 $xa0,0x00($out) 3190 vmovdqu32 $xb0,0x40($out) 3191 vmovdqu32 $xc0,0x80($out) 3192 vmovdqu32 $xd0,0xc0($out) 3193 3194 vpxord 0x100($inp),$xa1,$xa1 3195 vpxord 0x140($inp),$xb1,$xb1 3196 vpxord 0x180($inp),$xc1,$xc1 3197 vpxord 0x1c0($inp),$xd1,$xd1 3198 vmovdqu32 $xa1,0x100($out) 3199 vmovdqu32 $xb1,0x140($out) 3200 vmovdqu32 $xc1,0x180($out) 3201 vmovdqu32 $xd1,0x1c0($out) 3202 3203 vpxord 0x200($inp),$xa2,$xa2 3204 vpxord 0x240($inp),$xb2,$xb2 3205 vpxord 0x280($inp),$xc2,$xc2 3206 vpxord 0x2c0($inp),$xd2,$xd2 3207 vmovdqu32 $xa2,0x200($out) 3208 vmovdqu32 $xb2,0x240($out) 3209 vmovdqu32 $xc2,0x280($out) 3210 vmovdqu32 $xd2,0x2c0($out) 3211 3212 vpxord 0x300($inp),$xa3,$xa3 3213 vpxord 0x340($inp),$xb3,$xb3 3214 vpxord 0x380($inp),$xc3,$xc3 3215 vpxord 0x3c0($inp),$xd3,$xd3 3216 lea 0x400($inp),$inp 3217 vmovdqu32 $xa3,0x300($out) 3218 vmovdqu32 $xb3,0x340($out) 3219 vmovdqu32 $xc3,0x380($out) 3220 vmovdqu32 $xd3,0x3c0($out) 3221 lea 0x400($out),$out 3222 3223 sub \$64*16,$len 3224 jnz .Loop_outer16x 3225 3226 jmp .Ldone16x 3227 3228.align 32 3229.Ltail16x: 3230 xor %r10,%r10 3231 sub $inp,$out 3232 cmp 
\$64*1,$len 3233 jb .Less_than_64_16x 3234 vpxord ($inp),$xa0,$xa0 # xor with input 3235 vmovdqu32 $xa0,($out,$inp) 3236 je .Ldone16x 3237 vmovdqa32 $xb0,$xa0 3238 lea 64($inp),$inp 3239 3240 cmp \$64*2,$len 3241 jb .Less_than_64_16x 3242 vpxord ($inp),$xb0,$xb0 3243 vmovdqu32 $xb0,($out,$inp) 3244 je .Ldone16x 3245 vmovdqa32 $xc0,$xa0 3246 lea 64($inp),$inp 3247 3248 cmp \$64*3,$len 3249 jb .Less_than_64_16x 3250 vpxord ($inp),$xc0,$xc0 3251 vmovdqu32 $xc0,($out,$inp) 3252 je .Ldone16x 3253 vmovdqa32 $xd0,$xa0 3254 lea 64($inp),$inp 3255 3256 cmp \$64*4,$len 3257 jb .Less_than_64_16x 3258 vpxord ($inp),$xd0,$xd0 3259 vmovdqu32 $xd0,($out,$inp) 3260 je .Ldone16x 3261 vmovdqa32 $xa1,$xa0 3262 lea 64($inp),$inp 3263 3264 cmp \$64*5,$len 3265 jb .Less_than_64_16x 3266 vpxord ($inp),$xa1,$xa1 3267 vmovdqu32 $xa1,($out,$inp) 3268 je .Ldone16x 3269 vmovdqa32 $xb1,$xa0 3270 lea 64($inp),$inp 3271 3272 cmp \$64*6,$len 3273 jb .Less_than_64_16x 3274 vpxord ($inp),$xb1,$xb1 3275 vmovdqu32 $xb1,($out,$inp) 3276 je .Ldone16x 3277 vmovdqa32 $xc1,$xa0 3278 lea 64($inp),$inp 3279 3280 cmp \$64*7,$len 3281 jb .Less_than_64_16x 3282 vpxord ($inp),$xc1,$xc1 3283 vmovdqu32 $xc1,($out,$inp) 3284 je .Ldone16x 3285 vmovdqa32 $xd1,$xa0 3286 lea 64($inp),$inp 3287 3288 cmp \$64*8,$len 3289 jb .Less_than_64_16x 3290 vpxord ($inp),$xd1,$xd1 3291 vmovdqu32 $xd1,($out,$inp) 3292 je .Ldone16x 3293 vmovdqa32 $xa2,$xa0 3294 lea 64($inp),$inp 3295 3296 cmp \$64*9,$len 3297 jb .Less_than_64_16x 3298 vpxord ($inp),$xa2,$xa2 3299 vmovdqu32 $xa2,($out,$inp) 3300 je .Ldone16x 3301 vmovdqa32 $xb2,$xa0 3302 lea 64($inp),$inp 3303 3304 cmp \$64*10,$len 3305 jb .Less_than_64_16x 3306 vpxord ($inp),$xb2,$xb2 3307 vmovdqu32 $xb2,($out,$inp) 3308 je .Ldone16x 3309 vmovdqa32 $xc2,$xa0 3310 lea 64($inp),$inp 3311 3312 cmp \$64*11,$len 3313 jb .Less_than_64_16x 3314 vpxord ($inp),$xc2,$xc2 3315 vmovdqu32 $xc2,($out,$inp) 3316 je .Ldone16x 3317 vmovdqa32 $xd2,$xa0 3318 lea 64($inp),$inp 3319 3320 cmp \$64*12,$len 3321 jb .Less_than_64_16x 3322 vpxord ($inp),$xd2,$xd2 3323 vmovdqu32 $xd2,($out,$inp) 3324 je .Ldone16x 3325 vmovdqa32 $xa3,$xa0 3326 lea 64($inp),$inp 3327 3328 cmp \$64*13,$len 3329 jb .Less_than_64_16x 3330 vpxord ($inp),$xa3,$xa3 3331 vmovdqu32 $xa3,($out,$inp) 3332 je .Ldone16x 3333 vmovdqa32 $xb3,$xa0 3334 lea 64($inp),$inp 3335 3336 cmp \$64*14,$len 3337 jb .Less_than_64_16x 3338 vpxord ($inp),$xb3,$xb3 3339 vmovdqu32 $xb3,($out,$inp) 3340 je .Ldone16x 3341 vmovdqa32 $xc3,$xa0 3342 lea 64($inp),$inp 3343 3344 cmp \$64*15,$len 3345 jb .Less_than_64_16x 3346 vpxord ($inp),$xc3,$xc3 3347 vmovdqu32 $xc3,($out,$inp) 3348 je .Ldone16x 3349 vmovdqa32 $xd3,$xa0 3350 lea 64($inp),$inp 3351 3352.Less_than_64_16x: 3353 vmovdqa32 $xa0,0x00(%rsp) 3354 lea ($out,$inp),$out 3355 and \$63,$len 3356 3357.Loop_tail16x: 3358 movzb ($inp,%r10),%eax 3359 movzb (%rsp,%r10),%ecx 3360 lea 1(%r10),%r10 3361 xor %ecx,%eax 3362 mov %al,-1($out,%r10) 3363 dec $len 3364 jnz .Loop_tail16x 3365 3366 vpxord $xa0,$xa0,$xa0 3367 vmovdqa32 $xa0,0(%rsp) 3368 3369.Ldone16x: 3370 vzeroall 3371___ 3372$code.=<<___ if ($win64); 3373 movaps -0xa8(%r9),%xmm6 3374 movaps -0x98(%r9),%xmm7 3375 movaps -0x88(%r9),%xmm8 3376 movaps -0x78(%r9),%xmm9 3377 movaps -0x68(%r9),%xmm10 3378 movaps -0x58(%r9),%xmm11 3379 movaps -0x48(%r9),%xmm12 3380 movaps -0x38(%r9),%xmm13 3381 movaps -0x28(%r9),%xmm14 3382 movaps -0x18(%r9),%xmm15 3383___ 3384$code.=<<___; 3385 lea (%r9),%rsp 3386.cfi_def_cfa_register %rsp 3387.L16x_epilogue: 3388 ret 3389.cfi_endproc 3390.size 
ChaCha20_16x,.-ChaCha20_16x 3391___ 3392 3393# switch to %ymm domain 3394($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3395 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); 3396@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, 3397 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); 3398@key=map("%ymm$_",(16..31)); 3399($xt0,$xt1,$xt2,$xt3)=@key[0..3]; 3400 3401$code.=<<___; 3402.type ChaCha20_8xvl,\@function,5 3403.align 32 3404ChaCha20_8xvl: 3405.cfi_startproc 3406.LChaCha20_8xvl: 3407 mov %rsp,%r9 # frame register 3408.cfi_def_cfa_register %r9 3409 sub \$64+$xframe,%rsp 3410 and \$-64,%rsp 3411___ 3412$code.=<<___ if ($win64); 3413 movaps %xmm6,-0xa8(%r9) 3414 movaps %xmm7,-0x98(%r9) 3415 movaps %xmm8,-0x88(%r9) 3416 movaps %xmm9,-0x78(%r9) 3417 movaps %xmm10,-0x68(%r9) 3418 movaps %xmm11,-0x58(%r9) 3419 movaps %xmm12,-0x48(%r9) 3420 movaps %xmm13,-0x38(%r9) 3421 movaps %xmm14,-0x28(%r9) 3422 movaps %xmm15,-0x18(%r9) 3423.L8xvl_body: 3424___ 3425$code.=<<___; 3426 vzeroupper 3427 3428 lea .Lsigma(%rip),%r10 3429 vbroadcasti128 (%r10),$xa3 # key[0] 3430 vbroadcasti128 ($key),$xb3 # key[1] 3431 vbroadcasti128 16($key),$xc3 # key[2] 3432 vbroadcasti128 ($counter),$xd3 # key[3] 3433 3434 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 3435 vpshufd \$0x55,$xa3,$xa1 3436 vpshufd \$0xaa,$xa3,$xa2 3437 vpshufd \$0xff,$xa3,$xa3 3438 vmovdqa64 $xa0,@key[0] 3439 vmovdqa64 $xa1,@key[1] 3440 vmovdqa64 $xa2,@key[2] 3441 vmovdqa64 $xa3,@key[3] 3442 3443 vpshufd \$0x00,$xb3,$xb0 3444 vpshufd \$0x55,$xb3,$xb1 3445 vpshufd \$0xaa,$xb3,$xb2 3446 vpshufd \$0xff,$xb3,$xb3 3447 vmovdqa64 $xb0,@key[4] 3448 vmovdqa64 $xb1,@key[5] 3449 vmovdqa64 $xb2,@key[6] 3450 vmovdqa64 $xb3,@key[7] 3451 3452 vpshufd \$0x00,$xc3,$xc0 3453 vpshufd \$0x55,$xc3,$xc1 3454 vpshufd \$0xaa,$xc3,$xc2 3455 vpshufd \$0xff,$xc3,$xc3 3456 vmovdqa64 $xc0,@key[8] 3457 vmovdqa64 $xc1,@key[9] 3458 vmovdqa64 $xc2,@key[10] 3459 vmovdqa64 $xc3,@key[11] 3460 3461 vpshufd \$0x00,$xd3,$xd0 3462 vpshufd \$0x55,$xd3,$xd1 3463 vpshufd \$0xaa,$xd3,$xd2 3464 vpshufd \$0xff,$xd3,$xd3 3465 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet 3466 vmovdqa64 $xd0,@key[12] 3467 vmovdqa64 $xd1,@key[13] 3468 vmovdqa64 $xd2,@key[14] 3469 vmovdqa64 $xd3,@key[15] 3470 3471 mov \$10,%eax 3472 jmp .Loop8xvl 3473 3474.align 32 3475.Loop_outer8xvl: 3476 #vpbroadcastd 0(%r10),$xa0 # reload key 3477 #vpbroadcastd 4(%r10),$xa1 3478 vpbroadcastd 8(%r10),$xa2 3479 vpbroadcastd 12(%r10),$xa3 3480 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters 3481 vmovdqa64 @key[4],$xb0 3482 vmovdqa64 @key[5],$xb1 3483 vmovdqa64 @key[6],$xb2 3484 vmovdqa64 @key[7],$xb3 3485 vmovdqa64 @key[8],$xc0 3486 vmovdqa64 @key[9],$xc1 3487 vmovdqa64 @key[10],$xc2 3488 vmovdqa64 @key[11],$xc3 3489 vmovdqa64 @key[12],$xd0 3490 vmovdqa64 @key[13],$xd1 3491 vmovdqa64 @key[14],$xd2 3492 vmovdqa64 @key[15],$xd3 3493 3494 vmovdqa64 $xa0,@key[0] 3495 vmovdqa64 $xa1,@key[1] 3496 vmovdqa64 $xa2,@key[2] 3497 vmovdqa64 $xa3,@key[3] 3498 3499 mov \$10,%eax 3500 jmp .Loop8xvl 3501 3502.align 32 3503.Loop8xvl: 3504___ 3505 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } 3506 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } 3507$code.=<<___; 3508 dec %eax 3509 jnz .Loop8xvl 3510 3511 vpaddd @key[0],$xa0,$xa0 # accumulate key 3512 vpaddd @key[1],$xa1,$xa1 3513 vpaddd @key[2],$xa2,$xa2 3514 vpaddd @key[3],$xa3,$xa3 3515 3516 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data 3517 vpunpckldq $xa3,$xa2,$xt3 3518 vpunpckhdq $xa1,$xa0,$xa0 3519 vpunpckhdq $xa3,$xa2,$xa2 3520 vpunpcklqdq 
$xt3,$xt2,$xa1 # "a0" 3521 vpunpckhqdq $xt3,$xt2,$xt2 # "a1" 3522 vpunpcklqdq $xa2,$xa0,$xa3 # "a2" 3523 vpunpckhqdq $xa2,$xa0,$xa0 # "a3" 3524___ 3525 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); 3526$code.=<<___; 3527 vpaddd @key[4],$xb0,$xb0 3528 vpaddd @key[5],$xb1,$xb1 3529 vpaddd @key[6],$xb2,$xb2 3530 vpaddd @key[7],$xb3,$xb3 3531 3532 vpunpckldq $xb1,$xb0,$xt2 3533 vpunpckldq $xb3,$xb2,$xt3 3534 vpunpckhdq $xb1,$xb0,$xb0 3535 vpunpckhdq $xb3,$xb2,$xb2 3536 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" 3537 vpunpckhqdq $xt3,$xt2,$xt2 # "b1" 3538 vpunpcklqdq $xb2,$xb0,$xb3 # "b2" 3539 vpunpckhqdq $xb2,$xb0,$xb0 # "b3" 3540___ 3541 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); 3542$code.=<<___; 3543 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further 3544 vshufi32x4 \$3,$xb0,$xa0,$xb0 3545 vshufi32x4 \$0,$xb1,$xa1,$xa0 3546 vshufi32x4 \$3,$xb1,$xa1,$xb1 3547 vshufi32x4 \$0,$xb2,$xa2,$xa1 3548 vshufi32x4 \$3,$xb2,$xa2,$xb2 3549 vshufi32x4 \$0,$xb3,$xa3,$xa2 3550 vshufi32x4 \$3,$xb3,$xa3,$xb3 3551___ 3552 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); 3553$code.=<<___; 3554 vpaddd @key[8],$xc0,$xc0 3555 vpaddd @key[9],$xc1,$xc1 3556 vpaddd @key[10],$xc2,$xc2 3557 vpaddd @key[11],$xc3,$xc3 3558 3559 vpunpckldq $xc1,$xc0,$xt2 3560 vpunpckldq $xc3,$xc2,$xt3 3561 vpunpckhdq $xc1,$xc0,$xc0 3562 vpunpckhdq $xc3,$xc2,$xc2 3563 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" 3564 vpunpckhqdq $xt3,$xt2,$xt2 # "c1" 3565 vpunpcklqdq $xc2,$xc0,$xc3 # "c2" 3566 vpunpckhqdq $xc2,$xc0,$xc0 # "c3" 3567___ 3568 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); 3569$code.=<<___; 3570 vpaddd @key[12],$xd0,$xd0 3571 vpaddd @key[13],$xd1,$xd1 3572 vpaddd @key[14],$xd2,$xd2 3573 vpaddd @key[15],$xd3,$xd3 3574 3575 vpunpckldq $xd1,$xd0,$xt2 3576 vpunpckldq $xd3,$xd2,$xt3 3577 vpunpckhdq $xd1,$xd0,$xd0 3578 vpunpckhdq $xd3,$xd2,$xd2 3579 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" 3580 vpunpckhqdq $xt3,$xt2,$xt2 # "d1" 3581 vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 3582 vpunpckhqdq $xd2,$xd0,$xd0 # "d3" 3583___ 3584 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); 3585$code.=<<___; 3586 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further 3587 vperm2i128 \$0x31,$xd0,$xc0,$xd0 3588 vperm2i128 \$0x20,$xd1,$xc1,$xc0 3589 vperm2i128 \$0x31,$xd1,$xc1,$xd1 3590 vperm2i128 \$0x20,$xd2,$xc2,$xc1 3591 vperm2i128 \$0x31,$xd2,$xc2,$xd2 3592 vperm2i128 \$0x20,$xd3,$xc3,$xc2 3593 vperm2i128 \$0x31,$xd3,$xc3,$xd3 3594___ 3595 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 3596 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= 3597 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); 3598$code.=<<___; 3599 cmp \$64*8,$len 3600 jb .Ltail8xvl 3601 3602 mov \$0x80,%eax # size optimization 3603 vpxord 0x00($inp),$xa0,$xa0 # xor with input 3604 vpxor 0x20($inp),$xb0,$xb0 3605 vpxor 0x40($inp),$xc0,$xc0 3606 vpxor 0x60($inp),$xd0,$xd0 3607 lea ($inp,%rax),$inp # size optimization 3608 vmovdqu32 $xa0,0x00($out) 3609 vmovdqu $xb0,0x20($out) 3610 vmovdqu $xc0,0x40($out) 3611 vmovdqu $xd0,0x60($out) 3612 lea ($out,%rax),$out # size optimization 3613 3614 vpxor 0x00($inp),$xa1,$xa1 3615 vpxor 0x20($inp),$xb1,$xb1 3616 vpxor 0x40($inp),$xc1,$xc1 3617 vpxor 0x60($inp),$xd1,$xd1 3618 lea ($inp,%rax),$inp # size optimization 3619 vmovdqu $xa1,0x00($out) 3620 vmovdqu $xb1,0x20($out) 3621 vmovdqu $xc1,0x40($out) 3622 vmovdqu $xd1,0x60($out) 3623 lea ($out,%rax),$out # size optimization 3624 3625 vpxord 0x00($inp),$xa2,$xa2 3626 vpxor 0x20($inp),$xb2,$xb2 3627 vpxor 0x40($inp),$xc2,$xc2 3628 vpxor 0x60($inp),$xd2,$xd2 3629 lea ($inp,%rax),$inp # 
size optimization 3630 vmovdqu32 $xa2,0x00($out) 3631 vmovdqu $xb2,0x20($out) 3632 vmovdqu $xc2,0x40($out) 3633 vmovdqu $xd2,0x60($out) 3634 lea ($out,%rax),$out # size optimization 3635 3636 vpxor 0x00($inp),$xa3,$xa3 3637 vpxor 0x20($inp),$xb3,$xb3 3638 vpxor 0x40($inp),$xc3,$xc3 3639 vpxor 0x60($inp),$xd3,$xd3 3640 lea ($inp,%rax),$inp # size optimization 3641 vmovdqu $xa3,0x00($out) 3642 vmovdqu $xb3,0x20($out) 3643 vmovdqu $xc3,0x40($out) 3644 vmovdqu $xd3,0x60($out) 3645 lea ($out,%rax),$out # size optimization 3646 3647 vpbroadcastd 0(%r10),%ymm0 # reload key 3648 vpbroadcastd 4(%r10),%ymm1 3649 3650 sub \$64*8,$len 3651 jnz .Loop_outer8xvl 3652 3653 jmp .Ldone8xvl 3654 3655.align 32 3656.Ltail8xvl: 3657 vmovdqa64 $xa0,%ymm8 # size optimization 3658___ 3659$xa0 = "%ymm8"; 3660$code.=<<___; 3661 xor %r10,%r10 3662 sub $inp,$out 3663 cmp \$64*1,$len 3664 jb .Less_than_64_8xvl 3665 vpxor 0x00($inp),$xa0,$xa0 # xor with input 3666 vpxor 0x20($inp),$xb0,$xb0 3667 vmovdqu $xa0,0x00($out,$inp) 3668 vmovdqu $xb0,0x20($out,$inp) 3669 je .Ldone8xvl 3670 vmovdqa $xc0,$xa0 3671 vmovdqa $xd0,$xb0 3672 lea 64($inp),$inp 3673 3674 cmp \$64*2,$len 3675 jb .Less_than_64_8xvl 3676 vpxor 0x00($inp),$xc0,$xc0 3677 vpxor 0x20($inp),$xd0,$xd0 3678 vmovdqu $xc0,0x00($out,$inp) 3679 vmovdqu $xd0,0x20($out,$inp) 3680 je .Ldone8xvl 3681 vmovdqa $xa1,$xa0 3682 vmovdqa $xb1,$xb0 3683 lea 64($inp),$inp 3684 3685 cmp \$64*3,$len 3686 jb .Less_than_64_8xvl 3687 vpxor 0x00($inp),$xa1,$xa1 3688 vpxor 0x20($inp),$xb1,$xb1 3689 vmovdqu $xa1,0x00($out,$inp) 3690 vmovdqu $xb1,0x20($out,$inp) 3691 je .Ldone8xvl 3692 vmovdqa $xc1,$xa0 3693 vmovdqa $xd1,$xb0 3694 lea 64($inp),$inp 3695 3696 cmp \$64*4,$len 3697 jb .Less_than_64_8xvl 3698 vpxor 0x00($inp),$xc1,$xc1 3699 vpxor 0x20($inp),$xd1,$xd1 3700 vmovdqu $xc1,0x00($out,$inp) 3701 vmovdqu $xd1,0x20($out,$inp) 3702 je .Ldone8xvl 3703 vmovdqa32 $xa2,$xa0 3704 vmovdqa $xb2,$xb0 3705 lea 64($inp),$inp 3706 3707 cmp \$64*5,$len 3708 jb .Less_than_64_8xvl 3709 vpxord 0x00($inp),$xa2,$xa2 3710 vpxor 0x20($inp),$xb2,$xb2 3711 vmovdqu32 $xa2,0x00($out,$inp) 3712 vmovdqu $xb2,0x20($out,$inp) 3713 je .Ldone8xvl 3714 vmovdqa $xc2,$xa0 3715 vmovdqa $xd2,$xb0 3716 lea 64($inp),$inp 3717 3718 cmp \$64*6,$len 3719 jb .Less_than_64_8xvl 3720 vpxor 0x00($inp),$xc2,$xc2 3721 vpxor 0x20($inp),$xd2,$xd2 3722 vmovdqu $xc2,0x00($out,$inp) 3723 vmovdqu $xd2,0x20($out,$inp) 3724 je .Ldone8xvl 3725 vmovdqa $xa3,$xa0 3726 vmovdqa $xb3,$xb0 3727 lea 64($inp),$inp 3728 3729 cmp \$64*7,$len 3730 jb .Less_than_64_8xvl 3731 vpxor 0x00($inp),$xa3,$xa3 3732 vpxor 0x20($inp),$xb3,$xb3 3733 vmovdqu $xa3,0x00($out,$inp) 3734 vmovdqu $xb3,0x20($out,$inp) 3735 je .Ldone8xvl 3736 vmovdqa $xc3,$xa0 3737 vmovdqa $xd3,$xb0 3738 lea 64($inp),$inp 3739 3740.Less_than_64_8xvl: 3741 vmovdqa $xa0,0x00(%rsp) 3742 vmovdqa $xb0,0x20(%rsp) 3743 lea ($out,$inp),$out 3744 and \$63,$len 3745 3746.Loop_tail8xvl: 3747 movzb ($inp,%r10),%eax 3748 movzb (%rsp,%r10),%ecx 3749 lea 1(%r10),%r10 3750 xor %ecx,%eax 3751 mov %al,-1($out,%r10) 3752 dec $len 3753 jnz .Loop_tail8xvl 3754 3755 vpxor $xa0,$xa0,$xa0 3756 vmovdqa $xa0,0x00(%rsp) 3757 vmovdqa $xa0,0x20(%rsp) 3758 3759.Ldone8xvl: 3760 vzeroall 3761___ 3762$code.=<<___ if ($win64); 3763 movaps -0xa8(%r9),%xmm6 3764 movaps -0x98(%r9),%xmm7 3765 movaps -0x88(%r9),%xmm8 3766 movaps -0x78(%r9),%xmm9 3767 movaps -0x68(%r9),%xmm10 3768 movaps -0x58(%r9),%xmm11 3769 movaps -0x48(%r9),%xmm12 3770 movaps -0x38(%r9),%xmm13 3771 movaps -0x28(%r9),%xmm14 3772 movaps -0x18(%r9),%xmm15 
3773___ 3774$code.=<<___; 3775 lea (%r9),%rsp 3776.cfi_def_cfa_register %rsp 3777.L8xvl_epilogue: 3778 ret 3779.cfi_endproc 3780.size ChaCha20_8xvl,.-ChaCha20_8xvl 3781___ 3782} 3783 3784# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3785# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3786if ($win64) { 3787$rec="%rcx"; 3788$frame="%rdx"; 3789$context="%r8"; 3790$disp="%r9"; 3791 3792$code.=<<___; 3793.extern __imp_RtlVirtualUnwind 3794.type se_handler,\@abi-omnipotent 3795.align 16 3796se_handler: 3797 push %rsi 3798 push %rdi 3799 push %rbx 3800 push %rbp 3801 push %r12 3802 push %r13 3803 push %r14 3804 push %r15 3805 pushfq 3806 sub \$64,%rsp 3807 3808 mov 120($context),%rax # pull context->Rax 3809 mov 248($context),%rbx # pull context->Rip 3810 3811 mov 8($disp),%rsi # disp->ImageBase 3812 mov 56($disp),%r11 # disp->HandlerData 3813 3814 lea .Lctr32_body(%rip),%r10 3815 cmp %r10,%rbx # context->Rip<.Lprologue 3816 jb .Lcommon_seh_tail 3817 3818 mov 152($context),%rax # pull context->Rsp 3819 3820 lea .Lno_data(%rip),%r10 # epilogue label 3821 cmp %r10,%rbx # context->Rip>=.Lepilogue 3822 jae .Lcommon_seh_tail 3823 3824 lea 64+24+48(%rax),%rax 3825 3826 mov -8(%rax),%rbx 3827 mov -16(%rax),%rbp 3828 mov -24(%rax),%r12 3829 mov -32(%rax),%r13 3830 mov -40(%rax),%r14 3831 mov -48(%rax),%r15 3832 mov %rbx,144($context) # restore context->Rbx 3833 mov %rbp,160($context) # restore context->Rbp 3834 mov %r12,216($context) # restore context->R12 3835 mov %r13,224($context) # restore context->R13 3836 mov %r14,232($context) # restore context->R14 3837 mov %r15,240($context) # restore context->R14 3838 3839.Lcommon_seh_tail: 3840 mov 8(%rax),%rdi 3841 mov 16(%rax),%rsi 3842 mov %rax,152($context) # restore context->Rsp 3843 mov %rsi,168($context) # restore context->Rsi 3844 mov %rdi,176($context) # restore context->Rdi 3845 3846 mov 40($disp),%rdi # disp->ContextRecord 3847 mov $context,%rsi # context 3848 mov \$154,%ecx # sizeof(CONTEXT) 3849 .long 0xa548f3fc # cld; rep movsq 3850 3851 mov $disp,%rsi 3852 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3853 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3854 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3855 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3856 mov 40(%rsi),%r10 # disp->ContextRecord 3857 lea 56(%rsi),%r11 # &disp->HandlerData 3858 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3859 mov %r10,32(%rsp) # arg5 3860 mov %r11,40(%rsp) # arg6 3861 mov %r12,48(%rsp) # arg7 3862 mov %rcx,56(%rsp) # arg8, (NULL) 3863 call *__imp_RtlVirtualUnwind(%rip) 3864 3865 mov \$1,%eax # ExceptionContinueSearch 3866 add \$64,%rsp 3867 popfq 3868 pop %r15 3869 pop %r14 3870 pop %r13 3871 pop %r12 3872 pop %rbp 3873 pop %rbx 3874 pop %rdi 3875 pop %rsi 3876 ret 3877.size se_handler,.-se_handler 3878 3879.type simd_handler,\@abi-omnipotent 3880.align 16 3881simd_handler: 3882 push %rsi 3883 push %rdi 3884 push %rbx 3885 push %rbp 3886 push %r12 3887 push %r13 3888 push %r14 3889 push %r15 3890 pushfq 3891 sub \$64,%rsp 3892 3893 mov 120($context),%rax # pull context->Rax 3894 mov 248($context),%rbx # pull context->Rip 3895 3896 mov 8($disp),%rsi # disp->ImageBase 3897 mov 56($disp),%r11 # disp->HandlerData 3898 3899 mov 0(%r11),%r10d # HandlerData[0] 3900 lea (%rsi,%r10),%r10 # prologue label 3901 cmp %r10,%rbx # context->Rip<prologue label 3902 jb .Lcommon_seh_tail 3903 3904 mov 192($context),%rax # pull context->R9 3905 3906 mov 4(%r11),%r10d # HandlerData[1] 3907 mov 8(%r11),%ecx # HandlerData[2] 3908 lea (%rsi,%r10),%r10 # epilogue label 3909 cmp 
%r10,%rbx # context->Rip>=epilogue label 3910 jae .Lcommon_seh_tail 3911 3912 neg %rcx 3913 lea -8(%rax,%rcx),%rsi 3914 lea 512($context),%rdi # &context.Xmm6 3915 neg %ecx 3916 shr \$3,%ecx 3917 .long 0xa548f3fc # cld; rep movsq 3918 3919 jmp .Lcommon_seh_tail 3920.size simd_handler,.-simd_handler 3921 3922.section .pdata 3923.align 4 3924 .rva .LSEH_begin_ChaCha20_ctr32 3925 .rva .LSEH_end_ChaCha20_ctr32 3926 .rva .LSEH_info_ChaCha20_ctr32 3927 3928 .rva .LSEH_begin_ChaCha20_ssse3 3929 .rva .LSEH_end_ChaCha20_ssse3 3930 .rva .LSEH_info_ChaCha20_ssse3 3931 3932 .rva .LSEH_begin_ChaCha20_128 3933 .rva .LSEH_end_ChaCha20_128 3934 .rva .LSEH_info_ChaCha20_128 3935 3936 .rva .LSEH_begin_ChaCha20_4x 3937 .rva .LSEH_end_ChaCha20_4x 3938 .rva .LSEH_info_ChaCha20_4x 3939___ 3940$code.=<<___ if ($avx); 3941 .rva .LSEH_begin_ChaCha20_4xop 3942 .rva .LSEH_end_ChaCha20_4xop 3943 .rva .LSEH_info_ChaCha20_4xop 3944___ 3945$code.=<<___ if ($avx>1); 3946 .rva .LSEH_begin_ChaCha20_8x 3947 .rva .LSEH_end_ChaCha20_8x 3948 .rva .LSEH_info_ChaCha20_8x 3949___ 3950$code.=<<___ if ($avx>2); 3951 .rva .LSEH_begin_ChaCha20_avx512 3952 .rva .LSEH_end_ChaCha20_avx512 3953 .rva .LSEH_info_ChaCha20_avx512 3954 3955 .rva .LSEH_begin_ChaCha20_avx512vl 3956 .rva .LSEH_end_ChaCha20_avx512vl 3957 .rva .LSEH_info_ChaCha20_avx512vl 3958 3959 .rva .LSEH_begin_ChaCha20_16x 3960 .rva .LSEH_end_ChaCha20_16x 3961 .rva .LSEH_info_ChaCha20_16x 3962 3963 .rva .LSEH_begin_ChaCha20_8xvl 3964 .rva .LSEH_end_ChaCha20_8xvl 3965 .rva .LSEH_info_ChaCha20_8xvl 3966___ 3967$code.=<<___; 3968.section .xdata 3969.align 8 3970.LSEH_info_ChaCha20_ctr32: 3971 .byte 9,0,0,0 3972 .rva se_handler 3973 3974.LSEH_info_ChaCha20_ssse3: 3975 .byte 9,0,0,0 3976 .rva simd_handler 3977 .rva .Lssse3_body,.Lssse3_epilogue 3978 .long 0x20,0 3979 3980.LSEH_info_ChaCha20_128: 3981 .byte 9,0,0,0 3982 .rva simd_handler 3983 .rva .L128_body,.L128_epilogue 3984 .long 0x60,0 3985 3986.LSEH_info_ChaCha20_4x: 3987 .byte 9,0,0,0 3988 .rva simd_handler 3989 .rva .L4x_body,.L4x_epilogue 3990 .long 0xa0,0 3991___ 3992$code.=<<___ if ($avx); 3993.LSEH_info_ChaCha20_4xop: 3994 .byte 9,0,0,0 3995 .rva simd_handler 3996 .rva .L4xop_body,.L4xop_epilogue # HandlerData[] 3997 .long 0xa0,0 3998___ 3999$code.=<<___ if ($avx>1); 4000.LSEH_info_ChaCha20_8x: 4001 .byte 9,0,0,0 4002 .rva simd_handler 4003 .rva .L8x_body,.L8x_epilogue # HandlerData[] 4004 .long 0xa0,0 4005___ 4006$code.=<<___ if ($avx>2); 4007.LSEH_info_ChaCha20_avx512: 4008 .byte 9,0,0,0 4009 .rva simd_handler 4010 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] 4011 .long 0x20,0 4012 4013.LSEH_info_ChaCha20_avx512vl: 4014 .byte 9,0,0,0 4015 .rva simd_handler 4016 .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] 4017 .long 0x20,0 4018 4019.LSEH_info_ChaCha20_16x: 4020 .byte 9,0,0,0 4021 .rva simd_handler 4022 .rva .L16x_body,.L16x_epilogue # HandlerData[] 4023 .long 0xa0,0 4024 4025.LSEH_info_ChaCha20_8xvl: 4026 .byte 9,0,0,0 4027 .rva simd_handler 4028 .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] 4029 .long 0xa0,0 4030___ 4031} 4032 4033foreach (split("\n",$code)) { 4034 s/\`([^\`]*)\`/eval $1/ge; 4035 4036 s/%x#%[yz]/%x/g; # "down-shift" 4037 4038 print $_,"\n"; 4039} 4040 4041close STDOUT or die "error closing STDOUT: $!"; 4042
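# Post-processing note: backtick expressions such as 32*($c0-8) inside
# $code are evaluated at generation time, and the s/%x#%[yz]/%x/
# substitution rewrites "%x#%ymmN"/"%x#%zmmN" operands as plain %xmmN
# ("down-shift"), i.e. the low 128 bits of the wider register.
#
# Usage sketch (script and output file names here are assumed, for
# illustration only):
#
#	perl chacha-x86_64.pl elf chacha-x86_64.S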