#! /usr/bin/env perl
# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements the Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. The trigger for the modification was
# the observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress the
# AVX512F capability flag [at least on Skylake-X], the conversion serves
# as a kind of "investment protection". Note that the next *lake
# processor, Cannonlake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, by 30-50% (the less so, the newer the processor),
#	and slower on contemporary ones, for example almost 2x slower on
#	Atom; as the former are naturally disappearing, SSE2 is deemed
#	unnecessary;
# (***)	strangely enough performance seems to vary from core to core;
#	the listed result is the best case;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
	$avx += 2 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");

sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}

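# The sub below is an illustrative sketch only: it is never called by this
# generator, and the name poly1305_block_ref is hypothetical.  It restates,
# with Math::BigInt, the arithmetic that poly1305_iteration (together with
# the add/adc of the input block around it) implements with 64x64->128-bit
# multiplications: h = (h + block + padbit*2^128) * r mod 2^130-5.  The
# "s1 = r1 + (r1 >> 2)" trick above is sound because 2^130 = 5 modulo
# 2^130-5 and the clamped r1 is a multiple of 4, so 5*(r1/4) = r1 + (r1>>2)
# is exact.

use Math::BigInt;

sub poly1305_block_ref {		# unused reference model
my ($h,$r,$block,$padbit) = @_;		# Math::BigInt values, $padbit is 0 or 1
my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);

	$h = $h + $block + Math::BigInt->new($padbit)->blsft(128);
	return ($h * $r) % $p;
}
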
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,\@function,3
.align	32
poly1305_init:
.cfi_startproc
	xor	%rax,%rax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key

	lea	poly1305_blocks(%rip),%r10
	lea	poly1305_emit(%rip),%r11
___
$code.=<<___	if ($avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___	if ($avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___	if ($avx>3 && !$win64);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___	if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
.cfi_endproc
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,\@function,4
.align	32
poly1305_blocks:
.cfi_startproc
	endbranch
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___
	&poly1305_iteration();
$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbp
.cfi_restore	%rbp
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	ret
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,\@function,3
.align	32
poly1305_emit:
.cfi_startproc
	endbranch
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit
___
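# Another illustrative helper (never called by the generator; the name is
# hypothetical): what poly1305_emit computes.  The "+5, then check bit 130"
# sequence above is a branchless final reduction: h+5 carries into bit 130
# exactly when h >= 2^130-5, and in that case h+5 equals h-(2^130-5) modulo
# 2^128, so the cmovs select the reduced value (one subtraction suffices for
# the range h can take here).  The nonce is then added modulo 2^128.

sub poly1305_emit_ref {			# unused reference model
my ($h,$nonce) = @_;			# Math::BigInt values
my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);

	return (($h % $p) + $nonce) % Math::BigInt->new(2)->bpow(128);
}
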
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
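# For illustration (another hypothetical helper, not used by the generator),
# the nine values contributed by a single power of the key -- one 32-bit
# column of the r[9] array above -- can be derived as follows:

sub r_row_ref {				# unused reference model
my ($r) = @_;				# Math::BigInt, one power of r, < 2^130
my @d = map { $r->copy->brsft(26*$_)->bmod(1<<26) } (0..4);

	return ($d[0], map { ($d[$_], 5*$d[$_]) } (1..4));	# r0,r1,5*r1,...,r4,5*r4
}
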
345 346my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = 347 map("%xmm$_",(0..15)); 348 349$code.=<<___; 350.type __poly1305_block,\@abi-omnipotent 351.align 32 352__poly1305_block: 353.cfi_startproc 354___ 355 &poly1305_iteration(); 356$code.=<<___; 357 ret 358.cfi_endproc 359.size __poly1305_block,.-__poly1305_block 360 361.type __poly1305_init_avx,\@abi-omnipotent 362.align 32 363__poly1305_init_avx: 364.cfi_startproc 365 mov $r0,$h0 366 mov $r1,$h1 367 xor $h2,$h2 368 369 lea 48+64($ctx),$ctx # size optimization 370 371 mov $r1,%rax 372 call __poly1305_block # r^2 373 374 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 375 mov \$0x3ffffff,%edx 376 mov $h0,$d1 377 and $h0#d,%eax 378 mov $r0,$d2 379 and $r0#d,%edx 380 mov %eax,`16*0+0-64`($ctx) 381 shr \$26,$d1 382 mov %edx,`16*0+4-64`($ctx) 383 shr \$26,$d2 384 385 mov \$0x3ffffff,%eax 386 mov \$0x3ffffff,%edx 387 and $d1#d,%eax 388 and $d2#d,%edx 389 mov %eax,`16*1+0-64`($ctx) 390 lea (%rax,%rax,4),%eax # *5 391 mov %edx,`16*1+4-64`($ctx) 392 lea (%rdx,%rdx,4),%edx # *5 393 mov %eax,`16*2+0-64`($ctx) 394 shr \$26,$d1 395 mov %edx,`16*2+4-64`($ctx) 396 shr \$26,$d2 397 398 mov $h1,%rax 399 mov $r1,%rdx 400 shl \$12,%rax 401 shl \$12,%rdx 402 or $d1,%rax 403 or $d2,%rdx 404 and \$0x3ffffff,%eax 405 and \$0x3ffffff,%edx 406 mov %eax,`16*3+0-64`($ctx) 407 lea (%rax,%rax,4),%eax # *5 408 mov %edx,`16*3+4-64`($ctx) 409 lea (%rdx,%rdx,4),%edx # *5 410 mov %eax,`16*4+0-64`($ctx) 411 mov $h1,$d1 412 mov %edx,`16*4+4-64`($ctx) 413 mov $r1,$d2 414 415 mov \$0x3ffffff,%eax 416 mov \$0x3ffffff,%edx 417 shr \$14,$d1 418 shr \$14,$d2 419 and $d1#d,%eax 420 and $d2#d,%edx 421 mov %eax,`16*5+0-64`($ctx) 422 lea (%rax,%rax,4),%eax # *5 423 mov %edx,`16*5+4-64`($ctx) 424 lea (%rdx,%rdx,4),%edx # *5 425 mov %eax,`16*6+0-64`($ctx) 426 shr \$26,$d1 427 mov %edx,`16*6+4-64`($ctx) 428 shr \$26,$d2 429 430 mov $h2,%rax 431 shl \$24,%rax 432 or %rax,$d1 433 mov $d1#d,`16*7+0-64`($ctx) 434 lea ($d1,$d1,4),$d1 # *5 435 mov $d2#d,`16*7+4-64`($ctx) 436 lea ($d2,$d2,4),$d2 # *5 437 mov $d1#d,`16*8+0-64`($ctx) 438 mov $d2#d,`16*8+4-64`($ctx) 439 440 mov $r1,%rax 441 call __poly1305_block # r^3 442 443 mov \$0x3ffffff,%eax # save r^3 base 2^26 444 mov $h0,$d1 445 and $h0#d,%eax 446 shr \$26,$d1 447 mov %eax,`16*0+12-64`($ctx) 448 449 mov \$0x3ffffff,%edx 450 and $d1#d,%edx 451 mov %edx,`16*1+12-64`($ctx) 452 lea (%rdx,%rdx,4),%edx # *5 453 shr \$26,$d1 454 mov %edx,`16*2+12-64`($ctx) 455 456 mov $h1,%rax 457 shl \$12,%rax 458 or $d1,%rax 459 and \$0x3ffffff,%eax 460 mov %eax,`16*3+12-64`($ctx) 461 lea (%rax,%rax,4),%eax # *5 462 mov $h1,$d1 463 mov %eax,`16*4+12-64`($ctx) 464 465 mov \$0x3ffffff,%edx 466 shr \$14,$d1 467 and $d1#d,%edx 468 mov %edx,`16*5+12-64`($ctx) 469 lea (%rdx,%rdx,4),%edx # *5 470 shr \$26,$d1 471 mov %edx,`16*6+12-64`($ctx) 472 473 mov $h2,%rax 474 shl \$24,%rax 475 or %rax,$d1 476 mov $d1#d,`16*7+12-64`($ctx) 477 lea ($d1,$d1,4),$d1 # *5 478 mov $d1#d,`16*8+12-64`($ctx) 479 480 mov $r1,%rax 481 call __poly1305_block # r^4 482 483 mov \$0x3ffffff,%eax # save r^4 base 2^26 484 mov $h0,$d1 485 and $h0#d,%eax 486 shr \$26,$d1 487 mov %eax,`16*0+8-64`($ctx) 488 489 mov \$0x3ffffff,%edx 490 and $d1#d,%edx 491 mov %edx,`16*1+8-64`($ctx) 492 lea (%rdx,%rdx,4),%edx # *5 493 shr \$26,$d1 494 mov %edx,`16*2+8-64`($ctx) 495 496 mov $h1,%rax 497 shl \$12,%rax 498 or $d1,%rax 499 and \$0x3ffffff,%eax 500 mov %eax,`16*3+8-64`($ctx) 501 lea (%rax,%rax,4),%eax # *5 502 mov $h1,$d1 503 mov %eax,`16*4+8-64`($ctx) 504 505 mov 
\$0x3ffffff,%edx 506 shr \$14,$d1 507 and $d1#d,%edx 508 mov %edx,`16*5+8-64`($ctx) 509 lea (%rdx,%rdx,4),%edx # *5 510 shr \$26,$d1 511 mov %edx,`16*6+8-64`($ctx) 512 513 mov $h2,%rax 514 shl \$24,%rax 515 or %rax,$d1 516 mov $d1#d,`16*7+8-64`($ctx) 517 lea ($d1,$d1,4),$d1 # *5 518 mov $d1#d,`16*8+8-64`($ctx) 519 520 lea -48-64($ctx),$ctx # size [de-]optimization 521 ret 522.cfi_endproc 523.size __poly1305_init_avx,.-__poly1305_init_avx 524 525.type poly1305_blocks_avx,\@function,4 526.align 32 527poly1305_blocks_avx: 528.cfi_startproc 529 endbranch 530 mov 20($ctx),%r8d # is_base2_26 531 cmp \$128,$len 532 jae .Lblocks_avx 533 test %r8d,%r8d 534 jz .Lblocks 535 536.Lblocks_avx: 537 and \$-16,$len 538 jz .Lno_data_avx 539 540 vzeroupper 541 542 test %r8d,%r8d 543 jz .Lbase2_64_avx 544 545 test \$31,$len 546 jz .Leven_avx 547 548 push %rbx 549.cfi_push %rbx 550 push %rbp 551.cfi_push %rbp 552 push %r12 553.cfi_push %r12 554 push %r13 555.cfi_push %r13 556 push %r14 557.cfi_push %r14 558 push %r15 559.cfi_push %r15 560.Lblocks_avx_body: 561 562 mov $len,%r15 # reassign $len 563 564 mov 0($ctx),$d1 # load hash value 565 mov 8($ctx),$d2 566 mov 16($ctx),$h2#d 567 568 mov 24($ctx),$r0 # load r 569 mov 32($ctx),$s1 570 571 ################################# base 2^26 -> base 2^64 572 mov $d1#d,$h0#d 573 and \$`-1*(1<<31)`,$d1 574 mov $d2,$r1 # borrow $r1 575 mov $d2#d,$h1#d 576 and \$`-1*(1<<31)`,$d2 577 578 shr \$6,$d1 579 shl \$52,$r1 580 add $d1,$h0 581 shr \$12,$h1 582 shr \$18,$d2 583 add $r1,$h0 584 adc $d2,$h1 585 586 mov $h2,$d1 587 shl \$40,$d1 588 shr \$24,$h2 589 add $d1,$h1 590 adc \$0,$h2 # can be partially reduced... 591 592 mov \$-4,$d2 # ... so reduce 593 mov $h2,$d1 594 and $h2,$d2 595 shr \$2,$d1 596 and \$3,$h2 597 add $d2,$d1 # =*5 598 add $d1,$h0 599 adc \$0,$h1 600 adc \$0,$h2 601 602 mov $s1,$r1 603 mov $s1,%rax 604 shr \$2,$s1 605 add $r1,$s1 # s1 = r1 + (r1 >> 2) 606 607 add 0($inp),$h0 # accumulate input 608 adc 8($inp),$h1 609 lea 16($inp),$inp 610 adc $padbit,$h2 611 612 call __poly1305_block 613 614 test $padbit,$padbit # if $padbit is zero, 615 jz .Lstore_base2_64_avx # store hash in base 2^64 format 616 617 ################################# base 2^64 -> base 2^26 618 mov $h0,%rax 619 mov $h0,%rdx 620 shr \$52,$h0 621 mov $h1,$r0 622 mov $h1,$r1 623 shr \$26,%rdx 624 and \$0x3ffffff,%rax # h[0] 625 shl \$12,$r0 626 and \$0x3ffffff,%rdx # h[1] 627 shr \$14,$h1 628 or $r0,$h0 629 shl \$24,$h2 630 and \$0x3ffffff,$h0 # h[2] 631 shr \$40,$r1 632 and \$0x3ffffff,$h1 # h[3] 633 or $r1,$h2 # h[4] 634 635 sub \$16,%r15 636 jz .Lstore_base2_26_avx 637 638 vmovd %rax#d,$H0 639 vmovd %rdx#d,$H1 640 vmovd $h0#d,$H2 641 vmovd $h1#d,$H3 642 vmovd $h2#d,$H4 643 jmp .Lproceed_avx 644 645.align 32 646.Lstore_base2_64_avx: 647 mov $h0,0($ctx) 648 mov $h1,8($ctx) 649 mov $h2,16($ctx) # note that is_base2_26 is zeroed 650 jmp .Ldone_avx 651 652.align 16 653.Lstore_base2_26_avx: 654 mov %rax#d,0($ctx) # store hash value base 2^26 655 mov %rdx#d,4($ctx) 656 mov $h0#d,8($ctx) 657 mov $h1#d,12($ctx) 658 mov $h2#d,16($ctx) 659.align 16 660.Ldone_avx: 661 mov 0(%rsp),%r15 662.cfi_restore %r15 663 mov 8(%rsp),%r14 664.cfi_restore %r14 665 mov 16(%rsp),%r13 666.cfi_restore %r13 667 mov 24(%rsp),%r12 668.cfi_restore %r12 669 mov 32(%rsp),%rbp 670.cfi_restore %rbp 671 mov 40(%rsp),%rbx 672.cfi_restore %rbx 673 lea 48(%rsp),%rsp 674.cfi_adjust_cfa_offset -48 675.Lno_data_avx: 676.Lblocks_avx_epilogue: 677 ret 678.cfi_endproc 679 680.align 32 681.Lbase2_64_avx: 682.cfi_startproc 683 push %rbx 
684.cfi_push %rbx 685 push %rbp 686.cfi_push %rbp 687 push %r12 688.cfi_push %r12 689 push %r13 690.cfi_push %r13 691 push %r14 692.cfi_push %r14 693 push %r15 694.cfi_push %r15 695.Lbase2_64_avx_body: 696 697 mov $len,%r15 # reassign $len 698 699 mov 24($ctx),$r0 # load r 700 mov 32($ctx),$s1 701 702 mov 0($ctx),$h0 # load hash value 703 mov 8($ctx),$h1 704 mov 16($ctx),$h2#d 705 706 mov $s1,$r1 707 mov $s1,%rax 708 shr \$2,$s1 709 add $r1,$s1 # s1 = r1 + (r1 >> 2) 710 711 test \$31,$len 712 jz .Linit_avx 713 714 add 0($inp),$h0 # accumulate input 715 adc 8($inp),$h1 716 lea 16($inp),$inp 717 adc $padbit,$h2 718 sub \$16,%r15 719 720 call __poly1305_block 721 722.Linit_avx: 723 ################################# base 2^64 -> base 2^26 724 mov $h0,%rax 725 mov $h0,%rdx 726 shr \$52,$h0 727 mov $h1,$d1 728 mov $h1,$d2 729 shr \$26,%rdx 730 and \$0x3ffffff,%rax # h[0] 731 shl \$12,$d1 732 and \$0x3ffffff,%rdx # h[1] 733 shr \$14,$h1 734 or $d1,$h0 735 shl \$24,$h2 736 and \$0x3ffffff,$h0 # h[2] 737 shr \$40,$d2 738 and \$0x3ffffff,$h1 # h[3] 739 or $d2,$h2 # h[4] 740 741 vmovd %rax#d,$H0 742 vmovd %rdx#d,$H1 743 vmovd $h0#d,$H2 744 vmovd $h1#d,$H3 745 vmovd $h2#d,$H4 746 movl \$1,20($ctx) # set is_base2_26 747 748 call __poly1305_init_avx 749 750.Lproceed_avx: 751 mov %r15,$len 752 753 mov 0(%rsp),%r15 754.cfi_restore %r15 755 mov 8(%rsp),%r14 756.cfi_restore %r14 757 mov 16(%rsp),%r13 758.cfi_restore %r13 759 mov 24(%rsp),%r12 760.cfi_restore %r12 761 mov 32(%rsp),%rbp 762.cfi_restore %rbp 763 mov 40(%rsp),%rbx 764.cfi_restore %rbx 765 lea 48(%rsp),%rax 766 lea 48(%rsp),%rsp 767.cfi_adjust_cfa_offset -48 768.Lbase2_64_avx_epilogue: 769 jmp .Ldo_avx 770.cfi_endproc 771 772.align 32 773.Leven_avx: 774.cfi_startproc 775 vmovd 4*0($ctx),$H0 # load hash value 776 vmovd 4*1($ctx),$H1 777 vmovd 4*2($ctx),$H2 778 vmovd 4*3($ctx),$H3 779 vmovd 4*4($ctx),$H4 780 781.Ldo_avx: 782___ 783$code.=<<___ if (!$win64); 784 lea -0x58(%rsp),%r11 785.cfi_def_cfa %r11,0x60 786 sub \$0x178,%rsp 787___ 788$code.=<<___ if ($win64); 789 lea -0xf8(%rsp),%r11 790 sub \$0x218,%rsp 791 vmovdqa %xmm6,0x50(%r11) 792 vmovdqa %xmm7,0x60(%r11) 793 vmovdqa %xmm8,0x70(%r11) 794 vmovdqa %xmm9,0x80(%r11) 795 vmovdqa %xmm10,0x90(%r11) 796 vmovdqa %xmm11,0xa0(%r11) 797 vmovdqa %xmm12,0xb0(%r11) 798 vmovdqa %xmm13,0xc0(%r11) 799 vmovdqa %xmm14,0xd0(%r11) 800 vmovdqa %xmm15,0xe0(%r11) 801.Ldo_avx_body: 802___ 803$code.=<<___; 804 sub \$64,$len 805 lea -32($inp),%rax 806 cmovc %rax,$inp 807 808 vmovdqu `16*3`($ctx),$D4 # preload r0^2 809 lea `16*3+64`($ctx),$ctx # size optimization 810 lea .Lconst(%rip),%rcx 811 812 ################################################################ 813 # load input 814 vmovdqu 16*2($inp),$T0 815 vmovdqu 16*3($inp),$T1 816 vmovdqa 64(%rcx),$MASK # .Lmask26 817 818 vpsrldq \$6,$T0,$T2 # splat input 819 vpsrldq \$6,$T1,$T3 820 vpunpckhqdq $T1,$T0,$T4 # 4 821 vpunpcklqdq $T1,$T0,$T0 # 0:1 822 vpunpcklqdq $T3,$T2,$T3 # 2:3 823 824 vpsrlq \$40,$T4,$T4 # 4 825 vpsrlq \$26,$T0,$T1 826 vpand $MASK,$T0,$T0 # 0 827 vpsrlq \$4,$T3,$T2 828 vpand $MASK,$T1,$T1 # 1 829 vpsrlq \$30,$T3,$T3 830 vpand $MASK,$T2,$T2 # 2 831 vpand $MASK,$T3,$T3 # 3 832 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 833 834 jbe .Lskip_loop_avx 835 836 # expand and copy pre-calculated table to stack 837 vmovdqu `16*1-64`($ctx),$D1 838 vmovdqu `16*2-64`($ctx),$D2 839 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 840 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 841 vmovdqa $D3,-0x90(%r11) 842 vmovdqa $D0,0x00(%rsp) 843 vpshufd \$0xEE,$D1,$D4 844 vmovdqu 
`16*3-64`($ctx),$D0 845 vpshufd \$0x44,$D1,$D1 846 vmovdqa $D4,-0x80(%r11) 847 vmovdqa $D1,0x10(%rsp) 848 vpshufd \$0xEE,$D2,$D3 849 vmovdqu `16*4-64`($ctx),$D1 850 vpshufd \$0x44,$D2,$D2 851 vmovdqa $D3,-0x70(%r11) 852 vmovdqa $D2,0x20(%rsp) 853 vpshufd \$0xEE,$D0,$D4 854 vmovdqu `16*5-64`($ctx),$D2 855 vpshufd \$0x44,$D0,$D0 856 vmovdqa $D4,-0x60(%r11) 857 vmovdqa $D0,0x30(%rsp) 858 vpshufd \$0xEE,$D1,$D3 859 vmovdqu `16*6-64`($ctx),$D0 860 vpshufd \$0x44,$D1,$D1 861 vmovdqa $D3,-0x50(%r11) 862 vmovdqa $D1,0x40(%rsp) 863 vpshufd \$0xEE,$D2,$D4 864 vmovdqu `16*7-64`($ctx),$D1 865 vpshufd \$0x44,$D2,$D2 866 vmovdqa $D4,-0x40(%r11) 867 vmovdqa $D2,0x50(%rsp) 868 vpshufd \$0xEE,$D0,$D3 869 vmovdqu `16*8-64`($ctx),$D2 870 vpshufd \$0x44,$D0,$D0 871 vmovdqa $D3,-0x30(%r11) 872 vmovdqa $D0,0x60(%rsp) 873 vpshufd \$0xEE,$D1,$D4 874 vpshufd \$0x44,$D1,$D1 875 vmovdqa $D4,-0x20(%r11) 876 vmovdqa $D1,0x70(%rsp) 877 vpshufd \$0xEE,$D2,$D3 878 vmovdqa 0x00(%rsp),$D4 # preload r0^2 879 vpshufd \$0x44,$D2,$D2 880 vmovdqa $D3,-0x10(%r11) 881 vmovdqa $D2,0x80(%rsp) 882 883 jmp .Loop_avx 884 885.align 32 886.Loop_avx: 887 ################################################################ 888 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 889 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 890 # \___________________/ 891 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 892 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 893 # \___________________/ \____________________/ 894 # 895 # Note that we start with inp[2:3]*r^2. This is because it 896 # doesn't depend on reduction in previous iteration. 897 ################################################################ 898 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 899 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 900 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 901 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 902 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 903 # 904 # though note that $Tx and $Hx are "reversed" in this section, 905 # and $D4 is preloaded with r0^2... 
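	# The 5*r_j ("s_j") factors in the formulas above come from the
	# radix: the limbs carry weights 2^0,2^26,...,2^104, and since
	# 2^130 = 5 (mod 2^130-5) any partial product whose weight reaches
	# 2^130 folds back multiplied by 5, which is exactly what the
	# precomputed 5*r_j table entries supply.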
906 907 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 908 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 909 vmovdqa $H2,0x20(%r11) # offload hash 910 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 911 vmovdqa 0x10(%rsp),$H2 # r1^2 912 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 913 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 914 915 vmovdqa $H0,0x00(%r11) # 916 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 917 vmovdqa $H1,0x10(%r11) # 918 vpmuludq $T3,$H2,$H1 # h3*r1 919 vpaddq $H0,$D0,$D0 # d0 += h4*s1 920 vpaddq $H1,$D4,$D4 # d4 += h3*r1 921 vmovdqa $H3,0x30(%r11) # 922 vpmuludq $T2,$H2,$H0 # h2*r1 923 vpmuludq $T1,$H2,$H1 # h1*r1 924 vpaddq $H0,$D3,$D3 # d3 += h2*r1 925 vmovdqa 0x30(%rsp),$H3 # r2^2 926 vpaddq $H1,$D2,$D2 # d2 += h1*r1 927 vmovdqa $H4,0x40(%r11) # 928 vpmuludq $T0,$H2,$H2 # h0*r1 929 vpmuludq $T2,$H3,$H0 # h2*r2 930 vpaddq $H2,$D1,$D1 # d1 += h0*r1 931 932 vmovdqa 0x40(%rsp),$H4 # s2^2 933 vpaddq $H0,$D4,$D4 # d4 += h2*r2 934 vpmuludq $T1,$H3,$H1 # h1*r2 935 vpmuludq $T0,$H3,$H3 # h0*r2 936 vpaddq $H1,$D3,$D3 # d3 += h1*r2 937 vmovdqa 0x50(%rsp),$H2 # r3^2 938 vpaddq $H3,$D2,$D2 # d2 += h0*r2 939 vpmuludq $T4,$H4,$H0 # h4*s2 940 vpmuludq $T3,$H4,$H4 # h3*s2 941 vpaddq $H0,$D1,$D1 # d1 += h4*s2 942 vmovdqa 0x60(%rsp),$H3 # s3^2 943 vpaddq $H4,$D0,$D0 # d0 += h3*s2 944 945 vmovdqa 0x80(%rsp),$H4 # s4^2 946 vpmuludq $T1,$H2,$H1 # h1*r3 947 vpmuludq $T0,$H2,$H2 # h0*r3 948 vpaddq $H1,$D4,$D4 # d4 += h1*r3 949 vpaddq $H2,$D3,$D3 # d3 += h0*r3 950 vpmuludq $T4,$H3,$H0 # h4*s3 951 vpmuludq $T3,$H3,$H1 # h3*s3 952 vpaddq $H0,$D2,$D2 # d2 += h4*s3 953 vmovdqu 16*0($inp),$H0 # load input 954 vpaddq $H1,$D1,$D1 # d1 += h3*s3 955 vpmuludq $T2,$H3,$H3 # h2*s3 956 vpmuludq $T2,$H4,$T2 # h2*s4 957 vpaddq $H3,$D0,$D0 # d0 += h2*s3 958 959 vmovdqu 16*1($inp),$H1 # 960 vpaddq $T2,$D1,$D1 # d1 += h2*s4 961 vpmuludq $T3,$H4,$T3 # h3*s4 962 vpmuludq $T4,$H4,$T4 # h4*s4 963 vpsrldq \$6,$H0,$H2 # splat input 964 vpaddq $T3,$D2,$D2 # d2 += h3*s4 965 vpaddq $T4,$D3,$D3 # d3 += h4*s4 966 vpsrldq \$6,$H1,$H3 # 967 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 968 vpmuludq $T1,$H4,$T0 # h1*s4 969 vpunpckhqdq $H1,$H0,$H4 # 4 970 vpaddq $T4,$D4,$D4 # d4 += h0*r4 971 vmovdqa -0x90(%r11),$T4 # r0^4 972 vpaddq $T0,$D0,$D0 # d0 += h1*s4 973 974 vpunpcklqdq $H1,$H0,$H0 # 0:1 975 vpunpcklqdq $H3,$H2,$H3 # 2:3 976 977 #vpsrlq \$40,$H4,$H4 # 4 978 vpsrldq \$`40/8`,$H4,$H4 # 4 979 vpsrlq \$26,$H0,$H1 980 vpand $MASK,$H0,$H0 # 0 981 vpsrlq \$4,$H3,$H2 982 vpand $MASK,$H1,$H1 # 1 983 vpand 0(%rcx),$H4,$H4 # .Lmask24 984 vpsrlq \$30,$H3,$H3 985 vpand $MASK,$H2,$H2 # 2 986 vpand $MASK,$H3,$H3 # 3 987 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 988 989 vpaddq 0x00(%r11),$H0,$H0 # add hash value 990 vpaddq 0x10(%r11),$H1,$H1 991 vpaddq 0x20(%r11),$H2,$H2 992 vpaddq 0x30(%r11),$H3,$H3 993 vpaddq 0x40(%r11),$H4,$H4 994 995 lea 16*2($inp),%rax 996 lea 16*4($inp),$inp 997 sub \$64,$len 998 cmovc %rax,$inp 999 1000 ################################################################ 1001 # Now we accumulate (inp[0:1]+hash)*r^4 1002 ################################################################ 1003 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1004 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1005 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1006 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1007 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1008 1009 vpmuludq $H0,$T4,$T0 # h0*r0 1010 vpmuludq $H1,$T4,$T1 # h1*r0 1011 vpaddq $T0,$D0,$D0 1012 vpaddq $T1,$D1,$D1 1013 vmovdqa -0x80(%r11),$T2 # r1^4 1014 vpmuludq $H2,$T4,$T0 # h2*r0 1015 vpmuludq $H3,$T4,$T1 # h3*r0 1016 vpaddq 
$T0,$D2,$D2 1017 vpaddq $T1,$D3,$D3 1018 vpmuludq $H4,$T4,$T4 # h4*r0 1019 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1020 vpaddq $T4,$D4,$D4 1021 1022 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1023 vpmuludq $H2,$T2,$T1 # h2*r1 1024 vpmuludq $H3,$T2,$T0 # h3*r1 1025 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1026 vmovdqa -0x60(%r11),$T3 # r2^4 1027 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1028 vpmuludq $H1,$T2,$T1 # h1*r1 1029 vpmuludq $H0,$T2,$T2 # h0*r1 1030 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1031 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1032 1033 vmovdqa -0x50(%r11),$T4 # s2^4 1034 vpmuludq $H2,$T3,$T0 # h2*r2 1035 vpmuludq $H1,$T3,$T1 # h1*r2 1036 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1037 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1038 vmovdqa -0x40(%r11),$T2 # r3^4 1039 vpmuludq $H0,$T3,$T3 # h0*r2 1040 vpmuludq $H4,$T4,$T0 # h4*s2 1041 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1042 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1043 vmovdqa -0x30(%r11),$T3 # s3^4 1044 vpmuludq $H3,$T4,$T4 # h3*s2 1045 vpmuludq $H1,$T2,$T1 # h1*r3 1046 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1047 1048 vmovdqa -0x10(%r11),$T4 # s4^4 1049 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1050 vpmuludq $H0,$T2,$T2 # h0*r3 1051 vpmuludq $H4,$T3,$T0 # h4*s3 1052 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1053 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1054 vmovdqu 16*2($inp),$T0 # load input 1055 vpmuludq $H3,$T3,$T2 # h3*s3 1056 vpmuludq $H2,$T3,$T3 # h2*s3 1057 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1058 vmovdqu 16*3($inp),$T1 # 1059 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1060 1061 vpmuludq $H2,$T4,$H2 # h2*s4 1062 vpmuludq $H3,$T4,$H3 # h3*s4 1063 vpsrldq \$6,$T0,$T2 # splat input 1064 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1065 vpmuludq $H4,$T4,$H4 # h4*s4 1066 vpsrldq \$6,$T1,$T3 # 1067 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1068 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1069 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1070 vpmuludq $H1,$T4,$H0 1071 vpunpckhqdq $T1,$T0,$T4 # 4 1072 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1073 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1074 1075 vpunpcklqdq $T1,$T0,$T0 # 0:1 1076 vpunpcklqdq $T3,$T2,$T3 # 2:3 1077 1078 #vpsrlq \$40,$T4,$T4 # 4 1079 vpsrldq \$`40/8`,$T4,$T4 # 4 1080 vpsrlq \$26,$T0,$T1 1081 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1082 vpand $MASK,$T0,$T0 # 0 1083 vpsrlq \$4,$T3,$T2 1084 vpand $MASK,$T1,$T1 # 1 1085 vpand 0(%rcx),$T4,$T4 # .Lmask24 1086 vpsrlq \$30,$T3,$T3 1087 vpand $MASK,$T2,$T2 # 2 1088 vpand $MASK,$T3,$T3 # 3 1089 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1090 1091 ################################################################ 1092 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1093 # and P. 
Schwabe 1094 1095 vpsrlq \$26,$H3,$D3 1096 vpand $MASK,$H3,$H3 1097 vpaddq $D3,$H4,$H4 # h3 -> h4 1098 1099 vpsrlq \$26,$H0,$D0 1100 vpand $MASK,$H0,$H0 1101 vpaddq $D0,$D1,$H1 # h0 -> h1 1102 1103 vpsrlq \$26,$H4,$D0 1104 vpand $MASK,$H4,$H4 1105 1106 vpsrlq \$26,$H1,$D1 1107 vpand $MASK,$H1,$H1 1108 vpaddq $D1,$H2,$H2 # h1 -> h2 1109 1110 vpaddq $D0,$H0,$H0 1111 vpsllq \$2,$D0,$D0 1112 vpaddq $D0,$H0,$H0 # h4 -> h0 1113 1114 vpsrlq \$26,$H2,$D2 1115 vpand $MASK,$H2,$H2 1116 vpaddq $D2,$H3,$H3 # h2 -> h3 1117 1118 vpsrlq \$26,$H0,$D0 1119 vpand $MASK,$H0,$H0 1120 vpaddq $D0,$H1,$H1 # h0 -> h1 1121 1122 vpsrlq \$26,$H3,$D3 1123 vpand $MASK,$H3,$H3 1124 vpaddq $D3,$H4,$H4 # h3 -> h4 1125 1126 ja .Loop_avx 1127 1128.Lskip_loop_avx: 1129 ################################################################ 1130 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1131 1132 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1133 add \$32,$len 1134 jnz .Long_tail_avx 1135 1136 vpaddq $H2,$T2,$T2 1137 vpaddq $H0,$T0,$T0 1138 vpaddq $H1,$T1,$T1 1139 vpaddq $H3,$T3,$T3 1140 vpaddq $H4,$T4,$T4 1141 1142.Long_tail_avx: 1143 vmovdqa $H2,0x20(%r11) 1144 vmovdqa $H0,0x00(%r11) 1145 vmovdqa $H1,0x10(%r11) 1146 vmovdqa $H3,0x30(%r11) 1147 vmovdqa $H4,0x40(%r11) 1148 1149 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1150 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1151 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1152 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1153 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1154 1155 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1156 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1157 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1158 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1159 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1160 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1161 1162 vpmuludq $T3,$H2,$H0 # h3*r1 1163 vpaddq $H0,$D4,$D4 # d4 += h3*r1 1164 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1165 vpmuludq $T2,$H2,$H1 # h2*r1 1166 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1167 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1168 vpmuludq $T1,$H2,$H0 # h1*r1 1169 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1170 vpmuludq $T0,$H2,$H2 # h0*r1 1171 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1172 vpmuludq $T4,$H3,$H3 # h4*s1 1173 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1174 1175 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1176 vpmuludq $T2,$H4,$H1 # h2*r2 1177 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1178 vpmuludq $T1,$H4,$H0 # h1*r2 1179 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1180 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1181 vpmuludq $T0,$H4,$H4 # h0*r2 1182 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1183 vpmuludq $T4,$H2,$H1 # h4*s2 1184 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1185 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1186 vpmuludq $T3,$H2,$H2 # h3*s2 1187 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1188 1189 vpmuludq $T1,$H3,$H0 # h1*r3 1190 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1191 vpmuludq $T0,$H3,$H3 # h0*r3 1192 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1193 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1194 vpmuludq $T4,$H4,$H1 # h4*s3 1195 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1196 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1197 vpmuludq $T3,$H4,$H0 # h3*s3 1198 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1199 vpmuludq $T2,$H4,$H4 # h2*s3 1200 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1201 1202 vpmuludq $T0,$H2,$H2 # h0*r4 1203 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1204 vpmuludq $T4,$H3,$H1 # h4*s4 1205 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1206 vpmuludq $T3,$H3,$H0 # h3*s4 1207 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1208 vpmuludq $T2,$H3,$H1 # h2*s4 1209 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1210 vpmuludq $T1,$H3,$H3 # h1*s4 1211 vpaddq 
$H3,$D0,$D0 # h0 = d0 + h1*s4 1212 1213 jz .Lshort_tail_avx 1214 1215 vmovdqu 16*0($inp),$H0 # load input 1216 vmovdqu 16*1($inp),$H1 1217 1218 vpsrldq \$6,$H0,$H2 # splat input 1219 vpsrldq \$6,$H1,$H3 1220 vpunpckhqdq $H1,$H0,$H4 # 4 1221 vpunpcklqdq $H1,$H0,$H0 # 0:1 1222 vpunpcklqdq $H3,$H2,$H3 # 2:3 1223 1224 vpsrlq \$40,$H4,$H4 # 4 1225 vpsrlq \$26,$H0,$H1 1226 vpand $MASK,$H0,$H0 # 0 1227 vpsrlq \$4,$H3,$H2 1228 vpand $MASK,$H1,$H1 # 1 1229 vpsrlq \$30,$H3,$H3 1230 vpand $MASK,$H2,$H2 # 2 1231 vpand $MASK,$H3,$H3 # 3 1232 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1233 1234 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1235 vpaddq 0x00(%r11),$H0,$H0 1236 vpaddq 0x10(%r11),$H1,$H1 1237 vpaddq 0x20(%r11),$H2,$H2 1238 vpaddq 0x30(%r11),$H3,$H3 1239 vpaddq 0x40(%r11),$H4,$H4 1240 1241 ################################################################ 1242 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1243 1244 vpmuludq $H0,$T4,$T0 # h0*r0 1245 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1246 vpmuludq $H1,$T4,$T1 # h1*r0 1247 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1248 vpmuludq $H2,$T4,$T0 # h2*r0 1249 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1250 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1251 vpmuludq $H3,$T4,$T1 # h3*r0 1252 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1253 vpmuludq $H4,$T4,$T4 # h4*r0 1254 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1255 1256 vpmuludq $H3,$T2,$T0 # h3*r1 1257 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1258 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1259 vpmuludq $H2,$T2,$T1 # h2*r1 1260 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1261 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1262 vpmuludq $H1,$T2,$T0 # h1*r1 1263 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1264 vpmuludq $H0,$T2,$T2 # h0*r1 1265 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1266 vpmuludq $H4,$T3,$T3 # h4*s1 1267 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1268 1269 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1270 vpmuludq $H2,$T4,$T1 # h2*r2 1271 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1272 vpmuludq $H1,$T4,$T0 # h1*r2 1273 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1274 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1275 vpmuludq $H0,$T4,$T4 # h0*r2 1276 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1277 vpmuludq $H4,$T2,$T1 # h4*s2 1278 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1279 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1280 vpmuludq $H3,$T2,$T2 # h3*s2 1281 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1282 1283 vpmuludq $H1,$T3,$T0 # h1*r3 1284 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1285 vpmuludq $H0,$T3,$T3 # h0*r3 1286 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1287 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1288 vpmuludq $H4,$T4,$T1 # h4*s3 1289 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1290 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1291 vpmuludq $H3,$T4,$T0 # h3*s3 1292 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1293 vpmuludq $H2,$T4,$T4 # h2*s3 1294 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1295 1296 vpmuludq $H0,$T2,$T2 # h0*r4 1297 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1298 vpmuludq $H4,$T3,$T1 # h4*s4 1299 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1300 vpmuludq $H3,$T3,$T0 # h3*s4 1301 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1302 vpmuludq $H2,$T3,$T1 # h2*s4 1303 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1304 vpmuludq $H1,$T3,$T3 # h1*s4 1305 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1306 1307.Lshort_tail_avx: 1308 ################################################################ 1309 # horizontal addition 1310 1311 vpsrldq \$8,$D4,$T4 1312 vpsrldq \$8,$D3,$T3 1313 vpsrldq \$8,$D1,$T1 1314 vpsrldq \$8,$D0,$T0 1315 vpsrldq \$8,$D2,$T2 1316 vpaddq $T3,$D3,$D3 1317 vpaddq $T4,$D4,$D4 1318 vpaddq $T0,$D0,$D0 1319 vpaddq $T1,$D1,$D1 1320 vpaddq $T2,$D2,$D2 1321 1322 
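	# In the lazy reduction below each limb may exceed 26 bits; carries
	# propagate to the next limb, and the carry out of the top limb
	# re-enters at the bottom multiplied by 5 (2^130 = 5 mod 2^130-5),
	# computed branchlessly as c + 4*c via the add/shift-by-2 pair.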
################################################################ 1323 # lazy reduction 1324 1325 vpsrlq \$26,$D3,$H3 1326 vpand $MASK,$D3,$D3 1327 vpaddq $H3,$D4,$D4 # h3 -> h4 1328 1329 vpsrlq \$26,$D0,$H0 1330 vpand $MASK,$D0,$D0 1331 vpaddq $H0,$D1,$D1 # h0 -> h1 1332 1333 vpsrlq \$26,$D4,$H4 1334 vpand $MASK,$D4,$D4 1335 1336 vpsrlq \$26,$D1,$H1 1337 vpand $MASK,$D1,$D1 1338 vpaddq $H1,$D2,$D2 # h1 -> h2 1339 1340 vpaddq $H4,$D0,$D0 1341 vpsllq \$2,$H4,$H4 1342 vpaddq $H4,$D0,$D0 # h4 -> h0 1343 1344 vpsrlq \$26,$D2,$H2 1345 vpand $MASK,$D2,$D2 1346 vpaddq $H2,$D3,$D3 # h2 -> h3 1347 1348 vpsrlq \$26,$D0,$H0 1349 vpand $MASK,$D0,$D0 1350 vpaddq $H0,$D1,$D1 # h0 -> h1 1351 1352 vpsrlq \$26,$D3,$H3 1353 vpand $MASK,$D3,$D3 1354 vpaddq $H3,$D4,$D4 # h3 -> h4 1355 1356 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1357 vmovd $D1,`4*1-48-64`($ctx) 1358 vmovd $D2,`4*2-48-64`($ctx) 1359 vmovd $D3,`4*3-48-64`($ctx) 1360 vmovd $D4,`4*4-48-64`($ctx) 1361___ 1362$code.=<<___ if ($win64); 1363 vmovdqa 0x50(%r11),%xmm6 1364 vmovdqa 0x60(%r11),%xmm7 1365 vmovdqa 0x70(%r11),%xmm8 1366 vmovdqa 0x80(%r11),%xmm9 1367 vmovdqa 0x90(%r11),%xmm10 1368 vmovdqa 0xa0(%r11),%xmm11 1369 vmovdqa 0xb0(%r11),%xmm12 1370 vmovdqa 0xc0(%r11),%xmm13 1371 vmovdqa 0xd0(%r11),%xmm14 1372 vmovdqa 0xe0(%r11),%xmm15 1373 lea 0xf8(%r11),%rsp 1374.Ldo_avx_epilogue: 1375___ 1376$code.=<<___ if (!$win64); 1377 lea 0x58(%r11),%rsp 1378.cfi_def_cfa %rsp,8 1379___ 1380$code.=<<___; 1381 vzeroupper 1382 ret 1383.cfi_endproc 1384.size poly1305_blocks_avx,.-poly1305_blocks_avx 1385 1386.type poly1305_emit_avx,\@function,3 1387.align 32 1388poly1305_emit_avx: 1389.cfi_startproc 1390 endbranch 1391 cmpl \$0,20($ctx) # is_base2_26? 1392 je .Lemit 1393 1394 mov 0($ctx),%eax # load hash value base 2^26 1395 mov 4($ctx),%ecx 1396 mov 8($ctx),%r8d 1397 mov 12($ctx),%r11d 1398 mov 16($ctx),%r10d 1399 1400 shl \$26,%rcx # base 2^26 -> base 2^64 1401 mov %r8,%r9 1402 shl \$52,%r8 1403 add %rcx,%rax 1404 shr \$12,%r9 1405 add %rax,%r8 # h0 1406 adc \$0,%r9 1407 1408 shl \$14,%r11 1409 mov %r10,%rax 1410 shr \$24,%r10 1411 add %r11,%r9 1412 shl \$40,%rax 1413 add %rax,%r9 # h1 1414 adc \$0,%r10 # h2 1415 1416 mov %r10,%rax # could be partially reduced, so reduce 1417 mov %r10,%rcx 1418 and \$3,%r10 1419 shr \$2,%rax 1420 and \$-4,%rcx 1421 add %rcx,%rax 1422 add %rax,%r8 1423 adc \$0,%r9 1424 adc \$0,%r10 1425 1426 mov %r8,%rax 1427 add \$5,%r8 # compare to modulus 1428 mov %r9,%rcx 1429 adc \$0,%r9 1430 adc \$0,%r10 1431 shr \$2,%r10 # did 130-bit value overflow? 
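	# if h+5 carried into bit 130 then h >= 2^130-5, and h+5 is the
	# reduced value modulo 2^128, so the cmovs below pick it up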
1432 cmovnz %r8,%rax 1433 cmovnz %r9,%rcx 1434 1435 add 0($nonce),%rax # accumulate nonce 1436 adc 8($nonce),%rcx 1437 mov %rax,0($mac) # write result 1438 mov %rcx,8($mac) 1439 1440 ret 1441.cfi_endproc 1442.size poly1305_emit_avx,.-poly1305_emit_avx 1443___ 1444 1445if ($avx>1) { 1446my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1447 map("%ymm$_",(0..15)); 1448my $S4=$MASK; 1449 1450$code.=<<___; 1451.type poly1305_blocks_avx2,\@function,4 1452.align 32 1453poly1305_blocks_avx2: 1454.cfi_startproc 1455 endbranch 1456 mov 20($ctx),%r8d # is_base2_26 1457 cmp \$128,$len 1458 jae .Lblocks_avx2 1459 test %r8d,%r8d 1460 jz .Lblocks 1461 1462.Lblocks_avx2: 1463 and \$-16,$len 1464 jz .Lno_data_avx2 1465 1466 vzeroupper 1467 1468 test %r8d,%r8d 1469 jz .Lbase2_64_avx2 1470 1471 test \$63,$len 1472 jz .Leven_avx2 1473 1474 push %rbx 1475.cfi_push %rbx 1476 push %rbp 1477.cfi_push %rbp 1478 push %r12 1479.cfi_push %r12 1480 push %r13 1481.cfi_push %r13 1482 push %r14 1483.cfi_push %r14 1484 push %r15 1485.cfi_push %r15 1486.Lblocks_avx2_body: 1487 1488 mov $len,%r15 # reassign $len 1489 1490 mov 0($ctx),$d1 # load hash value 1491 mov 8($ctx),$d2 1492 mov 16($ctx),$h2#d 1493 1494 mov 24($ctx),$r0 # load r 1495 mov 32($ctx),$s1 1496 1497 ################################# base 2^26 -> base 2^64 1498 mov $d1#d,$h0#d 1499 and \$`-1*(1<<31)`,$d1 1500 mov $d2,$r1 # borrow $r1 1501 mov $d2#d,$h1#d 1502 and \$`-1*(1<<31)`,$d2 1503 1504 shr \$6,$d1 1505 shl \$52,$r1 1506 add $d1,$h0 1507 shr \$12,$h1 1508 shr \$18,$d2 1509 add $r1,$h0 1510 adc $d2,$h1 1511 1512 mov $h2,$d1 1513 shl \$40,$d1 1514 shr \$24,$h2 1515 add $d1,$h1 1516 adc \$0,$h2 # can be partially reduced... 1517 1518 mov \$-4,$d2 # ... so reduce 1519 mov $h2,$d1 1520 and $h2,$d2 1521 shr \$2,$d1 1522 and \$3,$h2 1523 add $d2,$d1 # =*5 1524 add $d1,$h0 1525 adc \$0,$h1 1526 adc \$0,$h2 1527 1528 mov $s1,$r1 1529 mov $s1,%rax 1530 shr \$2,$s1 1531 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1532 1533.Lbase2_26_pre_avx2: 1534 add 0($inp),$h0 # accumulate input 1535 adc 8($inp),$h1 1536 lea 16($inp),$inp 1537 adc $padbit,$h2 1538 sub \$16,%r15 1539 1540 call __poly1305_block 1541 mov $r1,%rax 1542 1543 test \$63,%r15 1544 jnz .Lbase2_26_pre_avx2 1545 1546 test $padbit,$padbit # if $padbit is zero, 1547 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format 1548 1549 ################################# base 2^64 -> base 2^26 1550 mov $h0,%rax 1551 mov $h0,%rdx 1552 shr \$52,$h0 1553 mov $h1,$r0 1554 mov $h1,$r1 1555 shr \$26,%rdx 1556 and \$0x3ffffff,%rax # h[0] 1557 shl \$12,$r0 1558 and \$0x3ffffff,%rdx # h[1] 1559 shr \$14,$h1 1560 or $r0,$h0 1561 shl \$24,$h2 1562 and \$0x3ffffff,$h0 # h[2] 1563 shr \$40,$r1 1564 and \$0x3ffffff,$h1 # h[3] 1565 or $r1,$h2 # h[4] 1566 1567 test %r15,%r15 1568 jz .Lstore_base2_26_avx2 1569 1570 vmovd %rax#d,%x#$H0 1571 vmovd %rdx#d,%x#$H1 1572 vmovd $h0#d,%x#$H2 1573 vmovd $h1#d,%x#$H3 1574 vmovd $h2#d,%x#$H4 1575 jmp .Lproceed_avx2 1576 1577.align 32 1578.Lstore_base2_64_avx2: 1579 mov $h0,0($ctx) 1580 mov $h1,8($ctx) 1581 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1582 jmp .Ldone_avx2 1583 1584.align 16 1585.Lstore_base2_26_avx2: 1586 mov %rax#d,0($ctx) # store hash value base 2^26 1587 mov %rdx#d,4($ctx) 1588 mov $h0#d,8($ctx) 1589 mov $h1#d,12($ctx) 1590 mov $h2#d,16($ctx) 1591.align 16 1592.Ldone_avx2: 1593 mov 0(%rsp),%r15 1594.cfi_restore %r15 1595 mov 8(%rsp),%r14 1596.cfi_restore %r14 1597 mov 16(%rsp),%r13 1598.cfi_restore %r13 1599 mov 24(%rsp),%r12 1600.cfi_restore %r12 
1601 mov 32(%rsp),%rbp 1602.cfi_restore %rbp 1603 mov 40(%rsp),%rbx 1604.cfi_restore %rbx 1605 lea 48(%rsp),%rsp 1606.cfi_adjust_cfa_offset -48 1607.Lno_data_avx2: 1608.Lblocks_avx2_epilogue: 1609 ret 1610.cfi_endproc 1611 1612.align 32 1613.Lbase2_64_avx2: 1614.cfi_startproc 1615 push %rbx 1616.cfi_push %rbx 1617 push %rbp 1618.cfi_push %rbp 1619 push %r12 1620.cfi_push %r12 1621 push %r13 1622.cfi_push %r13 1623 push %r14 1624.cfi_push %r14 1625 push %r15 1626.cfi_push %r15 1627.Lbase2_64_avx2_body: 1628 1629 mov $len,%r15 # reassign $len 1630 1631 mov 24($ctx),$r0 # load r 1632 mov 32($ctx),$s1 1633 1634 mov 0($ctx),$h0 # load hash value 1635 mov 8($ctx),$h1 1636 mov 16($ctx),$h2#d 1637 1638 mov $s1,$r1 1639 mov $s1,%rax 1640 shr \$2,$s1 1641 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1642 1643 test \$63,$len 1644 jz .Linit_avx2 1645 1646.Lbase2_64_pre_avx2: 1647 add 0($inp),$h0 # accumulate input 1648 adc 8($inp),$h1 1649 lea 16($inp),$inp 1650 adc $padbit,$h2 1651 sub \$16,%r15 1652 1653 call __poly1305_block 1654 mov $r1,%rax 1655 1656 test \$63,%r15 1657 jnz .Lbase2_64_pre_avx2 1658 1659.Linit_avx2: 1660 ################################# base 2^64 -> base 2^26 1661 mov $h0,%rax 1662 mov $h0,%rdx 1663 shr \$52,$h0 1664 mov $h1,$d1 1665 mov $h1,$d2 1666 shr \$26,%rdx 1667 and \$0x3ffffff,%rax # h[0] 1668 shl \$12,$d1 1669 and \$0x3ffffff,%rdx # h[1] 1670 shr \$14,$h1 1671 or $d1,$h0 1672 shl \$24,$h2 1673 and \$0x3ffffff,$h0 # h[2] 1674 shr \$40,$d2 1675 and \$0x3ffffff,$h1 # h[3] 1676 or $d2,$h2 # h[4] 1677 1678 vmovd %rax#d,%x#$H0 1679 vmovd %rdx#d,%x#$H1 1680 vmovd $h0#d,%x#$H2 1681 vmovd $h1#d,%x#$H3 1682 vmovd $h2#d,%x#$H4 1683 movl \$1,20($ctx) # set is_base2_26 1684 1685 call __poly1305_init_avx 1686 1687.Lproceed_avx2: 1688 mov %r15,$len # restore $len 1689 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1690 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1691 1692 mov 0(%rsp),%r15 1693.cfi_restore %r15 1694 mov 8(%rsp),%r14 1695.cfi_restore %r14 1696 mov 16(%rsp),%r13 1697.cfi_restore %r13 1698 mov 24(%rsp),%r12 1699.cfi_restore %r12 1700 mov 32(%rsp),%rbp 1701.cfi_restore %rbp 1702 mov 40(%rsp),%rbx 1703.cfi_restore %rbx 1704 lea 48(%rsp),%rax 1705 lea 48(%rsp),%rsp 1706.cfi_adjust_cfa_offset -48 1707.Lbase2_64_avx2_epilogue: 1708 jmp .Ldo_avx2 1709.cfi_endproc 1710 1711.align 32 1712.Leven_avx2: 1713.cfi_startproc 1714 mov OPENSSL_ia32cap_P+8(%rip),%r10d 1715 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1716 vmovd 4*1($ctx),%x#$H1 1717 vmovd 4*2($ctx),%x#$H2 1718 vmovd 4*3($ctx),%x#$H3 1719 vmovd 4*4($ctx),%x#$H4 1720 1721.Ldo_avx2: 1722___ 1723$code.=<<___ if ($avx>2); 1724 cmp \$512,$len 1725 jb .Lskip_avx512 1726 and %r11d,%r10d 1727 test \$`1<<16`,%r10d # check for AVX512F 1728 jnz .Lblocks_avx512 1729.Lskip_avx512: 1730___ 1731$code.=<<___ if (!$win64); 1732 lea -8(%rsp),%r11 1733.cfi_def_cfa %r11,16 1734 sub \$0x128,%rsp 1735___ 1736$code.=<<___ if ($win64); 1737 lea -0xf8(%rsp),%r11 1738 sub \$0x1c8,%rsp 1739 vmovdqa %xmm6,0x50(%r11) 1740 vmovdqa %xmm7,0x60(%r11) 1741 vmovdqa %xmm8,0x70(%r11) 1742 vmovdqa %xmm9,0x80(%r11) 1743 vmovdqa %xmm10,0x90(%r11) 1744 vmovdqa %xmm11,0xa0(%r11) 1745 vmovdqa %xmm12,0xb0(%r11) 1746 vmovdqa %xmm13,0xc0(%r11) 1747 vmovdqa %xmm14,0xd0(%r11) 1748 vmovdqa %xmm15,0xe0(%r11) 1749.Ldo_avx2_body: 1750___ 1751$code.=<<___; 1752 lea .Lconst(%rip),%rcx 1753 lea 48+64($ctx),$ctx # size optimization 1754 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1755 1756 # expand and copy pre-calculated table to stack 1757 vmovdqu `16*0-64`($ctx),%x#$T2 1758 and \$-512,%rsp 1759 vmovdqu 
`16*1-64`($ctx),%x#$T3 1760 vmovdqu `16*2-64`($ctx),%x#$T4 1761 vmovdqu `16*3-64`($ctx),%x#$D0 1762 vmovdqu `16*4-64`($ctx),%x#$D1 1763 vmovdqu `16*5-64`($ctx),%x#$D2 1764 lea 0x90(%rsp),%rax # size optimization 1765 vmovdqu `16*6-64`($ctx),%x#$D3 1766 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1767 vmovdqu `16*7-64`($ctx),%x#$D4 1768 vpermd $T3,$T0,$T3 1769 vmovdqu `16*8-64`($ctx),%x#$MASK 1770 vpermd $T4,$T0,$T4 1771 vmovdqa $T2,0x00(%rsp) 1772 vpermd $D0,$T0,$D0 1773 vmovdqa $T3,0x20-0x90(%rax) 1774 vpermd $D1,$T0,$D1 1775 vmovdqa $T4,0x40-0x90(%rax) 1776 vpermd $D2,$T0,$D2 1777 vmovdqa $D0,0x60-0x90(%rax) 1778 vpermd $D3,$T0,$D3 1779 vmovdqa $D1,0x80-0x90(%rax) 1780 vpermd $D4,$T0,$D4 1781 vmovdqa $D2,0xa0-0x90(%rax) 1782 vpermd $MASK,$T0,$MASK 1783 vmovdqa $D3,0xc0-0x90(%rax) 1784 vmovdqa $D4,0xe0-0x90(%rax) 1785 vmovdqa $MASK,0x100-0x90(%rax) 1786 vmovdqa 64(%rcx),$MASK # .Lmask26 1787 1788 ################################################################ 1789 # load input 1790 vmovdqu 16*0($inp),%x#$T0 1791 vmovdqu 16*1($inp),%x#$T1 1792 vinserti128 \$1,16*2($inp),$T0,$T0 1793 vinserti128 \$1,16*3($inp),$T1,$T1 1794 lea 16*4($inp),$inp 1795 1796 vpsrldq \$6,$T0,$T2 # splat input 1797 vpsrldq \$6,$T1,$T3 1798 vpunpckhqdq $T1,$T0,$T4 # 4 1799 vpunpcklqdq $T3,$T2,$T2 # 2:3 1800 vpunpcklqdq $T1,$T0,$T0 # 0:1 1801 1802 vpsrlq \$30,$T2,$T3 1803 vpsrlq \$4,$T2,$T2 1804 vpsrlq \$26,$T0,$T1 1805 vpsrlq \$40,$T4,$T4 # 4 1806 vpand $MASK,$T2,$T2 # 2 1807 vpand $MASK,$T0,$T0 # 0 1808 vpand $MASK,$T1,$T1 # 1 1809 vpand $MASK,$T3,$T3 # 3 1810 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1811 1812 vpaddq $H2,$T2,$H2 # accumulate input 1813 sub \$64,$len 1814 jz .Ltail_avx2 1815 jmp .Loop_avx2 1816 1817.align 32 1818.Loop_avx2: 1819 ################################################################ 1820 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1821 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1822 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1823 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1824 # \________/\__________/ 1825 ################################################################ 1826 #vpaddq $H2,$T2,$H2 # accumulate input 1827 vpaddq $H0,$T0,$H0 1828 vmovdqa `32*0`(%rsp),$T0 # r0^4 1829 vpaddq $H1,$T1,$H1 1830 vmovdqa `32*1`(%rsp),$T1 # r1^4 1831 vpaddq $H3,$T3,$H3 1832 vmovdqa `32*3`(%rsp),$T2 # r2^4 1833 vpaddq $H4,$T4,$H4 1834 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1835 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1836 1837 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1838 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1839 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1840 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1841 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1842 # 1843 # however, as h2 is "chronologically" first one available pull 1844 # corresponding operations up, so it's 1845 # 1846 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1847 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1848 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1849 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1850 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1851 1852 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1853 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1854 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1855 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1856 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1857 1858 vpmuludq $H0,$T1,$T4 # h0*r1 1859 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1860 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1861 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1862 vpmuludq $H3,$T1,$T4 # h3*r1 1863 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1864 
vpaddq $T4,$D4,$D4 # d4 += h3*r1 1865 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1866 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1867 1868 vpmuludq $H0,$T0,$T4 # h0*r0 1869 vpmuludq $H1,$T0,$H2 # h1*r0 1870 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1871 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1872 vpmuludq $H3,$T0,$T4 # h3*r0 1873 vpmuludq $H4,$T0,$H2 # h4*r0 1874 vmovdqu 16*0($inp),%x#$T0 # load input 1875 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1876 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1877 vinserti128 \$1,16*2($inp),$T0,$T0 1878 1879 vpmuludq $H3,$T1,$T4 # h3*s2 1880 vpmuludq $H4,$T1,$H2 # h4*s2 1881 vmovdqu 16*1($inp),%x#$T1 1882 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1883 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1884 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1885 vpmuludq $H1,$T2,$T4 # h1*r2 1886 vpmuludq $H0,$T2,$T2 # h0*r2 1887 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1888 vpaddq $T2,$D2,$D2 # d2 += h0*r2 1889 vinserti128 \$1,16*3($inp),$T1,$T1 1890 lea 16*4($inp),$inp 1891 1892 vpmuludq $H1,$H2,$T4 # h1*r3 1893 vpmuludq $H0,$H2,$H2 # h0*r3 1894 vpsrldq \$6,$T0,$T2 # splat input 1895 vpaddq $T4,$D4,$D4 # d4 += h1*r3 1896 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1897 vpmuludq $H3,$T3,$T4 # h3*s3 1898 vpmuludq $H4,$T3,$H2 # h4*s3 1899 vpsrldq \$6,$T1,$T3 1900 vpaddq $T4,$D1,$D1 # d1 += h3*s3 1901 vpaddq $H2,$D2,$D2 # d2 += h4*s3 1902 vpunpckhqdq $T1,$T0,$T4 # 4 1903 1904 vpmuludq $H3,$S4,$H3 # h3*s4 1905 vpmuludq $H4,$S4,$H4 # h4*s4 1906 vpunpcklqdq $T1,$T0,$T0 # 0:1 1907 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1908 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1909 vpunpcklqdq $T3,$T2,$T3 # 2:3 1910 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 1911 vpmuludq $H1,$S4,$H0 # h1*s4 1912 vmovdqa 64(%rcx),$MASK # .Lmask26 1913 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1914 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1915 1916 ################################################################ 1917 # lazy reduction (interleaved with tail of input splat) 1918 1919 vpsrlq \$26,$H3,$D3 1920 vpand $MASK,$H3,$H3 1921 vpaddq $D3,$H4,$H4 # h3 -> h4 1922 1923 vpsrlq \$26,$H0,$D0 1924 vpand $MASK,$H0,$H0 1925 vpaddq $D0,$D1,$H1 # h0 -> h1 1926 1927 vpsrlq \$26,$H4,$D4 1928 vpand $MASK,$H4,$H4 1929 1930 vpsrlq \$4,$T3,$T2 1931 1932 vpsrlq \$26,$H1,$D1 1933 vpand $MASK,$H1,$H1 1934 vpaddq $D1,$H2,$H2 # h1 -> h2 1935 1936 vpaddq $D4,$H0,$H0 1937 vpsllq \$2,$D4,$D4 1938 vpaddq $D4,$H0,$H0 # h4 -> h0 1939 1940 vpand $MASK,$T2,$T2 # 2 1941 vpsrlq \$26,$T0,$T1 1942 1943 vpsrlq \$26,$H2,$D2 1944 vpand $MASK,$H2,$H2 1945 vpaddq $D2,$H3,$H3 # h2 -> h3 1946 1947 vpaddq $T2,$H2,$H2 # modulo-scheduled 1948 vpsrlq \$30,$T3,$T3 1949 1950 vpsrlq \$26,$H0,$D0 1951 vpand $MASK,$H0,$H0 1952 vpaddq $D0,$H1,$H1 # h0 -> h1 1953 1954 vpsrlq \$40,$T4,$T4 # 4 1955 1956 vpsrlq \$26,$H3,$D3 1957 vpand $MASK,$H3,$H3 1958 vpaddq $D3,$H4,$H4 # h3 -> h4 1959 1960 vpand $MASK,$T0,$T0 # 0 1961 vpand $MASK,$T1,$T1 # 1 1962 vpand $MASK,$T3,$T3 # 3 1963 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1964 1965 sub \$64,$len 1966 jnz .Loop_avx2 1967 1968 .byte 0x66,0x90 1969.Ltail_avx2: 1970 ################################################################ 1971 # while above multiplications were by r^4 in all lanes, in last 1972 # iteration we multiply least significant lane by r^4 and most 1973 # significant one by r, so copy of above except that references 1974 # to the precomputed table are displaced by 4... 
1975 1976 #vpaddq $H2,$T2,$H2 # accumulate input 1977 vpaddq $H0,$T0,$H0 1978 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 1979 vpaddq $H1,$T1,$H1 1980 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 1981 vpaddq $H3,$T3,$H3 1982 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 1983 vpaddq $H4,$T4,$H4 1984 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 1985 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 1986 1987 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1988 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1989 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1990 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1991 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1992 1993 vpmuludq $H0,$T1,$T4 # h0*r1 1994 vpmuludq $H1,$T1,$H2 # h1*r1 1995 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1996 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1997 vpmuludq $H3,$T1,$T4 # h3*r1 1998 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 1999 vpaddq $T4,$D4,$D4 # d4 += h3*r1 2000 vpaddq $H2,$D0,$D0 # d0 += h4*s1 2001 2002 vpmuludq $H0,$T0,$T4 # h0*r0 2003 vpmuludq $H1,$T0,$H2 # h1*r0 2004 vpaddq $T4,$D0,$D0 # d0 += h0*r0 2005 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 2006 vpaddq $H2,$D1,$D1 # d1 += h1*r0 2007 vpmuludq $H3,$T0,$T4 # h3*r0 2008 vpmuludq $H4,$T0,$H2 # h4*r0 2009 vpaddq $T4,$D3,$D3 # d3 += h3*r0 2010 vpaddq $H2,$D4,$D4 # d4 += h4*r0 2011 2012 vpmuludq $H3,$T1,$T4 # h3*s2 2013 vpmuludq $H4,$T1,$H2 # h4*s2 2014 vpaddq $T4,$D0,$D0 # d0 += h3*s2 2015 vpaddq $H2,$D1,$D1 # d1 += h4*s2 2016 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2017 vpmuludq $H1,$T2,$T4 # h1*r2 2018 vpmuludq $H0,$T2,$T2 # h0*r2 2019 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2020 vpaddq $T2,$D2,$D2 # d2 += h0*r2 2021 2022 vpmuludq $H1,$H2,$T4 # h1*r3 2023 vpmuludq $H0,$H2,$H2 # h0*r3 2024 vpaddq $T4,$D4,$D4 # d4 += h1*r3 2025 vpaddq $H2,$D3,$D3 # d3 += h0*r3 2026 vpmuludq $H3,$T3,$T4 # h3*s3 2027 vpmuludq $H4,$T3,$H2 # h4*s3 2028 vpaddq $T4,$D1,$D1 # d1 += h3*s3 2029 vpaddq $H2,$D2,$D2 # d2 += h4*s3 2030 2031 vpmuludq $H3,$S4,$H3 # h3*s4 2032 vpmuludq $H4,$S4,$H4 # h4*s4 2033 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2034 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2035 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2036 vpmuludq $H1,$S4,$H0 # h1*s4 2037 vmovdqa 64(%rcx),$MASK # .Lmask26 2038 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2039 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2040 2041 ################################################################ 2042 # horizontal addition 2043 2044 vpsrldq \$8,$D1,$T1 2045 vpsrldq \$8,$H2,$T2 2046 vpsrldq \$8,$H3,$T3 2047 vpsrldq \$8,$H4,$T4 2048 vpsrldq \$8,$H0,$T0 2049 vpaddq $T1,$D1,$D1 2050 vpaddq $T2,$H2,$H2 2051 vpaddq $T3,$H3,$H3 2052 vpaddq $T4,$H4,$H4 2053 vpaddq $T0,$H0,$H0 2054 2055 vpermq \$0x2,$H3,$T3 2056 vpermq \$0x2,$H4,$T4 2057 vpermq \$0x2,$H0,$T0 2058 vpermq \$0x2,$D1,$T1 2059 vpermq \$0x2,$H2,$T2 2060 vpaddq $T3,$H3,$H3 2061 vpaddq $T4,$H4,$H4 2062 vpaddq $T0,$H0,$H0 2063 vpaddq $T1,$D1,$D1 2064 vpaddq $T2,$H2,$H2 2065 2066 ################################################################ 2067 # lazy reduction 2068 2069 vpsrlq \$26,$H3,$D3 2070 vpand $MASK,$H3,$H3 2071 vpaddq $D3,$H4,$H4 # h3 -> h4 2072 2073 vpsrlq \$26,$H0,$D0 2074 vpand $MASK,$H0,$H0 2075 vpaddq $D0,$D1,$H1 # h0 -> h1 2076 2077 vpsrlq \$26,$H4,$D4 2078 vpand $MASK,$H4,$H4 2079 2080 vpsrlq \$26,$H1,$D1 2081 vpand $MASK,$H1,$H1 2082 vpaddq $D1,$H2,$H2 # h1 -> h2 2083 2084 vpaddq $D4,$H0,$H0 2085 vpsllq \$2,$D4,$D4 2086 vpaddq $D4,$H0,$H0 # h4 -> h0 2087 2088 vpsrlq \$26,$H2,$D2 2089 vpand $MASK,$H2,$H2 2090 vpaddq $D2,$H3,$H3 # h2 -> h3 2091 2092 vpsrlq \$26,$H0,$D0 2093 vpand $MASK,$H0,$H0 2094 vpaddq $D0,$H1,$H1 # h0 -> h1 2095 2096 vpsrlq \$26,$H3,$D3 2097 vpand $MASK,$H3,$H3 2098 vpaddq 
$D3,$H4,$H4 # h3 -> h4 2099 2100 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2101 vmovd %x#$H1,`4*1-48-64`($ctx) 2102 vmovd %x#$H2,`4*2-48-64`($ctx) 2103 vmovd %x#$H3,`4*3-48-64`($ctx) 2104 vmovd %x#$H4,`4*4-48-64`($ctx) 2105___ 2106$code.=<<___ if ($win64); 2107 vmovdqa 0x50(%r11),%xmm6 2108 vmovdqa 0x60(%r11),%xmm7 2109 vmovdqa 0x70(%r11),%xmm8 2110 vmovdqa 0x80(%r11),%xmm9 2111 vmovdqa 0x90(%r11),%xmm10 2112 vmovdqa 0xa0(%r11),%xmm11 2113 vmovdqa 0xb0(%r11),%xmm12 2114 vmovdqa 0xc0(%r11),%xmm13 2115 vmovdqa 0xd0(%r11),%xmm14 2116 vmovdqa 0xe0(%r11),%xmm15 2117 lea 0xf8(%r11),%rsp 2118.Ldo_avx2_epilogue: 2119___ 2120$code.=<<___ if (!$win64); 2121 lea 8(%r11),%rsp 2122.cfi_def_cfa %rsp,8 2123___ 2124$code.=<<___; 2125 vzeroupper 2126 ret 2127.cfi_endproc 2128.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 2129___ 2130####################################################################### 2131if ($avx>2) { 2132# On entry we have input length divisible by 64. But since inner loop 2133# processes 128 bytes per iteration, cases when length is not divisible 2134# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2135# reason stack layout is kept identical to poly1305_blocks_avx2. If not 2136# for this tail, we wouldn't have to even allocate stack frame... 2137 2138my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2139my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2140my $PADBIT="%zmm30"; 2141 2142map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2143map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2144map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2145map(s/%y/%z/,($MASK)); 2146 2147$code.=<<___; 2148.type poly1305_blocks_avx512,\@function,4 2149.align 32 2150poly1305_blocks_avx512: 2151.cfi_startproc 2152 endbranch 2153.Lblocks_avx512: 2154 mov \$15,%eax 2155 kmovw %eax,%k2 2156___ 2157$code.=<<___ if (!$win64); 2158 lea -8(%rsp),%r11 2159.cfi_def_cfa %r11,16 2160 sub \$0x128,%rsp 2161___ 2162$code.=<<___ if ($win64); 2163 lea -0xf8(%rsp),%r11 2164 sub \$0x1c8,%rsp 2165 vmovdqa %xmm6,0x50(%r11) 2166 vmovdqa %xmm7,0x60(%r11) 2167 vmovdqa %xmm8,0x70(%r11) 2168 vmovdqa %xmm9,0x80(%r11) 2169 vmovdqa %xmm10,0x90(%r11) 2170 vmovdqa %xmm11,0xa0(%r11) 2171 vmovdqa %xmm12,0xb0(%r11) 2172 vmovdqa %xmm13,0xc0(%r11) 2173 vmovdqa %xmm14,0xd0(%r11) 2174 vmovdqa %xmm15,0xe0(%r11) 2175.Ldo_avx512_body: 2176___ 2177$code.=<<___; 2178 lea .Lconst(%rip),%rcx 2179 lea 48+64($ctx),$ctx # size optimization 2180 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2181 2182 # expand pre-calculated table 2183 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2184 and \$-512,%rsp 2185 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2186 mov \$0x20,%rax 2187 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2188 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2189 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2190 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2191 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2192 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2193 vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} 2194 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2195 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2196 vpermd $D1,$T2,$R1 2197 vpermd $T0,$T2,$S1 2198 vpermd $D2,$T2,$R2 2199 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2200 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2201 vpermd $T1,$T2,$S2 2202 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2203 vpsrlq \$32,$R1,$T1 2204 vpermd $D3,$T2,$R3 2205 vmovdqa64 $S1,0x40(%rsp){%k2} 2206 vpermd $T3,$T2,$S3 2207 vpermd $D4,$T2,$R4 2208 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2209 vpermd $T4,$T2,$S4 2210 vmovdqa64 $S2,0x80(%rsp){%k2} 2211 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2212 vmovdqa64 $S3,0xc0(%rsp){%k2} 2213 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2214 vmovdqa64 $S4,0x100(%rsp){%k2} 2215 2216 ################################################################ 2217 # calculate 5th through 8th powers of the key 2218 # 2219 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2220 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2221 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2222 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2223 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2224 2225 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2226 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2227 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2228 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2229 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2230 vpsrlq \$32,$R2,$T2 2231 2232 vpmuludq $T1,$S4,$M0 2233 vpmuludq $T1,$R0,$M1 2234 vpmuludq $T1,$R1,$M2 2235 vpmuludq $T1,$R2,$M3 2236 vpmuludq $T1,$R3,$M4 2237 vpsrlq \$32,$R3,$T3 2238 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2239 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2240 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2241 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2242 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2243 2244 vpmuludq $T2,$S3,$M0 2245 vpmuludq $T2,$S4,$M1 2246 vpmuludq $T2,$R1,$M3 2247 vpmuludq $T2,$R2,$M4 2248 vpmuludq $T2,$R0,$M2 2249 vpsrlq \$32,$R4,$T4 2250 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2251 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2252 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2253 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2254 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2255 2256 vpmuludq $T3,$S2,$M0 2257 vpmuludq $T3,$R0,$M3 2258 vpmuludq $T3,$R1,$M4 2259 vpmuludq $T3,$S3,$M1 2260 vpmuludq $T3,$S4,$M2 2261 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2262 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2263 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2264 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2265 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2266 2267 vpmuludq $T4,$S4,$M3 2268 vpmuludq $T4,$R0,$M4 2269 vpmuludq $T4,$S1,$M0 2270 vpmuludq $T4,$S2,$M1 2271 vpmuludq $T4,$S3,$M2 2272 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2273 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2274 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2275 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2276 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2277 2278 ################################################################ 2279 # load input 2280 vmovdqu64 16*0($inp),%z#$T3 2281 vmovdqu64 16*4($inp),%z#$T4 2282 lea 16*8($inp),$inp 2283 2284 ################################################################ 2285 # lazy reduction 2286 2287 vpsrlq \$26,$D3,$M3 2288 vpandq $MASK,$D3,$D3 2289 vpaddq $M3,$D4,$D4 # d3 -> d4 2290 2291 vpsrlq \$26,$D0,$M0 2292 vpandq $MASK,$D0,$D0 2293 vpaddq $M0,$D1,$D1 # d0 -> d1 2294 2295 vpsrlq \$26,$D4,$M4 2296 vpandq $MASK,$D4,$D4 2297 2298 vpsrlq \$26,$D1,$M1 2299 vpandq $MASK,$D1,$D1 2300 vpaddq $M1,$D2,$D2 # d1 -> d2 2301 2302 vpaddq $M4,$D0,$D0 2303 vpsllq \$2,$M4,$M4 2304 vpaddq $M4,$D0,$D0 # d4 -> d0 2305 2306 vpsrlq \$26,$D2,$M2 2307 vpandq $MASK,$D2,$D2 2308 vpaddq $M2,$D3,$D3 # 
d2 -> d3 2309 2310 vpsrlq \$26,$D0,$M0 2311 vpandq $MASK,$D0,$D0 2312 vpaddq $M0,$D1,$D1 # d0 -> d1 2313 2314 vpsrlq \$26,$D3,$M3 2315 vpandq $MASK,$D3,$D3 2316 vpaddq $M3,$D4,$D4 # d3 -> d4 2317 2318 ################################################################ 2319 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2320 # $D0-$D4, ... 2321 2322 vpunpcklqdq $T4,$T3,$T0 # transpose input 2323 vpunpckhqdq $T4,$T3,$T4 2324 2325 # ... since input 64-bit lanes are ordered as 73625140, we could 2326 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2327 # we could just flow along, hence the goal for $R0-$S4 is 2328 # 1858286838784888 ... 2329 2330 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2331 mov \$0x7777,%eax 2332 kmovw %eax,%k1 2333 2334 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2335 vpermd $R1,$M0,$R1 2336 vpermd $R2,$M0,$R2 2337 vpermd $R3,$M0,$R3 2338 vpermd $R4,$M0,$R4 2339 2340 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2341 vpermd $D1,$M0,${R1}{%k1} 2342 vpermd $D2,$M0,${R2}{%k1} 2343 vpermd $D3,$M0,${R3}{%k1} 2344 vpermd $D4,$M0,${R4}{%k1} 2345 2346 vpslld \$2,$R1,$S1 # *5 2347 vpslld \$2,$R2,$S2 2348 vpslld \$2,$R3,$S3 2349 vpslld \$2,$R4,$S4 2350 vpaddd $R1,$S1,$S1 2351 vpaddd $R2,$S2,$S2 2352 vpaddd $R3,$S3,$S3 2353 vpaddd $R4,$S4,$S4 2354 2355 vpbroadcastq 32(%rcx),$PADBIT # .L129 2356 2357 vpsrlq \$52,$T0,$T2 # splat input 2358 vpsllq \$12,$T4,$T3 2359 vporq $T3,$T2,$T2 2360 vpsrlq \$26,$T0,$T1 2361 vpsrlq \$14,$T4,$T3 2362 vpsrlq \$40,$T4,$T4 # 4 2363 vpandq $MASK,$T2,$T2 # 2 2364 vpandq $MASK,$T0,$T0 # 0 2365 #vpandq $MASK,$T1,$T1 # 1 2366 #vpandq $MASK,$T3,$T3 # 3 2367 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2368 2369 vpaddq $H2,$T2,$H2 # accumulate input 2370 sub \$192,$len 2371 jbe .Ltail_avx512 2372 jmp .Loop_avx512 2373 2374.align 32 2375.Loop_avx512: 2376 ################################################################ 2377 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2378 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2379 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2380 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2381 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2382 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2383 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2384 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2385 # \________/\___________/ 2386 ################################################################ 2387 #vpaddq $H2,$T2,$H2 # accumulate input 2388 2389 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2390 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2391 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2392 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2393 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2394 # 2395 # however, as h2 is "chronologically" first one available pull 2396 # corresponding operations up, so it's 2397 # 2398 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2399 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2400 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2401 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2402 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2403 2404 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2405 vpaddq $H0,$T0,$H0 2406 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2407 vpandq $MASK,$T1,$T1 # 1 2408 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2409 vpandq $MASK,$T3,$T3 # 3 2410 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2411 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2412 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2413 vpaddq $H1,$T1,$H1 # accumulate input 2414 vpaddq $H3,$T3,$H3 2415 vpaddq $H4,$T4,$H4 2416 2417 vmovdqu64 
16*0($inp),$T3 # load input 2418 vmovdqu64 16*4($inp),$T4 2419 lea 16*8($inp),$inp 2420 vpmuludq $H0,$R3,$M3 2421 vpmuludq $H0,$R4,$M4 2422 vpmuludq $H0,$R0,$M0 2423 vpmuludq $H0,$R1,$M1 2424 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2425 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2426 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2427 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2428 2429 vpmuludq $H1,$R2,$M3 2430 vpmuludq $H1,$R3,$M4 2431 vpmuludq $H1,$S4,$M0 2432 vpmuludq $H0,$R2,$M2 2433 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2434 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2435 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2436 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2437 2438 vpunpcklqdq $T4,$T3,$T0 # transpose input 2439 vpunpckhqdq $T4,$T3,$T4 2440 2441 vpmuludq $H3,$R0,$M3 2442 vpmuludq $H3,$R1,$M4 2443 vpmuludq $H1,$R0,$M1 2444 vpmuludq $H1,$R1,$M2 2445 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2446 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2447 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2448 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2449 2450 vpmuludq $H4,$S4,$M3 2451 vpmuludq $H4,$R0,$M4 2452 vpmuludq $H3,$S2,$M0 2453 vpmuludq $H3,$S3,$M1 2454 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2455 vpmuludq $H3,$S4,$M2 2456 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2457 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2458 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2459 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2460 2461 vpmuludq $H4,$S1,$M0 2462 vpmuludq $H4,$S2,$M1 2463 vpmuludq $H4,$S3,$M2 2464 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2465 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2466 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2467 2468 ################################################################ 2469 # lazy reduction (interleaved with input splat) 2470 2471 vpsrlq \$52,$T0,$T2 # splat input 2472 vpsllq \$12,$T4,$T3 2473 2474 vpsrlq \$26,$D3,$H3 2475 vpandq $MASK,$D3,$D3 2476 vpaddq $H3,$D4,$H4 # h3 -> h4 2477 2478 vporq $T3,$T2,$T2 2479 2480 vpsrlq \$26,$H0,$D0 2481 vpandq $MASK,$H0,$H0 2482 vpaddq $D0,$H1,$H1 # h0 -> h1 2483 2484 vpandq $MASK,$T2,$T2 # 2 2485 2486 vpsrlq \$26,$H4,$D4 2487 vpandq $MASK,$H4,$H4 2488 2489 vpsrlq \$26,$H1,$D1 2490 vpandq $MASK,$H1,$H1 2491 vpaddq $D1,$H2,$H2 # h1 -> h2 2492 2493 vpaddq $D4,$H0,$H0 2494 vpsllq \$2,$D4,$D4 2495 vpaddq $D4,$H0,$H0 # h4 -> h0 2496 2497 vpaddq $T2,$H2,$H2 # modulo-scheduled 2498 vpsrlq \$26,$T0,$T1 2499 2500 vpsrlq \$26,$H2,$D2 2501 vpandq $MASK,$H2,$H2 2502 vpaddq $D2,$D3,$H3 # h2 -> h3 2503 2504 vpsrlq \$14,$T4,$T3 2505 2506 vpsrlq \$26,$H0,$D0 2507 vpandq $MASK,$H0,$H0 2508 vpaddq $D0,$H1,$H1 # h0 -> h1 2509 2510 vpsrlq \$40,$T4,$T4 # 4 2511 2512 vpsrlq \$26,$H3,$D3 2513 vpandq $MASK,$H3,$H3 2514 vpaddq $D3,$H4,$H4 # h3 -> h4 2515 2516 vpandq $MASK,$T0,$T0 # 0 2517 #vpandq $MASK,$T1,$T1 # 1 2518 #vpandq $MASK,$T3,$T3 # 3 2519 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2520 2521 sub \$128,$len 2522 ja .Loop_avx512 2523 2524.Ltail_avx512: 2525 ################################################################ 2526 # while above multiplications were by r^8 in all lanes, in last 2527 # iteration we multiply least significant lane by r^8 and most 2528 # significant one by r, that's why table gets shifted... 
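	#
	# (informal note: each qword lane of the table packs r^8, or 5*r^8
	# in the s-registers, in its low dword, which is the half vpmuludq
	# has been consuming so far, and the matching lower power in its
	# high dword; the right shifts below expose those per-lane powers,
	# so the one remaining multiplication pass weighs every lane by
	# the power Horner's rule assigns to its block before the
	# horizontal addition sums the lanes.)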
2529 2530 vpsrlq \$32,$R0,$R0 # 0105020603070408 2531 vpsrlq \$32,$R1,$R1 2532 vpsrlq \$32,$R2,$R2 2533 vpsrlq \$32,$S3,$S3 2534 vpsrlq \$32,$S4,$S4 2535 vpsrlq \$32,$R3,$R3 2536 vpsrlq \$32,$R4,$R4 2537 vpsrlq \$32,$S1,$S1 2538 vpsrlq \$32,$S2,$S2 2539 2540 ################################################################ 2541 # load either next or last 64 byte of input 2542 lea ($inp,$len),$inp 2543 2544 #vpaddq $H2,$T2,$H2 # accumulate input 2545 vpaddq $H0,$T0,$H0 2546 2547 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2548 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2549 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2550 vpandq $MASK,$T1,$T1 # 1 2551 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2552 vpandq $MASK,$T3,$T3 # 3 2553 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2554 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2555 vpaddq $H1,$T1,$H1 # accumulate input 2556 vpaddq $H3,$T3,$H3 2557 vpaddq $H4,$T4,$H4 2558 2559 vmovdqu 16*0($inp),%x#$T0 2560 vpmuludq $H0,$R3,$M3 2561 vpmuludq $H0,$R4,$M4 2562 vpmuludq $H0,$R0,$M0 2563 vpmuludq $H0,$R1,$M1 2564 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2565 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2566 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2567 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2568 2569 vmovdqu 16*1($inp),%x#$T1 2570 vpmuludq $H1,$R2,$M3 2571 vpmuludq $H1,$R3,$M4 2572 vpmuludq $H1,$S4,$M0 2573 vpmuludq $H0,$R2,$M2 2574 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2575 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2576 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2577 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2578 2579 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2580 vpmuludq $H3,$R0,$M3 2581 vpmuludq $H3,$R1,$M4 2582 vpmuludq $H1,$R0,$M1 2583 vpmuludq $H1,$R1,$M2 2584 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2585 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2586 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2587 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2588 2589 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2590 vpmuludq $H4,$S4,$M3 2591 vpmuludq $H4,$R0,$M4 2592 vpmuludq $H3,$S2,$M0 2593 vpmuludq $H3,$S3,$M1 2594 vpmuludq $H3,$S4,$M2 2595 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2596 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2597 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2598 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2599 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2600 2601 vpmuludq $H4,$S1,$M0 2602 vpmuludq $H4,$S2,$M1 2603 vpmuludq $H4,$S3,$M2 2604 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2605 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2606 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2607 2608 ################################################################ 2609 # horizontal addition 2610 2611 mov \$1,%eax 2612 vpermq \$0xb1,$H3,$D3 2613 vpermq \$0xb1,$D4,$H4 2614 vpermq \$0xb1,$H0,$D0 2615 vpermq \$0xb1,$H1,$D1 2616 vpermq \$0xb1,$H2,$D2 2617 vpaddq $D3,$H3,$H3 2618 vpaddq $D4,$H4,$H4 2619 vpaddq $D0,$H0,$H0 2620 vpaddq $D1,$H1,$H1 2621 vpaddq $D2,$H2,$H2 2622 2623 kmovw %eax,%k3 2624 vpermq \$0x2,$H3,$D3 2625 vpermq \$0x2,$H4,$D4 2626 vpermq \$0x2,$H0,$D0 2627 vpermq \$0x2,$H1,$D1 2628 vpermq \$0x2,$H2,$D2 2629 vpaddq $D3,$H3,$H3 2630 vpaddq $D4,$H4,$H4 2631 vpaddq $D0,$H0,$H0 2632 vpaddq $D1,$H1,$H1 2633 vpaddq $D2,$H2,$H2 2634 2635 vextracti64x4 \$0x1,$H3,%y#$D3 2636 vextracti64x4 \$0x1,$H4,%y#$D4 2637 vextracti64x4 \$0x1,$H0,%y#$D0 2638 vextracti64x4 \$0x1,$H1,%y#$D1 2639 vextracti64x4 \$0x1,$H2,%y#$D2 2640 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2641 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2642 vpaddq $D0,$H0,${H0}{%k3}{z} 2643 vpaddq $D1,$H1,${H1}{%k3}{z} 2644 vpaddq $D2,$H2,${H2}{%k3}{z} 2645___ 2646map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2647map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 
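########################################################################
# Reference sketch of the base 2^26 lazy reduction performed below and
# by the other AVX code paths (informal scalar notation, not part of
# the generated code). With h = h0 + h1*2^26 + h2*2^52 + h3*2^78 +
# h4*2^104, a carry out of h4 represents a multiple of 2^130 and is
# folded back as *5, because 2^130 = 5 (mod 2^130-5); *5 is computed
# as c + (c<<2):
#
#	c = h3>>26; h3 &= 0x3ffffff; h4 += c;
#	c = h0>>26; h0 &= 0x3ffffff; h1 += c;
#	c = h4>>26; h4 &= 0x3ffffff;
#	d = h1>>26; h1 &= 0x3ffffff; h2 += d;
#	h0 += c + (c<<2);
#	c = h2>>26; h2 &= 0x3ffffff; h3 += c;
#	c = h0>>26; h0 &= 0x3ffffff; h1 += c;
#	c = h3>>26; h3 &= 0x3ffffff; h4 += c;
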
2648$code.=<<___; 2649 ################################################################ 2650 # lazy reduction (interleaved with input splat) 2651 2652 vpsrlq \$26,$H3,$D3 2653 vpand $MASK,$H3,$H3 2654 vpsrldq \$6,$T0,$T2 # splat input 2655 vpsrldq \$6,$T1,$T3 2656 vpunpckhqdq $T1,$T0,$T4 # 4 2657 vpaddq $D3,$H4,$H4 # h3 -> h4 2658 2659 vpsrlq \$26,$H0,$D0 2660 vpand $MASK,$H0,$H0 2661 vpunpcklqdq $T3,$T2,$T2 # 2:3 2662 vpunpcklqdq $T1,$T0,$T0 # 0:1 2663 vpaddq $D0,$H1,$H1 # h0 -> h1 2664 2665 vpsrlq \$26,$H4,$D4 2666 vpand $MASK,$H4,$H4 2667 2668 vpsrlq \$26,$H1,$D1 2669 vpand $MASK,$H1,$H1 2670 vpsrlq \$30,$T2,$T3 2671 vpsrlq \$4,$T2,$T2 2672 vpaddq $D1,$H2,$H2 # h1 -> h2 2673 2674 vpaddq $D4,$H0,$H0 2675 vpsllq \$2,$D4,$D4 2676 vpsrlq \$26,$T0,$T1 2677 vpsrlq \$40,$T4,$T4 # 4 2678 vpaddq $D4,$H0,$H0 # h4 -> h0 2679 2680 vpsrlq \$26,$H2,$D2 2681 vpand $MASK,$H2,$H2 2682 vpand $MASK,$T2,$T2 # 2 2683 vpand $MASK,$T0,$T0 # 0 2684 vpaddq $D2,$H3,$H3 # h2 -> h3 2685 2686 vpsrlq \$26,$H0,$D0 2687 vpand $MASK,$H0,$H0 2688 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2689 vpand $MASK,$T1,$T1 # 1 2690 vpaddq $D0,$H1,$H1 # h0 -> h1 2691 2692 vpsrlq \$26,$H3,$D3 2693 vpand $MASK,$H3,$H3 2694 vpand $MASK,$T3,$T3 # 3 2695 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2696 vpaddq $D3,$H4,$H4 # h3 -> h4 2697 2698 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2699 add \$64,$len 2700 jnz .Ltail_avx2 2701 2702 vpsubq $T2,$H2,$H2 # undo input accumulation 2703 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2704 vmovd %x#$H1,`4*1-48-64`($ctx) 2705 vmovd %x#$H2,`4*2-48-64`($ctx) 2706 vmovd %x#$H3,`4*3-48-64`($ctx) 2707 vmovd %x#$H4,`4*4-48-64`($ctx) 2708 vzeroall 2709___ 2710$code.=<<___ if ($win64); 2711 movdqa 0x50(%r11),%xmm6 2712 movdqa 0x60(%r11),%xmm7 2713 movdqa 0x70(%r11),%xmm8 2714 movdqa 0x80(%r11),%xmm9 2715 movdqa 0x90(%r11),%xmm10 2716 movdqa 0xa0(%r11),%xmm11 2717 movdqa 0xb0(%r11),%xmm12 2718 movdqa 0xc0(%r11),%xmm13 2719 movdqa 0xd0(%r11),%xmm14 2720 movdqa 0xe0(%r11),%xmm15 2721 lea 0xf8(%r11),%rsp 2722.Ldo_avx512_epilogue: 2723___ 2724$code.=<<___ if (!$win64); 2725 lea 8(%r11),%rsp 2726.cfi_def_cfa %rsp,8 2727___ 2728$code.=<<___; 2729 ret 2730.cfi_endproc 2731.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 2732___ 2733if ($avx>3 && !$win64) { 2734######################################################################## 2735# VPMADD52 version using 2^44 radix. 2736# 2737# One can argue that base 2^52 would be more natural. Well, even though 2738# some operations would be more natural, one has to recognize couple of 2739# things. Base 2^52 doesn't provide advantage over base 2^44 if you look 2740# at amount of multiply-n-accumulate operations. Secondly, it makes it 2741# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2742# reference implementations], which means that more such operations 2743# would have to be performed in inner loop, which in turn makes critical 2744# path longer. In other words, even though base 2^44 reduction might 2745# look less elegant, overall critical path is actually shorter... 2746 2747######################################################################## 2748# Layout of opaque area is following. 
2749# 2750# unsigned __int64 h[3]; # current hash value base 2^44 2751# unsigned __int64 s[2]; # key value*20 base 2^44 2752# unsigned __int64 r[3]; # key value base 2^44 2753# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2754# # r^n positions reflect 2755# # placement in register, not 2756# # memory, R[3] is R[1]*20 2757 2758$code.=<<___; 2759.type poly1305_init_base2_44,\@function,3 2760.align 32 2761poly1305_init_base2_44: 2762.cfi_startproc 2763 xor %rax,%rax 2764 mov %rax,0($ctx) # initialize hash value 2765 mov %rax,8($ctx) 2766 mov %rax,16($ctx) 2767 2768.Linit_base2_44: 2769 lea poly1305_blocks_vpmadd52(%rip),%r10 2770 lea poly1305_emit_base2_44(%rip),%r11 2771 2772 mov \$0x0ffffffc0fffffff,%rax 2773 mov \$0x0ffffffc0ffffffc,%rcx 2774 and 0($inp),%rax 2775 mov \$0x00000fffffffffff,%r8 2776 and 8($inp),%rcx 2777 mov \$0x00000fffffffffff,%r9 2778 and %rax,%r8 2779 shrd \$44,%rcx,%rax 2780 mov %r8,40($ctx) # r0 2781 and %r9,%rax 2782 shr \$24,%rcx 2783 mov %rax,48($ctx) # r1 2784 lea (%rax,%rax,4),%rax # *5 2785 mov %rcx,56($ctx) # r2 2786 shl \$2,%rax # magic <<2 2787 lea (%rcx,%rcx,4),%rcx # *5 2788 shl \$2,%rcx # magic <<2 2789 mov %rax,24($ctx) # s1 2790 mov %rcx,32($ctx) # s2 2791 movq \$-1,64($ctx) # write impossible value 2792___ 2793$code.=<<___ if ($flavour !~ /elf32/); 2794 mov %r10,0(%rdx) 2795 mov %r11,8(%rdx) 2796___ 2797$code.=<<___ if ($flavour =~ /elf32/); 2798 mov %r10d,0(%rdx) 2799 mov %r11d,4(%rdx) 2800___ 2801$code.=<<___; 2802 mov \$1,%eax 2803 ret 2804.cfi_endproc 2805.size poly1305_init_base2_44,.-poly1305_init_base2_44 2806___ 2807{ 2808my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2809my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2810my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2811 2812$code.=<<___; 2813.type poly1305_blocks_vpmadd52,\@function,4 2814.align 32 2815poly1305_blocks_vpmadd52: 2816.cfi_startproc 2817 endbranch 2818 shr \$4,$len 2819 jz .Lno_data_vpmadd52 # too short 2820 2821 shl \$40,$padbit 2822 mov 64($ctx),%r8 # peek on power of the key 2823 2824 # if powers of the key are not calculated yet, process up to 3 2825 # blocks with this single-block subroutine, otherwise ensure that 2826 # length is divisible by 2 blocks and pass the rest down to next 2827 # subroutine... 2828 2829 mov \$3,%rax 2830 mov \$1,%r10 2831 cmp \$4,$len # is input long 2832 cmovae %r10,%rax 2833 test %r8,%r8 # is power value impossible? 2834 cmovns %r10,%rax 2835 2836 and $len,%rax # is input of favourable length? 
2837 jz .Lblocks_vpmadd52_4x 2838 2839 sub %rax,$len 2840 mov \$7,%r10d 2841 mov \$1,%r11d 2842 kmovw %r10d,%k7 2843 lea .L2_44_inp_permd(%rip),%r10 2844 kmovw %r11d,%k1 2845 2846 vmovq $padbit,%x#$PAD 2847 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2848 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2849 vpermq \$0xcf,$PAD,$PAD 2850 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2851 2852 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2853 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2854 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2855 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2856 2857 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2858 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2859 2860 jmp .Loop_vpmadd52 2861 2862.align 32 2863.Loop_vpmadd52: 2864 vmovdqu32 0($inp),%x#$T0 # load input as ----3210 2865 lea 16($inp),$inp 2866 2867 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2868 vpsrlvq $inp_shift,$T0,$T0 2869 vpandq $reduc_mask,$T0,$T0 2870 vporq $PAD,$T0,$T0 2871 2872 vpaddq $T0,$Dlo,$Dlo # accumulate input 2873 2874 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2875 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2876 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2877 2878 vpxord $Dlo,$Dlo,$Dlo 2879 vpxord $Dhi,$Dhi,$Dhi 2880 2881 vpmadd52luq $r2r1r0,$H0,$Dlo 2882 vpmadd52huq $r2r1r0,$H0,$Dhi 2883 2884 vpmadd52luq $r1r0s2,$H1,$Dlo 2885 vpmadd52huq $r1r0s2,$H1,$Dhi 2886 2887 vpmadd52luq $r0s2s1,$H2,$Dlo 2888 vpmadd52huq $r0s2s1,$H2,$Dhi 2889 2890 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2891 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2892 vpandq $reduc_mask,$Dlo,$Dlo 2893 2894 vpaddq $T0,$Dhi,$Dhi 2895 2896 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 2897 2898 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 2899 2900 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 2901 vpandq $reduc_mask,$Dlo,$Dlo 2902 2903 vpermq \$0b10010011,$T0,$T0 2904 2905 vpaddq $T0,$Dlo,$Dlo 2906 2907 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 2908 2909 vpaddq $T0,$Dlo,$Dlo 2910 vpsllq \$2,$T0,$T0 2911 2912 vpaddq $T0,$Dlo,$Dlo 2913 2914 dec %rax # len-=16 2915 jnz .Loop_vpmadd52 2916 2917 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 2918 2919 test $len,$len 2920 jnz .Lblocks_vpmadd52_4x 2921 2922.Lno_data_vpmadd52: 2923 ret 2924.cfi_endproc 2925.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 2926___ 2927} 2928{ 2929######################################################################## 2930# As implied by its name 4x subroutine processes 4 blocks in parallel 2931# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 2932# and is handled in 256-bit %ymm registers. 2933 2934my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 2935my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 2936my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 2937 2938$code.=<<___; 2939.type poly1305_blocks_vpmadd52_4x,\@function,4 2940.align 32 2941poly1305_blocks_vpmadd52_4x: 2942.cfi_startproc 2943 shr \$4,$len 2944 jz .Lno_data_vpmadd52_4x # too short 2945 2946 shl \$40,$padbit 2947 mov 64($ctx),%r8 # peek on power of the key 2948 2949.Lblocks_vpmadd52_4x: 2950 vpbroadcastq $padbit,$PAD 2951 2952 vmovdqa64 .Lx_mask44(%rip),$mask44 2953 mov \$5,%eax 2954 vmovdqa64 .Lx_mask42(%rip),$mask42 2955 kmovw %eax,%k1 # used in 2x path 2956 2957 test %r8,%r8 # is power value impossible? 
2958 js .Linit_vpmadd52 # if it is, then init R[4] 2959 2960 vmovq 0($ctx),%x#$H0 # load current hash value 2961 vmovq 8($ctx),%x#$H1 2962 vmovq 16($ctx),%x#$H2 2963 2964 test \$3,$len # is length 4*n+2? 2965 jnz .Lblocks_vpmadd52_2x_do 2966 2967.Lblocks_vpmadd52_4x_do: 2968 vpbroadcastq 64($ctx),$R0 # load 4th power of the key 2969 vpbroadcastq 96($ctx),$R1 2970 vpbroadcastq 128($ctx),$R2 2971 vpbroadcastq 160($ctx),$S1 2972 2973.Lblocks_vpmadd52_4x_key_loaded: 2974 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 2975 vpaddq $R2,$S2,$S2 2976 vpsllq \$2,$S2,$S2 2977 2978 test \$7,$len # is len 8*n? 2979 jz .Lblocks_vpmadd52_8x 2980 2981 vmovdqu64 16*0($inp),$T2 # load data 2982 vmovdqu64 16*2($inp),$T3 2983 lea 16*4($inp),$inp 2984 2985 vpunpcklqdq $T3,$T2,$T1 # transpose data 2986 vpunpckhqdq $T3,$T2,$T3 2987 2988 # at this point 64-bit lanes are ordered as 3-1-2-0 2989 2990 vpsrlq \$24,$T3,$T2 # splat the data 2991 vporq $PAD,$T2,$T2 2992 vpaddq $T2,$H2,$H2 # accumulate input 2993 vpandq $mask44,$T1,$T0 2994 vpsrlq \$44,$T1,$T1 2995 vpsllq \$20,$T3,$T3 2996 vporq $T3,$T1,$T1 2997 vpandq $mask44,$T1,$T1 2998 2999 sub \$4,$len 3000 jz .Ltail_vpmadd52_4x 3001 jmp .Loop_vpmadd52_4x 3002 ud2 3003 3004.align 32 3005.Linit_vpmadd52: 3006 vmovq 24($ctx),%x#$S1 # load key 3007 vmovq 56($ctx),%x#$H2 3008 vmovq 32($ctx),%x#$S2 3009 vmovq 40($ctx),%x#$R0 3010 vmovq 48($ctx),%x#$R1 3011 3012 vmovdqa $R0,$H0 3013 vmovdqa $R1,$H1 3014 vmovdqa $H2,$R2 3015 3016 mov \$2,%eax 3017 3018.Lmul_init_vpmadd52: 3019 vpxorq $D0lo,$D0lo,$D0lo 3020 vpmadd52luq $H2,$S1,$D0lo 3021 vpxorq $D0hi,$D0hi,$D0hi 3022 vpmadd52huq $H2,$S1,$D0hi 3023 vpxorq $D1lo,$D1lo,$D1lo 3024 vpmadd52luq $H2,$S2,$D1lo 3025 vpxorq $D1hi,$D1hi,$D1hi 3026 vpmadd52huq $H2,$S2,$D1hi 3027 vpxorq $D2lo,$D2lo,$D2lo 3028 vpmadd52luq $H2,$R0,$D2lo 3029 vpxorq $D2hi,$D2hi,$D2hi 3030 vpmadd52huq $H2,$R0,$D2hi 3031 3032 vpmadd52luq $H0,$R0,$D0lo 3033 vpmadd52huq $H0,$R0,$D0hi 3034 vpmadd52luq $H0,$R1,$D1lo 3035 vpmadd52huq $H0,$R1,$D1hi 3036 vpmadd52luq $H0,$R2,$D2lo 3037 vpmadd52huq $H0,$R2,$D2hi 3038 3039 vpmadd52luq $H1,$S2,$D0lo 3040 vpmadd52huq $H1,$S2,$D0hi 3041 vpmadd52luq $H1,$R0,$D1lo 3042 vpmadd52huq $H1,$R0,$D1hi 3043 vpmadd52luq $H1,$R1,$D2lo 3044 vpmadd52huq $H1,$R1,$D2hi 3045 3046 ################################################################ 3047 # partial reduction 3048 vpsrlq \$44,$D0lo,$tmp 3049 vpsllq \$8,$D0hi,$D0hi 3050 vpandq $mask44,$D0lo,$H0 3051 vpaddq $tmp,$D0hi,$D0hi 3052 3053 vpaddq $D0hi,$D1lo,$D1lo 3054 3055 vpsrlq \$44,$D1lo,$tmp 3056 vpsllq \$8,$D1hi,$D1hi 3057 vpandq $mask44,$D1lo,$H1 3058 vpaddq $tmp,$D1hi,$D1hi 3059 3060 vpaddq $D1hi,$D2lo,$D2lo 3061 3062 vpsrlq \$42,$D2lo,$tmp 3063 vpsllq \$10,$D2hi,$D2hi 3064 vpandq $mask42,$D2lo,$H2 3065 vpaddq $tmp,$D2hi,$D2hi 3066 3067 vpaddq $D2hi,$H0,$H0 3068 vpsllq \$2,$D2hi,$D2hi 3069 3070 vpaddq $D2hi,$H0,$H0 3071 3072 vpsrlq \$44,$H0,$tmp # additional step 3073 vpandq $mask44,$H0,$H0 3074 3075 vpaddq $tmp,$H1,$H1 3076 3077 dec %eax 3078 jz .Ldone_init_vpmadd52 3079 3080 vpunpcklqdq $R1,$H1,$R1 # 1,2 3081 vpbroadcastq %x#$H1,%x#$H1 # 2,2 3082 vpunpcklqdq $R2,$H2,$R2 3083 vpbroadcastq %x#$H2,%x#$H2 3084 vpunpcklqdq $R0,$H0,$R0 3085 vpbroadcastq %x#$H0,%x#$H0 3086 3087 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3088 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3089 vpaddq $R1,$S1,$S1 3090 vpaddq $R2,$S2,$S2 3091 vpsllq \$2,$S1,$S1 3092 vpsllq \$2,$S2,$S2 3093 3094 jmp .Lmul_init_vpmadd52 3095 ud2 3096 3097.align 32 3098.Ldone_init_vpmadd52: 3099 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3100 vinserti128 
\$1,%x#$R2,$H2,$R2 3101 vinserti128 \$1,%x#$R0,$H0,$R0 3102 3103 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 3104 vpermq \$0b11011000,$R2,$R2 3105 vpermq \$0b11011000,$R0,$R0 3106 3107 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3108 vpaddq $R1,$S1,$S1 3109 vpsllq \$2,$S1,$S1 3110 3111 vmovq 0($ctx),%x#$H0 # load current hash value 3112 vmovq 8($ctx),%x#$H1 3113 vmovq 16($ctx),%x#$H2 3114 3115 test \$3,$len # is length 4*n+2? 3116 jnz .Ldone_init_vpmadd52_2x 3117 3118 vmovdqu64 $R0,64($ctx) # save key powers 3119 vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3120 vmovdqu64 $R1,96($ctx) 3121 vpbroadcastq %x#$R1,$R1 3122 vmovdqu64 $R2,128($ctx) 3123 vpbroadcastq %x#$R2,$R2 3124 vmovdqu64 $S1,160($ctx) 3125 vpbroadcastq %x#$S1,$S1 3126 3127 jmp .Lblocks_vpmadd52_4x_key_loaded 3128 ud2 3129 3130.align 32 3131.Ldone_init_vpmadd52_2x: 3132 vmovdqu64 $R0,64($ctx) # save key powers 3133 vpsrldq \$8,$R0,$R0 # 0-1-0-2 3134 vmovdqu64 $R1,96($ctx) 3135 vpsrldq \$8,$R1,$R1 3136 vmovdqu64 $R2,128($ctx) 3137 vpsrldq \$8,$R2,$R2 3138 vmovdqu64 $S1,160($ctx) 3139 vpsrldq \$8,$S1,$S1 3140 jmp .Lblocks_vpmadd52_2x_key_loaded 3141 ud2 3142 3143.align 32 3144.Lblocks_vpmadd52_2x_do: 3145 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3146 vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3147 vmovdqu64 64+8($ctx),${R0}{%k1}{z} 3148 vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3149 3150.Lblocks_vpmadd52_2x_key_loaded: 3151 vmovdqu64 16*0($inp),$T2 # load data 3152 vpxorq $T3,$T3,$T3 3153 lea 16*2($inp),$inp 3154 3155 vpunpcklqdq $T3,$T2,$T1 # transpose data 3156 vpunpckhqdq $T3,$T2,$T3 3157 3158 # at this point 64-bit lanes are ordered as x-1-x-0 3159 3160 vpsrlq \$24,$T3,$T2 # splat the data 3161 vporq $PAD,$T2,$T2 3162 vpaddq $T2,$H2,$H2 # accumulate input 3163 vpandq $mask44,$T1,$T0 3164 vpsrlq \$44,$T1,$T1 3165 vpsllq \$20,$T3,$T3 3166 vporq $T3,$T1,$T1 3167 vpandq $mask44,$T1,$T1 3168 3169 jmp .Ltail_vpmadd52_2x 3170 ud2 3171 3172.align 32 3173.Loop_vpmadd52_4x: 3174 #vpaddq $T2,$H2,$H2 # accumulate input 3175 vpaddq $T0,$H0,$H0 3176 vpaddq $T1,$H1,$H1 3177 3178 vpxorq $D0lo,$D0lo,$D0lo 3179 vpmadd52luq $H2,$S1,$D0lo 3180 vpxorq $D0hi,$D0hi,$D0hi 3181 vpmadd52huq $H2,$S1,$D0hi 3182 vpxorq $D1lo,$D1lo,$D1lo 3183 vpmadd52luq $H2,$S2,$D1lo 3184 vpxorq $D1hi,$D1hi,$D1hi 3185 vpmadd52huq $H2,$S2,$D1hi 3186 vpxorq $D2lo,$D2lo,$D2lo 3187 vpmadd52luq $H2,$R0,$D2lo 3188 vpxorq $D2hi,$D2hi,$D2hi 3189 vpmadd52huq $H2,$R0,$D2hi 3190 3191 vmovdqu64 16*0($inp),$T2 # load data 3192 vmovdqu64 16*2($inp),$T3 3193 lea 16*4($inp),$inp 3194 vpmadd52luq $H0,$R0,$D0lo 3195 vpmadd52huq $H0,$R0,$D0hi 3196 vpmadd52luq $H0,$R1,$D1lo 3197 vpmadd52huq $H0,$R1,$D1hi 3198 vpmadd52luq $H0,$R2,$D2lo 3199 vpmadd52huq $H0,$R2,$D2hi 3200 3201 vpunpcklqdq $T3,$T2,$T1 # transpose data 3202 vpunpckhqdq $T3,$T2,$T3 3203 vpmadd52luq $H1,$S2,$D0lo 3204 vpmadd52huq $H1,$S2,$D0hi 3205 vpmadd52luq $H1,$R0,$D1lo 3206 vpmadd52huq $H1,$R0,$D1hi 3207 vpmadd52luq $H1,$R1,$D2lo 3208 vpmadd52huq $H1,$R1,$D2hi 3209 3210 ################################################################ 3211 # partial reduction (interleaved with data splat) 3212 vpsrlq \$44,$D0lo,$tmp 3213 vpsllq \$8,$D0hi,$D0hi 3214 vpandq $mask44,$D0lo,$H0 3215 vpaddq $tmp,$D0hi,$D0hi 3216 3217 vpsrlq \$24,$T3,$T2 3218 vporq $PAD,$T2,$T2 3219 vpaddq $D0hi,$D1lo,$D1lo 3220 3221 vpsrlq \$44,$D1lo,$tmp 3222 vpsllq \$8,$D1hi,$D1hi 3223 vpandq $mask44,$D1lo,$H1 3224 vpaddq $tmp,$D1hi,$D1hi 3225 3226 vpandq $mask44,$T1,$T0 3227 vpsrlq \$44,$T1,$T1 3228 vpsllq \$20,$T3,$T3 3229 vpaddq $D1hi,$D2lo,$D2lo 3230 3231 vpsrlq 
\$42,$D2lo,$tmp 3232 vpsllq \$10,$D2hi,$D2hi 3233 vpandq $mask42,$D2lo,$H2 3234 vpaddq $tmp,$D2hi,$D2hi 3235 3236 vpaddq $T2,$H2,$H2 # accumulate input 3237 vpaddq $D2hi,$H0,$H0 3238 vpsllq \$2,$D2hi,$D2hi 3239 3240 vpaddq $D2hi,$H0,$H0 3241 vporq $T3,$T1,$T1 3242 vpandq $mask44,$T1,$T1 3243 3244 vpsrlq \$44,$H0,$tmp # additional step 3245 vpandq $mask44,$H0,$H0 3246 3247 vpaddq $tmp,$H1,$H1 3248 3249 sub \$4,$len # len-=64 3250 jnz .Loop_vpmadd52_4x 3251 3252.Ltail_vpmadd52_4x: 3253 vmovdqu64 128($ctx),$R2 # load all key powers 3254 vmovdqu64 160($ctx),$S1 3255 vmovdqu64 64($ctx),$R0 3256 vmovdqu64 96($ctx),$R1 3257 3258.Ltail_vpmadd52_2x: 3259 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3260 vpaddq $R2,$S2,$S2 3261 vpsllq \$2,$S2,$S2 3262 3263 #vpaddq $T2,$H2,$H2 # accumulate input 3264 vpaddq $T0,$H0,$H0 3265 vpaddq $T1,$H1,$H1 3266 3267 vpxorq $D0lo,$D0lo,$D0lo 3268 vpmadd52luq $H2,$S1,$D0lo 3269 vpxorq $D0hi,$D0hi,$D0hi 3270 vpmadd52huq $H2,$S1,$D0hi 3271 vpxorq $D1lo,$D1lo,$D1lo 3272 vpmadd52luq $H2,$S2,$D1lo 3273 vpxorq $D1hi,$D1hi,$D1hi 3274 vpmadd52huq $H2,$S2,$D1hi 3275 vpxorq $D2lo,$D2lo,$D2lo 3276 vpmadd52luq $H2,$R0,$D2lo 3277 vpxorq $D2hi,$D2hi,$D2hi 3278 vpmadd52huq $H2,$R0,$D2hi 3279 3280 vpmadd52luq $H0,$R0,$D0lo 3281 vpmadd52huq $H0,$R0,$D0hi 3282 vpmadd52luq $H0,$R1,$D1lo 3283 vpmadd52huq $H0,$R1,$D1hi 3284 vpmadd52luq $H0,$R2,$D2lo 3285 vpmadd52huq $H0,$R2,$D2hi 3286 3287 vpmadd52luq $H1,$S2,$D0lo 3288 vpmadd52huq $H1,$S2,$D0hi 3289 vpmadd52luq $H1,$R0,$D1lo 3290 vpmadd52huq $H1,$R0,$D1hi 3291 vpmadd52luq $H1,$R1,$D2lo 3292 vpmadd52huq $H1,$R1,$D2hi 3293 3294 ################################################################ 3295 # horizontal addition 3296 3297 mov \$1,%eax 3298 kmovw %eax,%k1 3299 vpsrldq \$8,$D0lo,$T0 3300 vpsrldq \$8,$D0hi,$H0 3301 vpsrldq \$8,$D1lo,$T1 3302 vpsrldq \$8,$D1hi,$H1 3303 vpaddq $T0,$D0lo,$D0lo 3304 vpaddq $H0,$D0hi,$D0hi 3305 vpsrldq \$8,$D2lo,$T2 3306 vpsrldq \$8,$D2hi,$H2 3307 vpaddq $T1,$D1lo,$D1lo 3308 vpaddq $H1,$D1hi,$D1hi 3309 vpermq \$0x2,$D0lo,$T0 3310 vpermq \$0x2,$D0hi,$H0 3311 vpaddq $T2,$D2lo,$D2lo 3312 vpaddq $H2,$D2hi,$D2hi 3313 3314 vpermq \$0x2,$D1lo,$T1 3315 vpermq \$0x2,$D1hi,$H1 3316 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3317 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3318 vpermq \$0x2,$D2lo,$T2 3319 vpermq \$0x2,$D2hi,$H2 3320 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3321 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3322 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3323 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3324 3325 ################################################################ 3326 # partial reduction 3327 vpsrlq \$44,$D0lo,$tmp 3328 vpsllq \$8,$D0hi,$D0hi 3329 vpandq $mask44,$D0lo,$H0 3330 vpaddq $tmp,$D0hi,$D0hi 3331 3332 vpaddq $D0hi,$D1lo,$D1lo 3333 3334 vpsrlq \$44,$D1lo,$tmp 3335 vpsllq \$8,$D1hi,$D1hi 3336 vpandq $mask44,$D1lo,$H1 3337 vpaddq $tmp,$D1hi,$D1hi 3338 3339 vpaddq $D1hi,$D2lo,$D2lo 3340 3341 vpsrlq \$42,$D2lo,$tmp 3342 vpsllq \$10,$D2hi,$D2hi 3343 vpandq $mask42,$D2lo,$H2 3344 vpaddq $tmp,$D2hi,$D2hi 3345 3346 vpaddq $D2hi,$H0,$H0 3347 vpsllq \$2,$D2hi,$D2hi 3348 3349 vpaddq $D2hi,$H0,$H0 3350 3351 vpsrlq \$44,$H0,$tmp # additional step 3352 vpandq $mask44,$H0,$H0 3353 3354 vpaddq $tmp,$H1,$H1 3355 # at this point $len is 3356 # either 4*n+2 or 0... 
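					# (note: the 4*n+2 case, which
					# reached this tail through the
					# 2x path, has just absorbed its
					# first two blocks and jumps back
					# to .Lblocks_vpmadd52_4x_do to
					# mop up the remaining 4*n with
					# the 4th key power; 0 falls
					# through and the hash is stored)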
3357 sub \$2,$len # len-=32 3358 ja .Lblocks_vpmadd52_4x_do 3359 3360 vmovq %x#$H0,0($ctx) 3361 vmovq %x#$H1,8($ctx) 3362 vmovq %x#$H2,16($ctx) 3363 vzeroall 3364 3365.Lno_data_vpmadd52_4x: 3366 ret 3367.cfi_endproc 3368.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3369___ 3370} 3371{ 3372######################################################################## 3373# As implied by its name 8x subroutine processes 8 blocks in parallel... 3374# This is intermediate version, as it's used only in cases when input 3375# length is either 8*n, 8*n+1 or 8*n+2... 3376 3377my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3378my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3379my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3380my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3381 3382$code.=<<___; 3383.type poly1305_blocks_vpmadd52_8x,\@function,4 3384.align 32 3385poly1305_blocks_vpmadd52_8x: 3386.cfi_startproc 3387 shr \$4,$len 3388 jz .Lno_data_vpmadd52_8x # too short 3389 3390 shl \$40,$padbit 3391 mov 64($ctx),%r8 # peek on power of the key 3392 3393 vmovdqa64 .Lx_mask44(%rip),$mask44 3394 vmovdqa64 .Lx_mask42(%rip),$mask42 3395 3396 test %r8,%r8 # is power value impossible? 3397 js .Linit_vpmadd52 # if it is, then init R[4] 3398 3399 vmovq 0($ctx),%x#$H0 # load current hash value 3400 vmovq 8($ctx),%x#$H1 3401 vmovq 16($ctx),%x#$H2 3402 3403.Lblocks_vpmadd52_8x: 3404 ################################################################ 3405 # fist we calculate more key powers 3406 3407 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3408 vmovdqu64 160($ctx),$S1 3409 vmovdqu64 64($ctx),$R0 3410 vmovdqu64 96($ctx),$R1 3411 3412 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3413 vpaddq $R2,$S2,$S2 3414 vpsllq \$2,$S2,$S2 3415 3416 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3417 vpbroadcastq %x#$R0,$RR0 3418 vpbroadcastq %x#$R1,$RR1 3419 3420 vpxorq $D0lo,$D0lo,$D0lo 3421 vpmadd52luq $RR2,$S1,$D0lo 3422 vpxorq $D0hi,$D0hi,$D0hi 3423 vpmadd52huq $RR2,$S1,$D0hi 3424 vpxorq $D1lo,$D1lo,$D1lo 3425 vpmadd52luq $RR2,$S2,$D1lo 3426 vpxorq $D1hi,$D1hi,$D1hi 3427 vpmadd52huq $RR2,$S2,$D1hi 3428 vpxorq $D2lo,$D2lo,$D2lo 3429 vpmadd52luq $RR2,$R0,$D2lo 3430 vpxorq $D2hi,$D2hi,$D2hi 3431 vpmadd52huq $RR2,$R0,$D2hi 3432 3433 vpmadd52luq $RR0,$R0,$D0lo 3434 vpmadd52huq $RR0,$R0,$D0hi 3435 vpmadd52luq $RR0,$R1,$D1lo 3436 vpmadd52huq $RR0,$R1,$D1hi 3437 vpmadd52luq $RR0,$R2,$D2lo 3438 vpmadd52huq $RR0,$R2,$D2hi 3439 3440 vpmadd52luq $RR1,$S2,$D0lo 3441 vpmadd52huq $RR1,$S2,$D0hi 3442 vpmadd52luq $RR1,$R0,$D1lo 3443 vpmadd52huq $RR1,$R0,$D1hi 3444 vpmadd52luq $RR1,$R1,$D2lo 3445 vpmadd52huq $RR1,$R1,$D2hi 3446 3447 ################################################################ 3448 # partial reduction 3449 vpsrlq \$44,$D0lo,$tmp 3450 vpsllq \$8,$D0hi,$D0hi 3451 vpandq $mask44,$D0lo,$RR0 3452 vpaddq $tmp,$D0hi,$D0hi 3453 3454 vpaddq $D0hi,$D1lo,$D1lo 3455 3456 vpsrlq \$44,$D1lo,$tmp 3457 vpsllq \$8,$D1hi,$D1hi 3458 vpandq $mask44,$D1lo,$RR1 3459 vpaddq $tmp,$D1hi,$D1hi 3460 3461 vpaddq $D1hi,$D2lo,$D2lo 3462 3463 vpsrlq \$42,$D2lo,$tmp 3464 vpsllq \$10,$D2hi,$D2hi 3465 vpandq $mask42,$D2lo,$RR2 3466 vpaddq $tmp,$D2hi,$D2hi 3467 3468 vpaddq $D2hi,$RR0,$RR0 3469 vpsllq \$2,$D2hi,$D2hi 3470 3471 vpaddq $D2hi,$RR0,$RR0 3472 3473 vpsrlq \$44,$RR0,$tmp # additional step 3474 vpandq $mask44,$RR0,$RR0 3475 3476 vpaddq $tmp,$RR1,$RR1 3477 3478 ################################################################ 3479 # At this point Rx holds 1324 powers, RRx 
- 5768, and the goal 3480 # is 15263748, which reflects how data is loaded... 3481 3482 vpunpcklqdq $R2,$RR2,$T2 # 3748 3483 vpunpckhqdq $R2,$RR2,$R2 # 1526 3484 vpunpcklqdq $R0,$RR0,$T0 3485 vpunpckhqdq $R0,$RR0,$R0 3486 vpunpcklqdq $R1,$RR1,$T1 3487 vpunpckhqdq $R1,$RR1,$R1 3488___ 3489######## switch to %zmm 3490map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3491map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3492map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3493map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3494 3495$code.=<<___; 3496 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3497 vshufi64x2 \$0x44,$R0,$T0,$RR0 3498 vshufi64x2 \$0x44,$R1,$T1,$RR1 3499 3500 vmovdqu64 16*0($inp),$T2 # load data 3501 vmovdqu64 16*4($inp),$T3 3502 lea 16*8($inp),$inp 3503 3504 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3505 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3506 vpaddq $RR2,$SS2,$SS2 3507 vpaddq $RR1,$SS1,$SS1 3508 vpsllq \$2,$SS2,$SS2 3509 vpsllq \$2,$SS1,$SS1 3510 3511 vpbroadcastq $padbit,$PAD 3512 vpbroadcastq %x#$mask44,$mask44 3513 vpbroadcastq %x#$mask42,$mask42 3514 3515 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3516 vpbroadcastq %x#$SS2,$S2 3517 vpbroadcastq %x#$RR0,$R0 3518 vpbroadcastq %x#$RR1,$R1 3519 vpbroadcastq %x#$RR2,$R2 3520 3521 vpunpcklqdq $T3,$T2,$T1 # transpose data 3522 vpunpckhqdq $T3,$T2,$T3 3523 3524 # at this point 64-bit lanes are ordered as 73625140 3525 3526 vpsrlq \$24,$T3,$T2 # splat the data 3527 vporq $PAD,$T2,$T2 3528 vpaddq $T2,$H2,$H2 # accumulate input 3529 vpandq $mask44,$T1,$T0 3530 vpsrlq \$44,$T1,$T1 3531 vpsllq \$20,$T3,$T3 3532 vporq $T3,$T1,$T1 3533 vpandq $mask44,$T1,$T1 3534 3535 sub \$8,$len 3536 jz .Ltail_vpmadd52_8x 3537 jmp .Loop_vpmadd52_8x 3538 3539.align 32 3540.Loop_vpmadd52_8x: 3541 #vpaddq $T2,$H2,$H2 # accumulate input 3542 vpaddq $T0,$H0,$H0 3543 vpaddq $T1,$H1,$H1 3544 3545 vpxorq $D0lo,$D0lo,$D0lo 3546 vpmadd52luq $H2,$S1,$D0lo 3547 vpxorq $D0hi,$D0hi,$D0hi 3548 vpmadd52huq $H2,$S1,$D0hi 3549 vpxorq $D1lo,$D1lo,$D1lo 3550 vpmadd52luq $H2,$S2,$D1lo 3551 vpxorq $D1hi,$D1hi,$D1hi 3552 vpmadd52huq $H2,$S2,$D1hi 3553 vpxorq $D2lo,$D2lo,$D2lo 3554 vpmadd52luq $H2,$R0,$D2lo 3555 vpxorq $D2hi,$D2hi,$D2hi 3556 vpmadd52huq $H2,$R0,$D2hi 3557 3558 vmovdqu64 16*0($inp),$T2 # load data 3559 vmovdqu64 16*4($inp),$T3 3560 lea 16*8($inp),$inp 3561 vpmadd52luq $H0,$R0,$D0lo 3562 vpmadd52huq $H0,$R0,$D0hi 3563 vpmadd52luq $H0,$R1,$D1lo 3564 vpmadd52huq $H0,$R1,$D1hi 3565 vpmadd52luq $H0,$R2,$D2lo 3566 vpmadd52huq $H0,$R2,$D2hi 3567 3568 vpunpcklqdq $T3,$T2,$T1 # transpose data 3569 vpunpckhqdq $T3,$T2,$T3 3570 vpmadd52luq $H1,$S2,$D0lo 3571 vpmadd52huq $H1,$S2,$D0hi 3572 vpmadd52luq $H1,$R0,$D1lo 3573 vpmadd52huq $H1,$R0,$D1hi 3574 vpmadd52luq $H1,$R1,$D2lo 3575 vpmadd52huq $H1,$R1,$D2hi 3576 3577 ################################################################ 3578 # partial reduction (interleaved with data splat) 3579 vpsrlq \$44,$D0lo,$tmp 3580 vpsllq \$8,$D0hi,$D0hi 3581 vpandq $mask44,$D0lo,$H0 3582 vpaddq $tmp,$D0hi,$D0hi 3583 3584 vpsrlq \$24,$T3,$T2 3585 vporq $PAD,$T2,$T2 3586 vpaddq $D0hi,$D1lo,$D1lo 3587 3588 vpsrlq \$44,$D1lo,$tmp 3589 vpsllq \$8,$D1hi,$D1hi 3590 vpandq $mask44,$D1lo,$H1 3591 vpaddq $tmp,$D1hi,$D1hi 3592 3593 vpandq $mask44,$T1,$T0 3594 vpsrlq \$44,$T1,$T1 3595 vpsllq \$20,$T3,$T3 3596 vpaddq $D1hi,$D2lo,$D2lo 3597 3598 vpsrlq \$42,$D2lo,$tmp 3599 vpsllq \$10,$D2hi,$D2hi 3600 vpandq $mask42,$D2lo,$H2 3601 vpaddq $tmp,$D2hi,$D2hi 3602 3603 vpaddq $T2,$H2,$H2 # accumulate input 3604 vpaddq $D2hi,$H0,$H0 3605 vpsllq 
\$2,$D2hi,$D2hi 3606 3607 vpaddq $D2hi,$H0,$H0 3608 vporq $T3,$T1,$T1 3609 vpandq $mask44,$T1,$T1 3610 3611 vpsrlq \$44,$H0,$tmp # additional step 3612 vpandq $mask44,$H0,$H0 3613 3614 vpaddq $tmp,$H1,$H1 3615 3616 sub \$8,$len # len-=128 3617 jnz .Loop_vpmadd52_8x 3618 3619.Ltail_vpmadd52_8x: 3620 #vpaddq $T2,$H2,$H2 # accumulate input 3621 vpaddq $T0,$H0,$H0 3622 vpaddq $T1,$H1,$H1 3623 3624 vpxorq $D0lo,$D0lo,$D0lo 3625 vpmadd52luq $H2,$SS1,$D0lo 3626 vpxorq $D0hi,$D0hi,$D0hi 3627 vpmadd52huq $H2,$SS1,$D0hi 3628 vpxorq $D1lo,$D1lo,$D1lo 3629 vpmadd52luq $H2,$SS2,$D1lo 3630 vpxorq $D1hi,$D1hi,$D1hi 3631 vpmadd52huq $H2,$SS2,$D1hi 3632 vpxorq $D2lo,$D2lo,$D2lo 3633 vpmadd52luq $H2,$RR0,$D2lo 3634 vpxorq $D2hi,$D2hi,$D2hi 3635 vpmadd52huq $H2,$RR0,$D2hi 3636 3637 vpmadd52luq $H0,$RR0,$D0lo 3638 vpmadd52huq $H0,$RR0,$D0hi 3639 vpmadd52luq $H0,$RR1,$D1lo 3640 vpmadd52huq $H0,$RR1,$D1hi 3641 vpmadd52luq $H0,$RR2,$D2lo 3642 vpmadd52huq $H0,$RR2,$D2hi 3643 3644 vpmadd52luq $H1,$SS2,$D0lo 3645 vpmadd52huq $H1,$SS2,$D0hi 3646 vpmadd52luq $H1,$RR0,$D1lo 3647 vpmadd52huq $H1,$RR0,$D1hi 3648 vpmadd52luq $H1,$RR1,$D2lo 3649 vpmadd52huq $H1,$RR1,$D2hi 3650 3651 ################################################################ 3652 # horizontal addition 3653 3654 mov \$1,%eax 3655 kmovw %eax,%k1 3656 vpsrldq \$8,$D0lo,$T0 3657 vpsrldq \$8,$D0hi,$H0 3658 vpsrldq \$8,$D1lo,$T1 3659 vpsrldq \$8,$D1hi,$H1 3660 vpaddq $T0,$D0lo,$D0lo 3661 vpaddq $H0,$D0hi,$D0hi 3662 vpsrldq \$8,$D2lo,$T2 3663 vpsrldq \$8,$D2hi,$H2 3664 vpaddq $T1,$D1lo,$D1lo 3665 vpaddq $H1,$D1hi,$D1hi 3666 vpermq \$0x2,$D0lo,$T0 3667 vpermq \$0x2,$D0hi,$H0 3668 vpaddq $T2,$D2lo,$D2lo 3669 vpaddq $H2,$D2hi,$D2hi 3670 3671 vpermq \$0x2,$D1lo,$T1 3672 vpermq \$0x2,$D1hi,$H1 3673 vpaddq $T0,$D0lo,$D0lo 3674 vpaddq $H0,$D0hi,$D0hi 3675 vpermq \$0x2,$D2lo,$T2 3676 vpermq \$0x2,$D2hi,$H2 3677 vpaddq $T1,$D1lo,$D1lo 3678 vpaddq $H1,$D1hi,$D1hi 3679 vextracti64x4 \$1,$D0lo,%y#$T0 3680 vextracti64x4 \$1,$D0hi,%y#$H0 3681 vpaddq $T2,$D2lo,$D2lo 3682 vpaddq $H2,$D2hi,$D2hi 3683 3684 vextracti64x4 \$1,$D1lo,%y#$T1 3685 vextracti64x4 \$1,$D1hi,%y#$H1 3686 vextracti64x4 \$1,$D2lo,%y#$T2 3687 vextracti64x4 \$1,$D2hi,%y#$H2 3688___ 3689######## switch back to %ymm 3690map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3691map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3692map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3693 3694$code.=<<___; 3695 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3696 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3697 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3698 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3699 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3700 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3701 3702 ################################################################ 3703 # partial reduction 3704 vpsrlq \$44,$D0lo,$tmp 3705 vpsllq \$8,$D0hi,$D0hi 3706 vpandq $mask44,$D0lo,$H0 3707 vpaddq $tmp,$D0hi,$D0hi 3708 3709 vpaddq $D0hi,$D1lo,$D1lo 3710 3711 vpsrlq \$44,$D1lo,$tmp 3712 vpsllq \$8,$D1hi,$D1hi 3713 vpandq $mask44,$D1lo,$H1 3714 vpaddq $tmp,$D1hi,$D1hi 3715 3716 vpaddq $D1hi,$D2lo,$D2lo 3717 3718 vpsrlq \$42,$D2lo,$tmp 3719 vpsllq \$10,$D2hi,$D2hi 3720 vpandq $mask42,$D2lo,$H2 3721 vpaddq $tmp,$D2hi,$D2hi 3722 3723 vpaddq $D2hi,$H0,$H0 3724 vpsllq \$2,$D2hi,$D2hi 3725 3726 vpaddq $D2hi,$H0,$H0 3727 3728 vpsrlq \$44,$H0,$tmp # additional step 3729 vpandq $mask44,$H0,$H0 3730 3731 vpaddq $tmp,$H1,$H1 3732 3733 ################################################################ 3734 3735 vmovq %x#$H0,0($ctx) 3736 vmovq %x#$H1,8($ctx) 3737 vmovq %x#$H2,16($ctx) 3738 vzeroall 
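	# (the hash is left as three base 2^44 limbs at 0/8/16($ctx);
	# poly1305_emit_base2_44 converts them back to base 2^64, performs
	# the final comparison against 2^130-5 and adds the nonce)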
3739 3740.Lno_data_vpmadd52_8x: 3741 ret 3742.cfi_endproc 3743.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3744___ 3745} 3746$code.=<<___; 3747.type poly1305_emit_base2_44,\@function,3 3748.align 32 3749poly1305_emit_base2_44: 3750.cfi_startproc 3751 endbranch 3752 mov 0($ctx),%r8 # load hash value 3753 mov 8($ctx),%r9 3754 mov 16($ctx),%r10 3755 3756 mov %r9,%rax 3757 shr \$20,%r9 3758 shl \$44,%rax 3759 mov %r10,%rcx 3760 shr \$40,%r10 3761 shl \$24,%rcx 3762 3763 add %rax,%r8 3764 adc %rcx,%r9 3765 adc \$0,%r10 3766 3767 mov %r8,%rax 3768 add \$5,%r8 # compare to modulus 3769 mov %r9,%rcx 3770 adc \$0,%r9 3771 adc \$0,%r10 3772 shr \$2,%r10 # did 130-bit value overflow? 3773 cmovnz %r8,%rax 3774 cmovnz %r9,%rcx 3775 3776 add 0($nonce),%rax # accumulate nonce 3777 adc 8($nonce),%rcx 3778 mov %rax,0($mac) # write result 3779 mov %rcx,8($mac) 3780 3781 ret 3782.cfi_endproc 3783.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3784___ 3785} } } 3786$code.=<<___; 3787.section .rodata align=64 3788.align 64 3789.Lconst: 3790.Lmask24: 3791.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 3792.L129: 3793.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 3794.Lmask26: 3795.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 3796.Lpermd_avx2: 3797.long 2,2,2,3,2,0,2,1 3798.Lpermd_avx512: 3799.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 3800 3801.L2_44_inp_permd: 3802.long 0,1,1,2,2,3,7,7 3803.L2_44_inp_shift: 3804.quad 0,12,24,64 3805.L2_44_mask: 3806.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff 3807.L2_44_shift_rgt: 3808.quad 44,44,42,64 3809.L2_44_shift_lft: 3810.quad 8,8,10,64 3811 3812.align 64 3813.Lx_mask44: 3814.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3815.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff 3816.Lx_mask42: 3817.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3818.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff 3819.previous 3820___ 3821} 3822$code.=<<___; 3823.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 3824.align 16 3825___ 3826 3827{ # chacha20-poly1305 helpers 3828my ($out,$inp,$otp,$len)=$win64 ? 
("%rcx","%rdx","%r8", "%r9") : # Win64 order 3829 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3830$code.=<<___; 3831.globl xor128_encrypt_n_pad 3832.type xor128_encrypt_n_pad,\@abi-omnipotent 3833.align 16 3834xor128_encrypt_n_pad: 3835.cfi_startproc 3836 sub $otp,$inp 3837 sub $otp,$out 3838 mov $len,%r10 # put len aside 3839 shr \$4,$len # len / 16 3840 jz .Ltail_enc 3841 nop 3842.Loop_enc_xmm: 3843 movdqu ($inp,$otp),%xmm0 3844 pxor ($otp),%xmm0 3845 movdqu %xmm0,($out,$otp) 3846 movdqa %xmm0,($otp) 3847 lea 16($otp),$otp 3848 dec $len 3849 jnz .Loop_enc_xmm 3850 3851 and \$15,%r10 # len % 16 3852 jz .Ldone_enc 3853 3854.Ltail_enc: 3855 mov \$16,$len 3856 sub %r10,$len 3857 xor %eax,%eax 3858.Loop_enc_byte: 3859 mov ($inp,$otp),%al 3860 xor ($otp),%al 3861 mov %al,($out,$otp) 3862 mov %al,($otp) 3863 lea 1($otp),$otp 3864 dec %r10 3865 jnz .Loop_enc_byte 3866 3867 xor %eax,%eax 3868.Loop_enc_pad: 3869 mov %al,($otp) 3870 lea 1($otp),$otp 3871 dec $len 3872 jnz .Loop_enc_pad 3873 3874.Ldone_enc: 3875 mov $otp,%rax 3876 ret 3877.cfi_endproc 3878.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3879 3880.globl xor128_decrypt_n_pad 3881.type xor128_decrypt_n_pad,\@abi-omnipotent 3882.align 16 3883xor128_decrypt_n_pad: 3884.cfi_startproc 3885 sub $otp,$inp 3886 sub $otp,$out 3887 mov $len,%r10 # put len aside 3888 shr \$4,$len # len / 16 3889 jz .Ltail_dec 3890 nop 3891.Loop_dec_xmm: 3892 movdqu ($inp,$otp),%xmm0 3893 movdqa ($otp),%xmm1 3894 pxor %xmm0,%xmm1 3895 movdqu %xmm1,($out,$otp) 3896 movdqa %xmm0,($otp) 3897 lea 16($otp),$otp 3898 dec $len 3899 jnz .Loop_dec_xmm 3900 3901 pxor %xmm1,%xmm1 3902 and \$15,%r10 # len % 16 3903 jz .Ldone_dec 3904 3905.Ltail_dec: 3906 mov \$16,$len 3907 sub %r10,$len 3908 xor %eax,%eax 3909 xor %r11,%r11 3910.Loop_dec_byte: 3911 mov ($inp,$otp),%r11b 3912 mov ($otp),%al 3913 xor %r11b,%al 3914 mov %al,($out,$otp) 3915 mov %r11b,($otp) 3916 lea 1($otp),$otp 3917 dec %r10 3918 jnz .Loop_dec_byte 3919 3920 xor %eax,%eax 3921.Loop_dec_pad: 3922 mov %al,($otp) 3923 lea 1($otp),$otp 3924 dec $len 3925 jnz .Loop_dec_pad 3926 3927.Ldone_dec: 3928 mov $otp,%rax 3929 ret 3930.cfi_endproc 3931.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3932___ 3933} 3934 3935# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3936# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3937if ($win64) { 3938$rec="%rcx"; 3939$frame="%rdx"; 3940$context="%r8"; 3941$disp="%r9"; 3942 3943$code.=<<___; 3944.extern __imp_RtlVirtualUnwind 3945.type se_handler,\@abi-omnipotent 3946.align 16 3947se_handler: 3948 push %rsi 3949 push %rdi 3950 push %rbx 3951 push %rbp 3952 push %r12 3953 push %r13 3954 push %r14 3955 push %r15 3956 pushfq 3957 sub \$64,%rsp 3958 3959 mov 120($context),%rax # pull context->Rax 3960 mov 248($context),%rbx # pull context->Rip 3961 3962 mov 8($disp),%rsi # disp->ImageBase 3963 mov 56($disp),%r11 # disp->HandlerData 3964 3965 mov 0(%r11),%r10d # HandlerData[0] 3966 lea (%rsi,%r10),%r10 # prologue label 3967 cmp %r10,%rbx # context->Rip<.Lprologue 3968 jb .Lcommon_seh_tail 3969 3970 mov 152($context),%rax # pull context->Rsp 3971 3972 mov 4(%r11),%r10d # HandlerData[1] 3973 lea (%rsi,%r10),%r10 # epilogue label 3974 cmp %r10,%rbx # context->Rip>=.Lepilogue 3975 jae .Lcommon_seh_tail 3976 3977 lea 48(%rax),%rax 3978 3979 mov -8(%rax),%rbx 3980 mov -16(%rax),%rbp 3981 mov -24(%rax),%r12 3982 mov -32(%rax),%r13 3983 mov -40(%rax),%r14 3984 mov -48(%rax),%r15 3985 mov %rbx,144($context) # restore context->Rbx 3986 mov %rbp,160($context) # restore 
context->Rbp 3987 mov %r12,216($context) # restore context->R12 3988 mov %r13,224($context) # restore context->R13 3989 mov %r14,232($context) # restore context->R14 3990 mov %r15,240($context) # restore context->R14 3991 3992 jmp .Lcommon_seh_tail 3993.size se_handler,.-se_handler 3994 3995.type avx_handler,\@abi-omnipotent 3996.align 16 3997avx_handler: 3998 push %rsi 3999 push %rdi 4000 push %rbx 4001 push %rbp 4002 push %r12 4003 push %r13 4004 push %r14 4005 push %r15 4006 pushfq 4007 sub \$64,%rsp 4008 4009 mov 120($context),%rax # pull context->Rax 4010 mov 248($context),%rbx # pull context->Rip 4011 4012 mov 8($disp),%rsi # disp->ImageBase 4013 mov 56($disp),%r11 # disp->HandlerData 4014 4015 mov 0(%r11),%r10d # HandlerData[0] 4016 lea (%rsi,%r10),%r10 # prologue label 4017 cmp %r10,%rbx # context->Rip<prologue label 4018 jb .Lcommon_seh_tail 4019 4020 mov 152($context),%rax # pull context->Rsp 4021 4022 mov 4(%r11),%r10d # HandlerData[1] 4023 lea (%rsi,%r10),%r10 # epilogue label 4024 cmp %r10,%rbx # context->Rip>=epilogue label 4025 jae .Lcommon_seh_tail 4026 4027 mov 208($context),%rax # pull context->R11 4028 4029 lea 0x50(%rax),%rsi 4030 lea 0xf8(%rax),%rax 4031 lea 512($context),%rdi # &context.Xmm6 4032 mov \$20,%ecx 4033 .long 0xa548f3fc # cld; rep movsq 4034 4035.Lcommon_seh_tail: 4036 mov 8(%rax),%rdi 4037 mov 16(%rax),%rsi 4038 mov %rax,152($context) # restore context->Rsp 4039 mov %rsi,168($context) # restore context->Rsi 4040 mov %rdi,176($context) # restore context->Rdi 4041 4042 mov 40($disp),%rdi # disp->ContextRecord 4043 mov $context,%rsi # context 4044 mov \$154,%ecx # sizeof(CONTEXT) 4045 .long 0xa548f3fc # cld; rep movsq 4046 4047 mov $disp,%rsi 4048 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4049 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4050 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4051 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4052 mov 40(%rsi),%r10 # disp->ContextRecord 4053 lea 56(%rsi),%r11 # &disp->HandlerData 4054 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4055 mov %r10,32(%rsp) # arg5 4056 mov %r11,40(%rsp) # arg6 4057 mov %r12,48(%rsp) # arg7 4058 mov %rcx,56(%rsp) # arg8, (NULL) 4059 call *__imp_RtlVirtualUnwind(%rip) 4060 4061 mov \$1,%eax # ExceptionContinueSearch 4062 add \$64,%rsp 4063 popfq 4064 pop %r15 4065 pop %r14 4066 pop %r13 4067 pop %r12 4068 pop %rbp 4069 pop %rbx 4070 pop %rdi 4071 pop %rsi 4072 ret 4073.size avx_handler,.-avx_handler 4074 4075.section .pdata 4076.align 4 4077 .rva .LSEH_begin_poly1305_init 4078 .rva .LSEH_end_poly1305_init 4079 .rva .LSEH_info_poly1305_init 4080 4081 .rva .LSEH_begin_poly1305_blocks 4082 .rva .LSEH_end_poly1305_blocks 4083 .rva .LSEH_info_poly1305_blocks 4084 4085 .rva .LSEH_begin_poly1305_emit 4086 .rva .LSEH_end_poly1305_emit 4087 .rva .LSEH_info_poly1305_emit 4088___ 4089$code.=<<___ if ($avx); 4090 .rva .LSEH_begin_poly1305_blocks_avx 4091 .rva .Lbase2_64_avx 4092 .rva .LSEH_info_poly1305_blocks_avx_1 4093 4094 .rva .Lbase2_64_avx 4095 .rva .Leven_avx 4096 .rva .LSEH_info_poly1305_blocks_avx_2 4097 4098 .rva .Leven_avx 4099 .rva .LSEH_end_poly1305_blocks_avx 4100 .rva .LSEH_info_poly1305_blocks_avx_3 4101 4102 .rva .LSEH_begin_poly1305_emit_avx 4103 .rva .LSEH_end_poly1305_emit_avx 4104 .rva .LSEH_info_poly1305_emit_avx 4105___ 4106$code.=<<___ if ($avx>1); 4107 .rva .LSEH_begin_poly1305_blocks_avx2 4108 .rva .Lbase2_64_avx2 4109 .rva .LSEH_info_poly1305_blocks_avx2_1 4110 4111 .rva .Lbase2_64_avx2 4112 .rva .Leven_avx2 4113 .rva .LSEH_info_poly1305_blocks_avx2_2 4114 4115 .rva .Leven_avx2 4116 
.rva .LSEH_end_poly1305_blocks_avx2 4117 .rva .LSEH_info_poly1305_blocks_avx2_3 4118___ 4119$code.=<<___ if ($avx>2); 4120 .rva .LSEH_begin_poly1305_blocks_avx512 4121 .rva .LSEH_end_poly1305_blocks_avx512 4122 .rva .LSEH_info_poly1305_blocks_avx512 4123___ 4124$code.=<<___; 4125.section .xdata 4126.align 8 4127.LSEH_info_poly1305_init: 4128 .byte 9,0,0,0 4129 .rva se_handler 4130 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init 4131 4132.LSEH_info_poly1305_blocks: 4133 .byte 9,0,0,0 4134 .rva se_handler 4135 .rva .Lblocks_body,.Lblocks_epilogue 4136 4137.LSEH_info_poly1305_emit: 4138 .byte 9,0,0,0 4139 .rva se_handler 4140 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit 4141___ 4142$code.=<<___ if ($avx); 4143.LSEH_info_poly1305_blocks_avx_1: 4144 .byte 9,0,0,0 4145 .rva se_handler 4146 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 4147 4148.LSEH_info_poly1305_blocks_avx_2: 4149 .byte 9,0,0,0 4150 .rva se_handler 4151 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 4152 4153.LSEH_info_poly1305_blocks_avx_3: 4154 .byte 9,0,0,0 4155 .rva avx_handler 4156 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 4157 4158.LSEH_info_poly1305_emit_avx: 4159 .byte 9,0,0,0 4160 .rva se_handler 4161 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 4162___ 4163$code.=<<___ if ($avx>1); 4164.LSEH_info_poly1305_blocks_avx2_1: 4165 .byte 9,0,0,0 4166 .rva se_handler 4167 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 4168 4169.LSEH_info_poly1305_blocks_avx2_2: 4170 .byte 9,0,0,0 4171 .rva se_handler 4172 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 4173 4174.LSEH_info_poly1305_blocks_avx2_3: 4175 .byte 9,0,0,0 4176 .rva avx_handler 4177 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 4178___ 4179$code.=<<___ if ($avx>2); 4180.LSEH_info_poly1305_blocks_avx512: 4181 .byte 9,0,0,0 4182 .rva avx_handler 4183 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 4184___ 4185} 4186 4187foreach (split('\n',$code)) { 4188 s/\`([^\`]*)\`/eval($1)/ge; 4189 s/%r([a-z]+)#d/%e$1/g; 4190 s/%r([0-9]+)#d/%r$1d/g; 4191 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 4192 4193 print $_,"\n"; 4194} 4195close STDOUT or die "error closing STDOUT: $!"; 4196