#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer AES-NI procedures process several independent buffers
# in parallel by interleaving independent instructions.
#
# Cycles per byte for interleave factor 4:
#
#			asymptotic	measured
#			---------------------------
# Westmere		5.00/4=1.25	5.13/4=1.28
# Atom			15.0/4=3.75	?15.7/4=3.93
# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
# Haswell		4.44/4=1.11	4.44/4=1.11
# Bulldozer		5.75/4=1.44	5.76/4=1.44
#
# Cycles per byte for interleave factor 8 (not implemented for
# pre-AVX processors, where higher interleave factor incidentally
# doesn't result in improvement):
#
#			asymptotic	measured
#			---------------------------
# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
# Haswell		5.00/8=0.63	5.00/8=0.63
# Bulldozer		5.75/8=0.72	5.77/8=0.72
#
# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
#	suboptimally.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 targets additionally save/restore %xmm6 and up in the code below.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# pointer_size() and pointer_register() are not defined in this file;
# presumably they come from x86_64-support.pl and return the pointer width
# in bytes and a pointer-width register name for $flavour -- TODO confirm.
push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

# $avx ends up as 0, 1 or 2 depending on the assembler version that was
# detected.  Each probe below is tried only while $avx is still 0.  The
# code in this part of the file only tests whether $avx is non-zero.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# From here on everything written to STDOUT is piped through the
# translator, which is started with $flavour and $output as arguments.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void aesni_multi_cbc_encrypt (
#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
#     const AES_KEY *key,
#     int num);		/* 1 or 2 */
#
$inp="%rdi";	# 1st arg
$key="%rsi";	# 2nd arg
$num="%edx";

# Size of one descriptor: two pointers, the block count at offset
# 2*$ptr_size, and the 16-byte IV at offset 2*$ptr_size+8.
$inp_elm_size=2*$ptr_size+8+16;

# Register allocation for the 4-lane (non-AVX) code path.
@inptr=map("%r$_",(8..11));		# per-lane input pointers
@outptr=map("%r$_",(12..15));		# per-lane output pointers

($rndkey0,$rndkey1)=("%xmm0","%xmm1");	# alternating round-key registers
@out=map("%xmm$_",(2..5));		# per-lane cipher state
@inp=map("%xmm$_",(6..9));		# per-lane next input block / IV
($counters,$mask,$zero)=map("%xmm$_",(10..12));

($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,\@function,3
.align	32
aesni_multi_cbc_encrypt:
.cfi_startproc
___
# When AVX code is being generated, calls with num >= 2 are diverted to the
# AVX entry point if the AVX capability bit is set at run time.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Lenc_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
___
# Prologue: keep the caller's %rsp in %rax and save %rbx, %rbp, %r12-%r15.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 in the 0xa8 bytes below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Lenc4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	$inp_elm_size*2($inp),$inp

.Lenc4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup: load the block count, input pointer, output pointer and IV
# from descriptor $i, keep the largest block count in $num, store the count
# in the counters area, and point the input at %rsp when the count is <= 0.
for($i=0;$i<4;$i++) {
    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
    $code.=<<___;
	# borrow $one for number of blocks
	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
	cmp	$num,$one
	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
	cmovg	$one,$num			# find maximum
	test	$one,$one
	# load IV
	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Whiten IV and first input block with the 0-round key, then enter the main
# loop.  $sink is set to 16(%rsp)-$offset each iteration, so a cancelled
# lane's (ptr,$offset) load hits the input sink and its -16(ptr,$offset)
# store hits the output sink.  The first round prefetches the upcoming input
# of all four lanes.
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	 pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	 pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	 pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	 pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	 pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	 pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	 pxor	@inp[2],@out[2]
	 pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Four more rounds.  Round $i also compares lane $i's counter with 1 and
# redirects that lane's pointers to the sink once it has run out of blocks.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
$code.=<<___;
	 cmp		`32+4*$i`(%rsp),$one
	aesenc		$rndkey,@out[0]
	aesenc		$rndkey,@out[1]
	aesenc		$rndkey,@out[2]
	 cmovge		$sink,@inptr[$i]	# cancel input
	 cmovg		$sink,@outptr[$i]	# sink output
	aesenc		$rndkey,@out[3]
	movups		`0x40+16*$i-0x78`($key),$rndkey
___
}
# Remaining rounds.  The comparison of $rounds with 11 selects how many
# extra round pairs run before the tail (none if below, one if equal, two
# otherwise).  The tail finishes the block, stores the ciphertext and xors
# it into the next whitened input block to chain CBC.
$code.=<<___;
	 movdqa		$counters,$mask
	aesenc		$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesenc		$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0x80-0x78($key),$rndkey0
	 pxor		$zero,$zero

	aesenc		$rndkey1,@out[0]
	 pcmpgtd	$zero,$mask
	 movdqu		-0x78($key),$zero	# reload 0-round key
	aesenc		$rndkey1,@out[1]
	 paddd		$mask,$counters		# decrement counters
	 movdqa		$counters,32(%rsp)	# update counters
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xa0-0x78($key),$rndkey0

	jb	.Lenc4x_tail

	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0xb0-0x78($key),$rndkey1

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xc0-0x78($key),$rndkey0

	je	.Lenc4x_tail

	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	movups		0xd0-0x78($key),$rndkey1

	aesenc		$rndkey0,@out[0]
	aesenc		$rndkey0,@out[1]
	aesenc		$rndkey0,@out[2]
	aesenc		$rndkey0,@out[3]
	movups		0xe0-0x78($key),$rndkey0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	aesenc		$rndkey1,@out[0]
	aesenc		$rndkey1,@out[1]
	aesenc		$rndkey1,@out[2]
	aesenc		$rndkey1,@out[3]
	 movdqu		(@inptr[0],$offset),@inp[0]
	movdqu		0x10-0x78($key),$rndkey1

	aesenclast	$rndkey0,@out[0]
	 movdqu		(@inptr[1],$offset),@inp[1]
	 pxor		$zero,@inp[0]
	aesenclast	$rndkey0,@out[1]
	 movdqu		(@inptr[2],$offset),@inp[2]
	 pxor		$zero,@inp[1]
	aesenclast	$rndkey0,@out[2]
	 movdqu		(@inptr[3],$offset),@inp[3]
	 pxor		$zero,@inp[2]
	aesenclast	$rndkey0,@out[3]
	movdqu		0x20-0x78($key),$rndkey0
	 pxor		$zero,@inp[3]

	movups		@out[0],-16(@outptr[0],$offset)
	 pxor		@inp[0],@out[0]
	movups		@out[1],-16(@outptr[1],$offset)
	 pxor		@inp[1],@out[1]
	movups		@out[2],-16(@outptr[2],$offset)
	 pxor		@inp[2],@out[2]
	movups		@out[3],-16(@outptr[3],$offset)
	 pxor		@inp[3],@out[3]

	dec	$num
	jnz	.Loop_enc4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	#pxor	@inp[0],@out[0]
	#pxor	@inp[1],@out[1]
	# output iv FIX ME!
	#movdqu	@out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
	#pxor	@inp[2],@out[2]
	#movdqu	@out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
	#pxor	@inp[3],@out[3]
	#movdqu	@out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp)	# won't fix, let caller
	#movdqu	@out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp)	# figure this out...

	lea	`$inp_elm_size*4`($inp),$inp
	dec	$num
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
___
# Win64 only: restore the %xmm registers this code path actually used.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
# Encrypt epilogue, followed by the entry point of the 4-lane decrypt.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,\@function,3
.align	32
aesni_multi_cbc_decrypt:
.cfi_startproc
___
# Same run-time AVX dispatch as for encrypt.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Ldec_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Ldec4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	$inp_elm_size*2($inp),$inp

.Ldec4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane setup, as for encrypt, except that the IV is loaded into @inp[$i]
# because @out[$i] receives the ciphertext block to be decrypted.
for($i=0;$i<4;$i++) {
    $inptr_reg=&pointer_register($flavour,@inptr[$i]);
    $outptr_reg=&pointer_register($flavour,@outptr[$i]);
    $code.=<<___;
	# borrow $one for number of blocks
	mov	`$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
	mov	`$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
	cmp	$num,$one
	mov	`$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
	cmovg	$one,$num			# find maximum
	test	$one,$one
	# load IV
	movdqu	`$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# Load and whiten the first ciphertext block of each lane, then run the
# first round of the main decrypt loop while prefetching upcoming input.
$code.=<<___;
	test	$num,$num
	jz	.Ldec4x_done

	movups	0x10-0x78($key),$rndkey1
	movups	0x20-0x78($key),$rndkey0
	mov	0xf0-0x78($key),$rounds
	movdqu	(@inptr[0]),@out[0]		# load inputs
	movdqu	(@inptr[1]),@out[1]
	 pxor	$zero,@out[0]
	movdqu	(@inptr[2]),@out[2]
	 pxor	$zero,@out[1]
	movdqu	(@inptr[3]),@out[3]
	 pxor	$zero,@out[2]
	 pxor	$zero,@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesdec		$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesdec		$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0x30-0x78($key),$rndkey1
___
# Second part of the 4-lane decrypt loop.  Each of the next four rounds is
# emitted once per lane index: besides the aesdec on all four states, it
# checks that lane's remaining-block counter against 1 and, when the lane is
# exhausted, swaps its input/output pointers for the sink.  The two round-key
# registers are used alternately and refilled right after use.
my @dec4x_key_regs = ($rndkey0, $rndkey1);
foreach my $lane (0 .. 3) {
	my $rndkey = $dec4x_key_regs[$lane & 1];
	$code .= <<___;
	 cmp		`32+4*$lane`(%rsp),$one
	aesdec		$rndkey,@out[0]
	aesdec		$rndkey,@out[1]
	aesdec		$rndkey,@out[2]
	 cmovge		$sink,@inptr[$lane]	# cancel input
	 cmovg		$sink,@outptr[$lane]	# sink output
	aesdec		$rndkey,@out[3]
	movups		`0x40+16*$lane-0x78`($key),$rndkey
___
}

# Remaining rounds, with the counters decremented on the side.  The number
# of extra round pairs before the tail depends on how $rounds compares with
# 11.  The tail folds the last round key into the saved IV / previous
# ciphertext so that aesdeclast also performs the CBC xor, stores the
# plaintext, and loads and whitens the next ciphertext block of each lane.
$code .= <<___;
	 movdqa		$counters,$mask
	aesdec		$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesdec		$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0x80-0x78($key),$rndkey0
	 pxor		$zero,$zero

	aesdec		$rndkey1,@out[0]
	 pcmpgtd	$zero,$mask
	 movdqu		-0x78($key),$zero	# reload 0-round key
	aesdec		$rndkey1,@out[1]
	 paddd		$mask,$counters		# decrement counters
	 movdqa		$counters,32(%rsp)	# update counters
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xa0-0x78($key),$rndkey0

	jb	.Ldec4x_tail

	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0xb0-0x78($key),$rndkey1

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xc0-0x78($key),$rndkey0

	je	.Ldec4x_tail

	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	aesdec		$rndkey1,@out[3]
	movups		0xd0-0x78($key),$rndkey1

	aesdec		$rndkey0,@out[0]
	aesdec		$rndkey0,@out[1]
	aesdec		$rndkey0,@out[2]
	aesdec		$rndkey0,@out[3]
	movups		0xe0-0x78($key),$rndkey0
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
	aesdec		$rndkey1,@out[0]
	aesdec		$rndkey1,@out[1]
	aesdec		$rndkey1,@out[2]
	 pxor		$rndkey0,@inp[0]
	 pxor		$rndkey0,@inp[1]
	aesdec		$rndkey1,@out[3]
	movdqu		0x10-0x78($key),$rndkey1
	 pxor		$rndkey0,@inp[2]
	 pxor		$rndkey0,@inp[3]
	movdqu		0x20-0x78($key),$rndkey0

	aesdeclast	@inp[0],@out[0]
	aesdeclast	@inp[1],@out[1]
	 movdqu		-16(@inptr[0],$offset),@inp[0]	# load next IV
	 movdqu		-16(@inptr[1],$offset),@inp[1]
	aesdeclast	@inp[2],@out[2]
	aesdeclast	@inp[3],@out[3]
	 movdqu		-16(@inptr[2],$offset),@inp[2]
	 movdqu		-16(@inptr[3],$offset),@inp[3]

	movups		@out[0],-16(@outptr[0],$offset)
	 movdqu		(@inptr[0],$offset),@out[0]
	movups		@out[1],-16(@outptr[1],$offset)
	 movdqu		(@inptr[1],$offset),@out[1]
	 pxor		$zero,@out[0]
	movups		@out[2],-16(@outptr[2],$offset)
	 movdqu		(@inptr[2],$offset),@out[2]
	 pxor		$zero,@out[1]
	movups		@out[3],-16(@outptr[3],$offset)
	 movdqu		(@inptr[3],$offset),@out[3]
	 pxor		$zero,@out[2]
	 pxor		$zero,@out[3]

	dec	$num
	jnz	.Loop_dec4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	lea	`$inp_elm_size*4`($inp),$inp
	dec	$num
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
___

# On Win64 put back the non-volatile %xmm registers the 4-lane path touched;
# the %xmm13-%xmm15 reloads stay commented out in the emitted text.
if ($win64) {
	$code .= <<___;
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
}

# Common epilogue: reload the callee-saved general-purpose registers relative
# to the saved stack pointer in %rax, restore %rsp and return.
$code .= <<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
___

668 if ($avx) {{{ 669my @ptr=map("%r$_",(8..15)); 670my $offload=$sink; 671 672my @out=map("%xmm$_",(2..9)); 673my @inp=map("%xmm$_",(10..13)); 674my ($counters,$zero)=("%xmm14","%xmm15"); 675 676$code.=<<___; 677.type aesni_multi_cbc_encrypt_avx,\@function,3 678.align 32 679aesni_multi_cbc_encrypt_avx: 680.cfi_startproc 681_avx_cbc_enc_shortcut: 682 mov %rsp,%rax 683.cfi_def_cfa_register %rax 684 push %rbx 685.cfi_push %rbx 686 push %rbp 687.cfi_push %rbp 688 push %r12 689.cfi_push %r12 690 push %r13 691.cfi_push %r13 692 push %r14 693.cfi_push %r14 694 push %r15 695.cfi_push %r15 696___ 697$code.=<<___ if ($win64); 698 lea -0xa8(%rsp),%rsp 699 movaps %xmm6,(%rsp) 700 movaps %xmm7,0x10(%rsp) 701 movaps %xmm8,0x20(%rsp) 702 movaps %xmm9,0x30(%rsp) 703 movaps %xmm10,0x40(%rsp) 704 movaps %xmm11,0x50(%rsp) 705 movaps %xmm12,-0x78(%rax) 706 movaps %xmm13,-0x68(%rax) 707 movaps %xmm14,-0x58(%rax) 708 movaps %xmm15,-0x48(%rax) 709___ 710$code.=<<___; 711 # stack layout 712 # 713 # +0 output sink 714 # +16 input sink [original %rsp and $num] 715 # +32 counters 716 # +64 distances between inputs and outputs 717 # +128 off-load area for @inp[0..3] 718 719 sub \$192,%rsp 720 and \$-128,%rsp 721 mov %rax,16(%rsp) # original %rsp 722.cfi_cfa_expression %rsp+16,deref,+8 723 724.Lenc8x_body: 725 vzeroupper 726 vmovdqu ($key),$zero # 0-round key 727 lea 0x78($key),$key # size optimization 728 lea `$inp_elm_size*4`($inp),$inp 729 shr \$1,$num 730 731.Lenc8x_loop_grande: 732 #mov $num,24(%rsp) # original $num 733 xor $num,$num 734___ 735for($i=0;$i<8;$i++) { 736 my $temp = $i ? 
$offload : $offset; 737 $ptr_reg=&pointer_register($flavour,@ptr[$i]); 738 $temp_reg=&pointer_register($flavour,$temp); 739 $code.=<<___; 740 # borrow $one for number of blocks 741 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one 742 # input pointer 743 mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg 744 cmp $num,$one 745 # output pointer 746 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg 747 cmovg $one,$num # find maximum 748 test $one,$one 749 # load IV 750 vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] 751 mov $one,`32+4*$i`(%rsp) # initialize counters 752 cmovle %rsp,@ptr[$i] # cancel input 753 sub @ptr[$i],$temp # distance between input and output 754 mov $temp,`64+8*$i`(%rsp) # initialize distances 755___ 756} 757$code.=<<___; 758 test $num,$num 759 jz .Lenc8x_done 760 761 vmovups 0x10-0x78($key),$rndkey1 762 vmovups 0x20-0x78($key),$rndkey0 763 mov 0xf0-0x78($key),$rounds 764 765 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round 766 lea 128(%rsp),$offload # offload area 767 vpxor (@ptr[1]),$zero,@inp[1] 768 vpxor (@ptr[2]),$zero,@inp[2] 769 vpxor (@ptr[3]),$zero,@inp[3] 770 vpxor @inp[0],@out[0],@out[0] 771 vpxor (@ptr[4]),$zero,@inp[0] 772 vpxor @inp[1],@out[1],@out[1] 773 vpxor (@ptr[5]),$zero,@inp[1] 774 vpxor @inp[2],@out[2],@out[2] 775 vpxor (@ptr[6]),$zero,@inp[2] 776 vpxor @inp[3],@out[3],@out[3] 777 vpxor (@ptr[7]),$zero,@inp[3] 778 vpxor @inp[0],@out[4],@out[4] 779 mov \$1,$one # constant of 1 780 vpxor @inp[1],@out[5],@out[5] 781 vpxor @inp[2],@out[6],@out[6] 782 vpxor @inp[3],@out[7],@out[7] 783 jmp .Loop_enc8x 784 785.align 32 786.Loop_enc8x: 787___ 788for($i=0;$i<8;$i++) { 789my $rndkey=($i&1)?$rndkey0:$rndkey1; 790$code.=<<___; 791 vaesenc $rndkey,@out[0],@out[0] 792 cmp 32+4*$i(%rsp),$one 793___ 794$code.=<<___ if ($i); 795 mov 64+8*$i(%rsp),$offset 796___ 797$code.=<<___; 798 vaesenc $rndkey,@out[1],@out[1] 799 prefetcht0 31(@ptr[$i]) # prefetch input 800 vaesenc 
$rndkey,@out[2],@out[2] 801___ 802$code.=<<___ if ($i>1); 803 prefetcht0 15(@ptr[$i-2]) # prefetch output 804___ 805$code.=<<___; 806 vaesenc $rndkey,@out[3],@out[3] 807 lea (@ptr[$i],$offset),$offset 808 cmovge %rsp,@ptr[$i] # cancel input 809 vaesenc $rndkey,@out[4],@out[4] 810 cmovg %rsp,$offset # sink output 811 vaesenc $rndkey,@out[5],@out[5] 812 sub @ptr[$i],$offset 813 vaesenc $rndkey,@out[6],@out[6] 814 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round 815 mov $offset,64+8*$i(%rsp) 816 vaesenc $rndkey,@out[7],@out[7] 817 vmovups `16*(3+$i)-0x78`($key),$rndkey 818 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output 819___ 820$code.=<<___ if ($i<4) 821 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load 822___ 823} 824$code.=<<___; 825 vmovdqu 32(%rsp),$counters 826 prefetcht0 15(@ptr[$i-2]) # prefetch output 827 prefetcht0 15(@ptr[$i-1]) 828 cmp \$11,$rounds 829 jb .Lenc8x_tail 830 831 vaesenc $rndkey1,@out[0],@out[0] 832 vaesenc $rndkey1,@out[1],@out[1] 833 vaesenc $rndkey1,@out[2],@out[2] 834 vaesenc $rndkey1,@out[3],@out[3] 835 vaesenc $rndkey1,@out[4],@out[4] 836 vaesenc $rndkey1,@out[5],@out[5] 837 vaesenc $rndkey1,@out[6],@out[6] 838 vaesenc $rndkey1,@out[7],@out[7] 839 vmovups 0xb0-0x78($key),$rndkey1 840 841 vaesenc $rndkey0,@out[0],@out[0] 842 vaesenc $rndkey0,@out[1],@out[1] 843 vaesenc $rndkey0,@out[2],@out[2] 844 vaesenc $rndkey0,@out[3],@out[3] 845 vaesenc $rndkey0,@out[4],@out[4] 846 vaesenc $rndkey0,@out[5],@out[5] 847 vaesenc $rndkey0,@out[6],@out[6] 848 vaesenc $rndkey0,@out[7],@out[7] 849 vmovups 0xc0-0x78($key),$rndkey0 850 je .Lenc8x_tail 851 852 vaesenc $rndkey1,@out[0],@out[0] 853 vaesenc $rndkey1,@out[1],@out[1] 854 vaesenc $rndkey1,@out[2],@out[2] 855 vaesenc $rndkey1,@out[3],@out[3] 856 vaesenc $rndkey1,@out[4],@out[4] 857 vaesenc $rndkey1,@out[5],@out[5] 858 vaesenc $rndkey1,@out[6],@out[6] 859 vaesenc $rndkey1,@out[7],@out[7] 860 vmovups 0xd0-0x78($key),$rndkey1 861 862 vaesenc $rndkey0,@out[0],@out[0] 863 vaesenc 
$rndkey0,@out[1],@out[1] 864 vaesenc $rndkey0,@out[2],@out[2] 865 vaesenc $rndkey0,@out[3],@out[3] 866 vaesenc $rndkey0,@out[4],@out[4] 867 vaesenc $rndkey0,@out[5],@out[5] 868 vaesenc $rndkey0,@out[6],@out[6] 869 vaesenc $rndkey0,@out[7],@out[7] 870 vmovups 0xe0-0x78($key),$rndkey0 871 872.Lenc8x_tail: 873 vaesenc $rndkey1,@out[0],@out[0] 874 vpxor $zero,$zero,$zero 875 vaesenc $rndkey1,@out[1],@out[1] 876 vaesenc $rndkey1,@out[2],@out[2] 877 vpcmpgtd $zero,$counters,$zero 878 vaesenc $rndkey1,@out[3],@out[3] 879 vaesenc $rndkey1,@out[4],@out[4] 880 vpaddd $counters,$zero,$zero # decrement counters 881 vmovdqu 48(%rsp),$counters 882 vaesenc $rndkey1,@out[5],@out[5] 883 mov 64(%rsp),$offset # pre-load 1st offset 884 vaesenc $rndkey1,@out[6],@out[6] 885 vaesenc $rndkey1,@out[7],@out[7] 886 vmovups 0x10-0x78($key),$rndkey1 887 888 vaesenclast $rndkey0,@out[0],@out[0] 889 vmovdqa $zero,32(%rsp) # update counters 890 vpxor $zero,$zero,$zero 891 vaesenclast $rndkey0,@out[1],@out[1] 892 vaesenclast $rndkey0,@out[2],@out[2] 893 vpcmpgtd $zero,$counters,$zero 894 vaesenclast $rndkey0,@out[3],@out[3] 895 vaesenclast $rndkey0,@out[4],@out[4] 896 vpaddd $zero,$counters,$counters # decrement counters 897 vmovdqu -0x78($key),$zero # 0-round 898 vaesenclast $rndkey0,@out[5],@out[5] 899 vaesenclast $rndkey0,@out[6],@out[6] 900 vmovdqa $counters,48(%rsp) # update counters 901 vaesenclast $rndkey0,@out[7],@out[7] 902 vmovups 0x20-0x78($key),$rndkey0 903 904 vmovups @out[0],-16(@ptr[0]) # write output 905 sub $offset,@ptr[0] # switch to input 906 vpxor 0x00($offload),@out[0],@out[0] 907 vmovups @out[1],-16(@ptr[1]) 908 sub `64+1*8`(%rsp),@ptr[1] 909 vpxor 0x10($offload),@out[1],@out[1] 910 vmovups @out[2],-16(@ptr[2]) 911 sub `64+2*8`(%rsp),@ptr[2] 912 vpxor 0x20($offload),@out[2],@out[2] 913 vmovups @out[3],-16(@ptr[3]) 914 sub `64+3*8`(%rsp),@ptr[3] 915 vpxor 0x30($offload),@out[3],@out[3] 916 vmovups @out[4],-16(@ptr[4]) 917 sub `64+4*8`(%rsp),@ptr[4] 918 vpxor 
@inp[0],@out[4],@out[4] 919 vmovups @out[5],-16(@ptr[5]) 920 sub `64+5*8`(%rsp),@ptr[5] 921 vpxor @inp[1],@out[5],@out[5] 922 vmovups @out[6],-16(@ptr[6]) 923 sub `64+6*8`(%rsp),@ptr[6] 924 vpxor @inp[2],@out[6],@out[6] 925 vmovups @out[7],-16(@ptr[7]) 926 sub `64+7*8`(%rsp),@ptr[7] 927 vpxor @inp[3],@out[7],@out[7] 928 929 dec $num 930 jnz .Loop_enc8x 931 932 mov 16(%rsp),%rax # original %rsp 933.cfi_def_cfa %rax,8 934 #mov 24(%rsp),$num 935 #lea `$inp_elm_size*8`($inp),$inp 936 #dec $num 937 #jnz .Lenc8x_loop_grande 938 939.Lenc8x_done: 940 vzeroupper 941___ 942$code.=<<___ if ($win64); 943 movaps -0xd8(%rax),%xmm6 944 movaps -0xc8(%rax),%xmm7 945 movaps -0xb8(%rax),%xmm8 946 movaps -0xa8(%rax),%xmm9 947 movaps -0x98(%rax),%xmm10 948 movaps -0x88(%rax),%xmm11 949 movaps -0x78(%rax),%xmm12 950 movaps -0x68(%rax),%xmm13 951 movaps -0x58(%rax),%xmm14 952 movaps -0x48(%rax),%xmm15 953___ 954$code.=<<___; 955 mov -48(%rax),%r15 956.cfi_restore %r15 957 mov -40(%rax),%r14 958.cfi_restore %r14 959 mov -32(%rax),%r13 960.cfi_restore %r13 961 mov -24(%rax),%r12 962.cfi_restore %r12 963 mov -16(%rax),%rbp 964.cfi_restore %rbp 965 mov -8(%rax),%rbx 966.cfi_restore %rbx 967 lea (%rax),%rsp 968.cfi_def_cfa_register %rsp 969.Lenc8x_epilogue: 970 ret 971.cfi_endproc 972.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx 973 974.type aesni_multi_cbc_decrypt_avx,\@function,3 975.align 32 976aesni_multi_cbc_decrypt_avx: 977.cfi_startproc 978_avx_cbc_dec_shortcut: 979 mov %rsp,%rax 980.cfi_def_cfa_register %rax 981 push %rbx 982.cfi_push %rbx 983 push %rbp 984.cfi_push %rbp 985 push %r12 986.cfi_push %r12 987 push %r13 988.cfi_push %r13 989 push %r14 990.cfi_push %r14 991 push %r15 992.cfi_push %r15 993___ 994$code.=<<___ if ($win64); 995 lea -0xa8(%rsp),%rsp 996 movaps %xmm6,(%rsp) 997 movaps %xmm7,0x10(%rsp) 998 movaps %xmm8,0x20(%rsp) 999 movaps %xmm9,0x30(%rsp) 1000 movaps %xmm10,0x40(%rsp) 1001 movaps %xmm11,0x50(%rsp) 1002 movaps %xmm12,-0x78(%rax) 1003 movaps 
%xmm13,-0x68(%rax) 1004 movaps %xmm14,-0x58(%rax) 1005 movaps %xmm15,-0x48(%rax) 1006___ 1007$code.=<<___; 1008 # stack layout 1009 # 1010 # +0 output sink 1011 # +16 input sink [original %rsp and $num] 1012 # +32 counters 1013 # +64 distances between inputs and outputs 1014 # +128 off-load area for @inp[0..3] 1015 # +192 IV/input offload 1016 1017 sub \$256,%rsp 1018 and \$-256,%rsp 1019 sub \$192,%rsp 1020 mov %rax,16(%rsp) # original %rsp 1021.cfi_cfa_expression %rsp+16,deref,+8 1022 1023.Ldec8x_body: 1024 vzeroupper 1025 vmovdqu ($key),$zero # 0-round key 1026 lea 0x78($key),$key # size optimization 1027 lea `$inp_elm_size*4`($inp),$inp 1028 shr \$1,$num 1029 1030.Ldec8x_loop_grande: 1031 #mov $num,24(%rsp) # original $num 1032 xor $num,$num 1033___ 1034for($i=0;$i<8;$i++) { 1035 my $temp = $i ? $offload : $offset; 1036 $ptr_reg=&pointer_register($flavour,@ptr[$i]); 1037 $temp_reg=&pointer_register($flavour,$temp); 1038 $code.=<<___; 1039 # borrow $one for number of blocks 1040 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one 1041 # input pointer 1042 mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg 1043 cmp $num,$one 1044 # output pointer 1045 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg 1046 cmovg $one,$num # find maximum 1047 test $one,$one 1048 # load IV 1049 vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i] 1050 mov $one,`32+4*$i`(%rsp) # initialize counters 1051 cmovle %rsp,@ptr[$i] # cancel input 1052 sub @ptr[$i],$temp # distance between input and output 1053 mov $temp,`64+8*$i`(%rsp) # initialize distances 1054 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV 1055___ 1056} 1057$code.=<<___; 1058 test $num,$num 1059 jz .Ldec8x_done 1060 1061 vmovups 0x10-0x78($key),$rndkey1 1062 vmovups 0x20-0x78($key),$rndkey0 1063 mov 0xf0-0x78($key),$rounds 1064 lea 192+128(%rsp),$offload # offload area 1065 1066 vmovdqu (@ptr[0]),@out[0] # load inputs 1067 vmovdqu (@ptr[1]),@out[1] 1068 vmovdqu 
(@ptr[2]),@out[2] 1069 vmovdqu (@ptr[3]),@out[3] 1070 vmovdqu (@ptr[4]),@out[4] 1071 vmovdqu (@ptr[5]),@out[5] 1072 vmovdqu (@ptr[6]),@out[6] 1073 vmovdqu (@ptr[7]),@out[7] 1074 vmovdqu @out[0],0x00($offload) # offload inputs 1075 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round 1076 vmovdqu @out[1],0x10($offload) 1077 vpxor $zero,@out[1],@out[1] 1078 vmovdqu @out[2],0x20($offload) 1079 vpxor $zero,@out[2],@out[2] 1080 vmovdqu @out[3],0x30($offload) 1081 vpxor $zero,@out[3],@out[3] 1082 vmovdqu @out[4],0x40($offload) 1083 vpxor $zero,@out[4],@out[4] 1084 vmovdqu @out[5],0x50($offload) 1085 vpxor $zero,@out[5],@out[5] 1086 vmovdqu @out[6],0x60($offload) 1087 vpxor $zero,@out[6],@out[6] 1088 vmovdqu @out[7],0x70($offload) 1089 vpxor $zero,@out[7],@out[7] 1090 xor \$0x80,$offload 1091 mov \$1,$one # constant of 1 1092 jmp .Loop_dec8x 1093 1094.align 32 1095.Loop_dec8x: 1096___ 1097for($i=0;$i<8;$i++) { 1098my $rndkey=($i&1)?$rndkey0:$rndkey1; 1099$code.=<<___; 1100 vaesdec $rndkey,@out[0],@out[0] 1101 cmp 32+4*$i(%rsp),$one 1102___ 1103$code.=<<___ if ($i); 1104 mov 64+8*$i(%rsp),$offset 1105___ 1106$code.=<<___; 1107 vaesdec $rndkey,@out[1],@out[1] 1108 prefetcht0 31(@ptr[$i]) # prefetch input 1109 vaesdec $rndkey,@out[2],@out[2] 1110___ 1111$code.=<<___ if ($i>1); 1112 prefetcht0 15(@ptr[$i-2]) # prefetch output 1113___ 1114$code.=<<___; 1115 vaesdec $rndkey,@out[3],@out[3] 1116 lea (@ptr[$i],$offset),$offset 1117 cmovge %rsp,@ptr[$i] # cancel input 1118 vaesdec $rndkey,@out[4],@out[4] 1119 cmovg %rsp,$offset # sink output 1120 vaesdec $rndkey,@out[5],@out[5] 1121 sub @ptr[$i],$offset 1122 vaesdec $rndkey,@out[6],@out[6] 1123 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input 1124 mov $offset,64+8*$i(%rsp) 1125 vaesdec $rndkey,@out[7],@out[7] 1126 vmovups `16*(3+$i)-0x78`($key),$rndkey 1127 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output 1128___ 1129$code.=<<___ if ($i<4); 1130 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load 1131___ 1132} 1133$code.=<<___; 1134 
vmovdqu 32(%rsp),$counters 1135 prefetcht0 15(@ptr[$i-2]) # prefetch output 1136 prefetcht0 15(@ptr[$i-1]) 1137 cmp \$11,$rounds 1138 jb .Ldec8x_tail 1139 1140 vaesdec $rndkey1,@out[0],@out[0] 1141 vaesdec $rndkey1,@out[1],@out[1] 1142 vaesdec $rndkey1,@out[2],@out[2] 1143 vaesdec $rndkey1,@out[3],@out[3] 1144 vaesdec $rndkey1,@out[4],@out[4] 1145 vaesdec $rndkey1,@out[5],@out[5] 1146 vaesdec $rndkey1,@out[6],@out[6] 1147 vaesdec $rndkey1,@out[7],@out[7] 1148 vmovups 0xb0-0x78($key),$rndkey1 1149 1150 vaesdec $rndkey0,@out[0],@out[0] 1151 vaesdec $rndkey0,@out[1],@out[1] 1152 vaesdec $rndkey0,@out[2],@out[2] 1153 vaesdec $rndkey0,@out[3],@out[3] 1154 vaesdec $rndkey0,@out[4],@out[4] 1155 vaesdec $rndkey0,@out[5],@out[5] 1156 vaesdec $rndkey0,@out[6],@out[6] 1157 vaesdec $rndkey0,@out[7],@out[7] 1158 vmovups 0xc0-0x78($key),$rndkey0 1159 je .Ldec8x_tail 1160 1161 vaesdec $rndkey1,@out[0],@out[0] 1162 vaesdec $rndkey1,@out[1],@out[1] 1163 vaesdec $rndkey1,@out[2],@out[2] 1164 vaesdec $rndkey1,@out[3],@out[3] 1165 vaesdec $rndkey1,@out[4],@out[4] 1166 vaesdec $rndkey1,@out[5],@out[5] 1167 vaesdec $rndkey1,@out[6],@out[6] 1168 vaesdec $rndkey1,@out[7],@out[7] 1169 vmovups 0xd0-0x78($key),$rndkey1 1170 1171 vaesdec $rndkey0,@out[0],@out[0] 1172 vaesdec $rndkey0,@out[1],@out[1] 1173 vaesdec $rndkey0,@out[2],@out[2] 1174 vaesdec $rndkey0,@out[3],@out[3] 1175 vaesdec $rndkey0,@out[4],@out[4] 1176 vaesdec $rndkey0,@out[5],@out[5] 1177 vaesdec $rndkey0,@out[6],@out[6] 1178 vaesdec $rndkey0,@out[7],@out[7] 1179 vmovups 0xe0-0x78($key),$rndkey0 1180 1181.Ldec8x_tail: 1182 vaesdec $rndkey1,@out[0],@out[0] 1183 vpxor $zero,$zero,$zero 1184 vaesdec $rndkey1,@out[1],@out[1] 1185 vaesdec $rndkey1,@out[2],@out[2] 1186 vpcmpgtd $zero,$counters,$zero 1187 vaesdec $rndkey1,@out[3],@out[3] 1188 vaesdec $rndkey1,@out[4],@out[4] 1189 vpaddd $counters,$zero,$zero # decrement counters 1190 vmovdqu 48(%rsp),$counters 1191 vaesdec $rndkey1,@out[5],@out[5] 1192 mov 64(%rsp),$offset # pre-load 
1st offset 1193 vaesdec $rndkey1,@out[6],@out[6] 1194 vaesdec $rndkey1,@out[7],@out[7] 1195 vmovups 0x10-0x78($key),$rndkey1 1196 1197 vaesdeclast $rndkey0,@out[0],@out[0] 1198 vmovdqa $zero,32(%rsp) # update counters 1199 vpxor $zero,$zero,$zero 1200 vaesdeclast $rndkey0,@out[1],@out[1] 1201 vpxor 0x00($offload),@out[0],@out[0] # xor with IV 1202 vaesdeclast $rndkey0,@out[2],@out[2] 1203 vpxor 0x10($offload),@out[1],@out[1] 1204 vpcmpgtd $zero,$counters,$zero 1205 vaesdeclast $rndkey0,@out[3],@out[3] 1206 vpxor 0x20($offload),@out[2],@out[2] 1207 vaesdeclast $rndkey0,@out[4],@out[4] 1208 vpxor 0x30($offload),@out[3],@out[3] 1209 vpaddd $zero,$counters,$counters # decrement counters 1210 vmovdqu -0x78($key),$zero # 0-round 1211 vaesdeclast $rndkey0,@out[5],@out[5] 1212 vpxor 0x40($offload),@out[4],@out[4] 1213 vaesdeclast $rndkey0,@out[6],@out[6] 1214 vpxor 0x50($offload),@out[5],@out[5] 1215 vmovdqa $counters,48(%rsp) # update counters 1216 vaesdeclast $rndkey0,@out[7],@out[7] 1217 vpxor 0x60($offload),@out[6],@out[6] 1218 vmovups 0x20-0x78($key),$rndkey0 1219 1220 vmovups @out[0],-16(@ptr[0]) # write output 1221 sub $offset,@ptr[0] # switch to input 1222 vmovdqu 128+0(%rsp),@out[0] 1223 vpxor 0x70($offload),@out[7],@out[7] 1224 vmovups @out[1],-16(@ptr[1]) 1225 sub `64+1*8`(%rsp),@ptr[1] 1226 vmovdqu @out[0],0x00($offload) 1227 vpxor $zero,@out[0],@out[0] 1228 vmovdqu 128+16(%rsp),@out[1] 1229 vmovups @out[2],-16(@ptr[2]) 1230 sub `64+2*8`(%rsp),@ptr[2] 1231 vmovdqu @out[1],0x10($offload) 1232 vpxor $zero,@out[1],@out[1] 1233 vmovdqu 128+32(%rsp),@out[2] 1234 vmovups @out[3],-16(@ptr[3]) 1235 sub `64+3*8`(%rsp),@ptr[3] 1236 vmovdqu @out[2],0x20($offload) 1237 vpxor $zero,@out[2],@out[2] 1238 vmovdqu 128+48(%rsp),@out[3] 1239 vmovups @out[4],-16(@ptr[4]) 1240 sub `64+4*8`(%rsp),@ptr[4] 1241 vmovdqu @out[3],0x30($offload) 1242 vpxor $zero,@out[3],@out[3] 1243 vmovdqu @inp[0],0x40($offload) 1244 vpxor @inp[0],$zero,@out[4] 1245 vmovups @out[5],-16(@ptr[5]) 1246 sub 
`64+5*8`(%rsp),@ptr[5]
	vmovdqu	@inp[1],0x50($offload)
	vpxor	@inp[1],$zero,@out[5]
	vmovups	@out[6],-16(@ptr[6])
	sub	`64+6*8`(%rsp),@ptr[6]
	vmovdqu	@inp[2],0x60($offload)
	vpxor	@inp[2],$zero,@out[6]
	vmovups	@out[7],-16(@ptr[7])
	sub	`64+7*8`(%rsp),@ptr[7]
	vmovdqu	@inp[3],0x70($offload)
	vpxor	@inp[3],$zero,@out[7]

	xor	\$128,$offload
	dec	$num
	jnz	.Loop_dec8x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	#mov	24(%rsp),$num
	#lea	`$inp_elm_size*8`($inp),$inp
	#dec	$num
	#jnz	.Ldec8x_loop_grande

.Ldec8x_done:
	vzeroupper
___
# Win64 builds only: reload %xmm6-%xmm15 from the slots at -0xd8..-0x48
# relative to %rax, which holds the original %rsp at this point.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Epilogue emitted for every flavour: reload %r15, %r14, %r13, %r12, %rbp and
# %rbx from -48..-8(%rax), set %rsp back to the value in %rax and return.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec8x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
___
						}}}

# Win64 only: emit se_handler, the exception handler that the .xdata records
# later in this file name for each entry point.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers that carry the four handler arguments named in the prototype above.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Handler outline, as written in the assembly below:
#  - context->Rip is compared against the two labels stored in HandlerData[];
#    outside that range the handler jumps straight to .Lin_prologue.
#  - inside the range it loads the saved stack pointer from 16(context->Rsp),
#    copies the saved %rbx, %rbp and %r12-%r15 back into the CONTEXT record,
#    and copies 20 quadwords from -56-10*16 below that pointer to
#    &context.Xmm6.
#  - .Lin_prologue then stores Rsp/Rsi/Rdi into the CONTEXT record, copies
#    the record to disp->ContextRecord and sets up the RtlVirtualUnwind call.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	16(%rax),%rax		# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov
%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt
	.rva	.LSEH_end_aesni_multi_cbc_encrypt
	.rva	.LSEH_info_aesni_multi_cbc_encrypt
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt
	.rva	.LSEH_end_aesni_multi_cbc_decrypt
	.rva	.LSEH_info_aesni_multi_cbc_decrypt
___
# .pdata entries for the two *_avx entry points; emitted only when $avx is set.
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
___
# .xdata records: each one names se_handler and supplies the body/epilogue
# label pair that the handler reads as HandlerData[0] and HandlerData[1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_multi_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
___
# Matching .xdata records for the *_avx entry points, again only when $avx is set.
$code.=<<___ if ($avx);
.LSEH_info_aesni_multi_cbc_encrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
___
}
####################################################################

# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the referenced opcode array when either
# register number is 8 or higher: bit 0x04 is set for $dst, bit 0x01 for
# $src, and the result is OR-ed with 0x40.  Nothing is appended when both
# register numbers are below 8.
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);
    push @$opcode, $rex | 0x40 if ($rex);
}

# aesni($line)
#
# Hand-encodes one AES-NI instruction as a ".byte" directive.  Three operand
# forms are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} off(%rsp),%xmmD
#
# Returns the ".byte\t..." string for a recognised instruction.  Any other
# line, including an "aes..." mnemonic that is missing from the opcode table
# of the matched form, is returned unchanged.
sub aesni {
    my $line = shift;
    my @opcode = (0x66);    # every encoding produced here starts with 0x66

    if ($line =~ /(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm, $src, $dst) = ($2, $3, $4);

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x3a, 0xdf;
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        # Immediates with a leading 0 (0x.., 0..) go through oct().
        push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $src, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesimc" => 0xdb,
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Pass an unknown mnemonic through untouched.  Returning undef here
        # would make the caller's s///e substitute an empty string and drop
        # the instruction from the output without any diagnostic.
        return $line if (!defined($opcodelet{$mnemonic}));

        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    elsif ($line =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic, $off, $dst) = ($1, $2, $3);
        my %opcodelet = (
            "aesenc" => 0xdc, "aesenclast" => 0xdd,
            "aesdec" => 0xde, "aesdeclast" => 0xdf
        );

        # Same pass-through as above for mnemonics with no memory form here.
        return $line if (!defined($opcodelet{$mnemonic}));

        # Only the destination can need a prefix in this form; 0x44 is the
        # value rex() would produce for a destination register >= 8.
        push @opcode, 0x44 if ($dst >= 8);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0x44 | (($dst & 7) << 3), 0x24;    # ModR/M
        # The offset is emitted as a single byte, so only its low 8 bits are kept.
        push @opcode, ($off =~ /^0/ ? oct($off) : $off) & 0xff;
        return ".byte\t" . join(',', @opcode);
    }
    return $line;
}

# Replace every `...` span in the generated text with the result of
# evaluating its contents as a Perl expression.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Run each "aes... %xmmN" instruction through aesni(); whatever follows the
# last %xmm operand on that line is discarded by the substitution.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";