1#! /usr/bin/env perl 2# Copyright 2009-2024 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 20# details]. 21# 22# Performance. 23# 24# Given aes(enc|dec) instructions' latency asymptotic performance for 25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 26# processed with 128-bit key. And given their throughput asymptotic 27# performance for parallelizable modes is 1.25 cycles per byte. Being 28# asymptotic limit it's not something you commonly achieve in reality, 29# but how close does one get? Below are results collected for 30# different modes and block sized. Pairs of numbers are for en-/ 31# decryption. 32# 33# 16-byte 64-byte 256-byte 1-KB 8-KB 34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 40# 41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means 42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 44# The results were collected with specially crafted speed.c benchmark 45# in order to compare them with results reported in "Intel Advanced 46# Encryption Standard (AES) New Instruction Set" White Paper Revision 47# 3.0 dated May 2010. All above results are consistently better. This 48# module also provides better performance for block sizes smaller than 49# 128 bytes in points *not* represented in the above table. 50# 51# Looking at the results for 8-KB buffer. 52# 53# CFB and OFB results are far from the limit, because implementation 54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on 55# single-block aesni_encrypt, which is not the most optimal way to go. 56# CBC encrypt result is unexpectedly high and there is no documented 57# explanation for it. Seemingly there is a small penalty for feeding 58# the result back to AES unit the way it's done in CBC mode. There is 59# nothing one can do and the result appears optimal. CCM result is 60# identical to CBC, because CBC-MAC is essentially CBC encrypt without 61# saving output. CCM CTR "stays invisible," because it's neatly 62# interleaved with CBC-MAC. This provides ~30% improvement over 63# "straightforward" CCM implementation with CTR and CBC-MAC performed 64# disjointly. Parallelizable modes practically achieve the theoretical 65# limit. 66# 67# Looking at how results vary with buffer size. 68# 69# Curves are practically saturated at 1-KB buffer size. In most cases 70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. 71# CTR curve doesn't follow this pattern and is "slowest" changing one 72# with "256-byte" result being 87% of "8-KB." This is because overhead 73# in CTR mode is most computationally intensive. Small-block CCM 74# decrypt is slower than encrypt, because first CTR and last CBC-MAC 75# iterations can't be interleaved. 76# 77# Results for 192- and 256-bit keys. 78# 79# EVP-free results were observed to scale perfectly with number of 80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times 81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 82# are a tad smaller, because the above mentioned penalty biases all 83# results by same constant value. In similar way function call 84# overhead affects small-block performance, as well as OFB and CFB 85# results. Differences are not large, most common coefficients are 86# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one 87# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 88 89# January 2011 90# 91# While Westmere processor features 6 cycles latency for aes[enc|dec] 92# instructions, which can be scheduled every second cycle, Sandy 93# Bridge spends 8 cycles per instruction, but it can schedule them 94# every cycle. This means that code targeting Westmere would perform 95# suboptimally on Sandy Bridge. Therefore this update. 96# 97# In addition, non-parallelizable CBC encrypt (as well as CCM) is 98# optimized. Relative improvement might appear modest, 8% on Westmere, 99# but in absolute terms it's 3.77 cycles per byte encrypted with 100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 101# should be compared to asymptotic limits of 3.75 for Westmere and 102# 5.00 for Sandy Bridge. Actually, the fact that they get this close 103# to asymptotic limits is quite amazing. Indeed, the limit is 104# calculated as latency times number of rounds, 10 for 128-bit key, 105# and divided by 16, the number of bytes in block, or in other words 106# it accounts *solely* for aesenc instructions. But there are extra 107# instructions, and numbers so close to the asymptotic limits mean 108# that it's as if it takes as little as *one* additional cycle to 109# execute all of them. How is it possible? It is possible thanks to 110# out-of-order execution logic, which manages to overlap post- 111# processing of previous block, things like saving the output, with 112# actual encryption of current block, as well as pre-processing of 113# current block, things like fetching input and xor-ing it with 114# 0-round element of the key schedule, with actual encryption of 115# previous block. Keep this in mind... 116# 117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 118# performance is achieved by interleaving instructions working on 119# independent blocks. In which case asymptotic limit for such modes 120# can be obtained by dividing above mentioned numbers by AES 121# instructions' interleave factor. Westmere can execute at most 3 122# instructions at a time, meaning that optimal interleave factor is 3, 123# and that's where the "magic" number of 1.25 come from. "Optimal 124# interleave factor" means that increase of interleave factor does 125# not improve performance. The formula has proven to reflect reality 126# pretty well on Westmere... Sandy Bridge on the other hand can 127# execute up to 8 AES instructions at a time, so how does varying 128# interleave factor affect the performance? Here is table for ECB 129# (numbers are cycles per byte processed with 128-bit key): 130# 131# instruction interleave factor 3x 6x 8x 132# theoretical asymptotic limit 1.67 0.83 0.625 133# measured performance for 8KB block 1.05 0.86 0.84 134# 135# "as if" interleave factor 4.7x 5.8x 6.0x 136# 137# Further data for other parallelizable modes: 138# 139# CBC decrypt 1.16 0.93 0.74 140# CTR 1.14 0.91 0.74 141# 142# Well, given 3x column it's probably inappropriate to call the limit 143# asymptotic, if it can be surpassed, isn't it? What happens there? 144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution 145# magic is responsible for this. Processor overlaps not only the 146# additional instructions with AES ones, but even AES instructions 147# processing adjacent triplets of independent blocks. In the 6x case 148# additional instructions still claim disproportionally small amount 149# of additional cycles, but in 8x case number of instructions must be 150# a tad too high for out-of-order logic to cope with, and AES unit 151# remains underutilized... As you can see 8x interleave is hardly 152# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl 153# utilizes 6x interleave because of limited register bank capacity. 154# 155# Higher interleave factors do have negative impact on Westmere 156# performance. While for ECB mode it's negligible ~1.5%, other 157# parallelizables perform ~5% worse, which is outweighed by ~25% 158# improvement on Sandy Bridge. To balance regression on Westmere 159# CTR mode was implemented with 6x aesenc interleave factor. 160 161# April 2011 162# 163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing 164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like 165# in CTR mode AES instruction interleave factor was chosen to be 6x. 166 167# November 2015 168# 169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was 170# chosen to be 6x. 171 172###################################################################### 173# Current large-block performance in cycles per byte processed with 174# 128-bit key (less is better). 175# 176# CBC en-/decrypt CTR XTS ECB OCB 177# Westmere 3.77/1.25 1.25 1.25 1.26 178# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 179# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 180# Skylake 2.62/0.63 0.63 0.63 0.63 181# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 182# Knights L 2.54/0.77 0.78 0.85 - 1.50 183# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 184# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 185# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 186# 187# (*) Atom Silvermont ECB result is suboptimal because of penalties 188# incurred by operations on %xmm8-15. As ECB is not considered 189# critical, nothing was done to mitigate the problem. 190 191$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 192 # generates drop-in replacement for 193 # crypto/aes/asm/aes-x86_64.pl:-) 194 195# $output is the last argument if it looks like a file (it has an extension) 196# $flavour is the first argument if it doesn't look like a file 197$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 198$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 199 200$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 201 202$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 203( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 204( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 205die "can't locate x86_64-xlate.pl"; 206 207open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 208 or die "can't call $xlate: $!"; 209*STDOUT=*OUT; 210 211$movkey = $PREFIX eq "aesni" ? "movups" : "movups"; 212@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 213 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 214 215$code=".text\n"; 216$code.=".extern OPENSSL_ia32cap_P\n"; 217 218$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! 219# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... 220$inp="%rdi"; 221$out="%rsi"; 222$len="%rdx"; 223$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! 224$ivp="%r8"; # cbc, ctr, ... 225 226$rnds_="%r10d"; # backup copy for $rounds 227$key_="%r11"; # backup copy for $key 228 229# %xmm register layout 230$rndkey0="%xmm0"; $rndkey1="%xmm1"; 231$inout0="%xmm2"; $inout1="%xmm3"; 232$inout2="%xmm4"; $inout3="%xmm5"; 233$inout4="%xmm6"; $inout5="%xmm7"; 234$inout6="%xmm8"; $inout7="%xmm9"; 235 236$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... 237$in0="%xmm8"; $iv="%xmm9"; 238 239# Inline version of internal aesni_[en|de]crypt1. 240# 241# Why folded loop? Because aes[enc|dec] is slow enough to accommodate 242# cycles which take care of loop variables... 243{ my $sn; 244sub aesni_generate1 { 245my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 246++$sn; 247$code.=<<___; 248 $movkey ($key),$rndkey0 249 $movkey 16($key),$rndkey1 250___ 251$code.=<<___ if (defined($ivec)); 252 xorps $rndkey0,$ivec 253 lea 32($key),$key 254 xorps $ivec,$inout 255___ 256$code.=<<___ if (!defined($ivec)); 257 lea 32($key),$key 258 xorps $rndkey0,$inout 259___ 260$code.=<<___; 261.Loop_${p}1_$sn: 262 aes${p} $rndkey1,$inout 263 dec $rounds 264 $movkey ($key),$rndkey1 265 lea 16($key),$key 266 jnz .Loop_${p}1_$sn # loop body is 16 bytes 267 aes${p}last $rndkey1,$inout 268___ 269}} 270# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); 271# 272{ my ($inp,$out,$key) = @_4args; 273 274$code.=<<___; 275.globl ${PREFIX}_encrypt 276.type ${PREFIX}_encrypt,\@abi-omnipotent 277.align 16 278${PREFIX}_encrypt: 279.cfi_startproc 280 endbranch 281 movups ($inp),$inout0 # load input 282 mov 240($key),$rounds # key->rounds 283___ 284 &aesni_generate1("enc",$key,$rounds); 285$code.=<<___; 286 pxor $rndkey0,$rndkey0 # clear register bank 287 pxor $rndkey1,$rndkey1 288 movups $inout0,($out) # output 289 pxor $inout0,$inout0 290 ret 291.cfi_endproc 292.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt 293 294.globl ${PREFIX}_decrypt 295.type ${PREFIX}_decrypt,\@abi-omnipotent 296.align 16 297${PREFIX}_decrypt: 298.cfi_startproc 299 endbranch 300 movups ($inp),$inout0 # load input 301 mov 240($key),$rounds # key->rounds 302___ 303 &aesni_generate1("dec",$key,$rounds); 304$code.=<<___; 305 pxor $rndkey0,$rndkey0 # clear register bank 306 pxor $rndkey1,$rndkey1 307 movups $inout0,($out) # output 308 pxor $inout0,$inout0 309 ret 310.cfi_endproc 311.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt 312___ 313} 314 315# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 316# factor. Why 3x subroutine were originally used in loops? Even though 317# aes[enc|dec] latency was originally 6, it could be scheduled only 318# every *2nd* cycle. Thus 3x interleave was the one providing optimal 319# utilization, i.e. when subroutine's throughput is virtually same as 320# of non-interleaved subroutine [for number of input blocks up to 3]. 321# This is why it originally made no sense to implement 2x subroutine. 322# But times change and it became appropriate to spend extra 192 bytes 323# on 2x subroutine on Atom Silvermont account. For processors that 324# can schedule aes[enc|dec] every cycle optimal interleave factor 325# equals to corresponding instructions latency. 8x is optimal for 326# * Bridge and "super-optimal" for other Intel CPUs... 327 328sub aesni_generate2 { 329my $dir=shift; 330# As already mentioned it takes in $key and $rounds, which are *not* 331# preserved. $inout[0-1] is cipher/clear text... 332$code.=<<___; 333.type _aesni_${dir}rypt2,\@abi-omnipotent 334.align 16 335_aesni_${dir}rypt2: 336.cfi_startproc 337 $movkey ($key),$rndkey0 338 shl \$4,$rounds 339 $movkey 16($key),$rndkey1 340 xorps $rndkey0,$inout0 341 xorps $rndkey0,$inout1 342 $movkey 32($key),$rndkey0 343 lea 32($key,$rounds),$key 344 neg %rax # $rounds 345 add \$16,%rax 346 347.L${dir}_loop2: 348 aes${dir} $rndkey1,$inout0 349 aes${dir} $rndkey1,$inout1 350 $movkey ($key,%rax),$rndkey1 351 add \$32,%rax 352 aes${dir} $rndkey0,$inout0 353 aes${dir} $rndkey0,$inout1 354 $movkey -16($key,%rax),$rndkey0 355 jnz .L${dir}_loop2 356 357 aes${dir} $rndkey1,$inout0 358 aes${dir} $rndkey1,$inout1 359 aes${dir}last $rndkey0,$inout0 360 aes${dir}last $rndkey0,$inout1 361 ret 362.cfi_endproc 363.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 364___ 365} 366sub aesni_generate3 { 367my $dir=shift; 368# As already mentioned it takes in $key and $rounds, which are *not* 369# preserved. $inout[0-2] is cipher/clear text... 370$code.=<<___; 371.type _aesni_${dir}rypt3,\@abi-omnipotent 372.align 16 373_aesni_${dir}rypt3: 374.cfi_startproc 375 $movkey ($key),$rndkey0 376 shl \$4,$rounds 377 $movkey 16($key),$rndkey1 378 xorps $rndkey0,$inout0 379 xorps $rndkey0,$inout1 380 xorps $rndkey0,$inout2 381 $movkey 32($key),$rndkey0 382 lea 32($key,$rounds),$key 383 neg %rax # $rounds 384 add \$16,%rax 385 386.L${dir}_loop3: 387 aes${dir} $rndkey1,$inout0 388 aes${dir} $rndkey1,$inout1 389 aes${dir} $rndkey1,$inout2 390 $movkey ($key,%rax),$rndkey1 391 add \$32,%rax 392 aes${dir} $rndkey0,$inout0 393 aes${dir} $rndkey0,$inout1 394 aes${dir} $rndkey0,$inout2 395 $movkey -16($key,%rax),$rndkey0 396 jnz .L${dir}_loop3 397 398 aes${dir} $rndkey1,$inout0 399 aes${dir} $rndkey1,$inout1 400 aes${dir} $rndkey1,$inout2 401 aes${dir}last $rndkey0,$inout0 402 aes${dir}last $rndkey0,$inout1 403 aes${dir}last $rndkey0,$inout2 404 ret 405.cfi_endproc 406.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 407___ 408} 409# 4x interleave is implemented to improve small block performance, 410# most notably [and naturally] 4 block by ~30%. One can argue that one 411# should have implemented 5x as well, but improvement would be <20%, 412# so it's not worth it... 413sub aesni_generate4 { 414my $dir=shift; 415# As already mentioned it takes in $key and $rounds, which are *not* 416# preserved. $inout[0-3] is cipher/clear text... 417$code.=<<___; 418.type _aesni_${dir}rypt4,\@abi-omnipotent 419.align 16 420_aesni_${dir}rypt4: 421.cfi_startproc 422 $movkey ($key),$rndkey0 423 shl \$4,$rounds 424 $movkey 16($key),$rndkey1 425 xorps $rndkey0,$inout0 426 xorps $rndkey0,$inout1 427 xorps $rndkey0,$inout2 428 xorps $rndkey0,$inout3 429 $movkey 32($key),$rndkey0 430 lea 32($key,$rounds),$key 431 neg %rax # $rounds 432 .byte 0x0f,0x1f,0x00 433 add \$16,%rax 434 435.L${dir}_loop4: 436 aes${dir} $rndkey1,$inout0 437 aes${dir} $rndkey1,$inout1 438 aes${dir} $rndkey1,$inout2 439 aes${dir} $rndkey1,$inout3 440 $movkey ($key,%rax),$rndkey1 441 add \$32,%rax 442 aes${dir} $rndkey0,$inout0 443 aes${dir} $rndkey0,$inout1 444 aes${dir} $rndkey0,$inout2 445 aes${dir} $rndkey0,$inout3 446 $movkey -16($key,%rax),$rndkey0 447 jnz .L${dir}_loop4 448 449 aes${dir} $rndkey1,$inout0 450 aes${dir} $rndkey1,$inout1 451 aes${dir} $rndkey1,$inout2 452 aes${dir} $rndkey1,$inout3 453 aes${dir}last $rndkey0,$inout0 454 aes${dir}last $rndkey0,$inout1 455 aes${dir}last $rndkey0,$inout2 456 aes${dir}last $rndkey0,$inout3 457 ret 458.cfi_endproc 459.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 460___ 461} 462sub aesni_generate6 { 463my $dir=shift; 464# As already mentioned it takes in $key and $rounds, which are *not* 465# preserved. $inout[0-5] is cipher/clear text... 466$code.=<<___; 467.type _aesni_${dir}rypt6,\@abi-omnipotent 468.align 16 469_aesni_${dir}rypt6: 470.cfi_startproc 471 $movkey ($key),$rndkey0 472 shl \$4,$rounds 473 $movkey 16($key),$rndkey1 474 xorps $rndkey0,$inout0 475 pxor $rndkey0,$inout1 476 pxor $rndkey0,$inout2 477 aes${dir} $rndkey1,$inout0 478 lea 32($key,$rounds),$key 479 neg %rax # $rounds 480 aes${dir} $rndkey1,$inout1 481 pxor $rndkey0,$inout3 482 pxor $rndkey0,$inout4 483 aes${dir} $rndkey1,$inout2 484 pxor $rndkey0,$inout5 485 $movkey ($key,%rax),$rndkey0 486 add \$16,%rax 487 jmp .L${dir}_loop6_enter 488.align 16 489.L${dir}_loop6: 490 aes${dir} $rndkey1,$inout0 491 aes${dir} $rndkey1,$inout1 492 aes${dir} $rndkey1,$inout2 493.L${dir}_loop6_enter: 494 aes${dir} $rndkey1,$inout3 495 aes${dir} $rndkey1,$inout4 496 aes${dir} $rndkey1,$inout5 497 $movkey ($key,%rax),$rndkey1 498 add \$32,%rax 499 aes${dir} $rndkey0,$inout0 500 aes${dir} $rndkey0,$inout1 501 aes${dir} $rndkey0,$inout2 502 aes${dir} $rndkey0,$inout3 503 aes${dir} $rndkey0,$inout4 504 aes${dir} $rndkey0,$inout5 505 $movkey -16($key,%rax),$rndkey0 506 jnz .L${dir}_loop6 507 508 aes${dir} $rndkey1,$inout0 509 aes${dir} $rndkey1,$inout1 510 aes${dir} $rndkey1,$inout2 511 aes${dir} $rndkey1,$inout3 512 aes${dir} $rndkey1,$inout4 513 aes${dir} $rndkey1,$inout5 514 aes${dir}last $rndkey0,$inout0 515 aes${dir}last $rndkey0,$inout1 516 aes${dir}last $rndkey0,$inout2 517 aes${dir}last $rndkey0,$inout3 518 aes${dir}last $rndkey0,$inout4 519 aes${dir}last $rndkey0,$inout5 520 ret 521.cfi_endproc 522.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 523___ 524} 525sub aesni_generate8 { 526my $dir=shift; 527# As already mentioned it takes in $key and $rounds, which are *not* 528# preserved. $inout[0-7] is cipher/clear text... 529$code.=<<___; 530.type _aesni_${dir}rypt8,\@abi-omnipotent 531.align 16 532_aesni_${dir}rypt8: 533.cfi_startproc 534 $movkey ($key),$rndkey0 535 shl \$4,$rounds 536 $movkey 16($key),$rndkey1 537 xorps $rndkey0,$inout0 538 xorps $rndkey0,$inout1 539 pxor $rndkey0,$inout2 540 pxor $rndkey0,$inout3 541 pxor $rndkey0,$inout4 542 lea 32($key,$rounds),$key 543 neg %rax # $rounds 544 aes${dir} $rndkey1,$inout0 545 pxor $rndkey0,$inout5 546 pxor $rndkey0,$inout6 547 aes${dir} $rndkey1,$inout1 548 pxor $rndkey0,$inout7 549 $movkey ($key,%rax),$rndkey0 550 add \$16,%rax 551 jmp .L${dir}_loop8_inner 552.align 16 553.L${dir}_loop8: 554 aes${dir} $rndkey1,$inout0 555 aes${dir} $rndkey1,$inout1 556.L${dir}_loop8_inner: 557 aes${dir} $rndkey1,$inout2 558 aes${dir} $rndkey1,$inout3 559 aes${dir} $rndkey1,$inout4 560 aes${dir} $rndkey1,$inout5 561 aes${dir} $rndkey1,$inout6 562 aes${dir} $rndkey1,$inout7 563.L${dir}_loop8_enter: 564 $movkey ($key,%rax),$rndkey1 565 add \$32,%rax 566 aes${dir} $rndkey0,$inout0 567 aes${dir} $rndkey0,$inout1 568 aes${dir} $rndkey0,$inout2 569 aes${dir} $rndkey0,$inout3 570 aes${dir} $rndkey0,$inout4 571 aes${dir} $rndkey0,$inout5 572 aes${dir} $rndkey0,$inout6 573 aes${dir} $rndkey0,$inout7 574 $movkey -16($key,%rax),$rndkey0 575 jnz .L${dir}_loop8 576 577 aes${dir} $rndkey1,$inout0 578 aes${dir} $rndkey1,$inout1 579 aes${dir} $rndkey1,$inout2 580 aes${dir} $rndkey1,$inout3 581 aes${dir} $rndkey1,$inout4 582 aes${dir} $rndkey1,$inout5 583 aes${dir} $rndkey1,$inout6 584 aes${dir} $rndkey1,$inout7 585 aes${dir}last $rndkey0,$inout0 586 aes${dir}last $rndkey0,$inout1 587 aes${dir}last $rndkey0,$inout2 588 aes${dir}last $rndkey0,$inout3 589 aes${dir}last $rndkey0,$inout4 590 aes${dir}last $rndkey0,$inout5 591 aes${dir}last $rndkey0,$inout6 592 aes${dir}last $rndkey0,$inout7 593 ret 594.cfi_endproc 595.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 596___ 597} 598&aesni_generate2("enc") if ($PREFIX eq "aesni"); 599&aesni_generate2("dec"); 600&aesni_generate3("enc") if ($PREFIX eq "aesni"); 601&aesni_generate3("dec"); 602&aesni_generate4("enc") if ($PREFIX eq "aesni"); 603&aesni_generate4("dec"); 604&aesni_generate6("enc") if ($PREFIX eq "aesni"); 605&aesni_generate6("dec"); 606&aesni_generate8("enc") if ($PREFIX eq "aesni"); 607&aesni_generate8("dec"); 608 609if ($PREFIX eq "aesni") { 610######################################################################## 611# void aesni_ecb_encrypt (const void *in, void *out, 612# size_t length, const AES_KEY *key, 613# int enc); 614$code.=<<___; 615.globl aesni_ecb_encrypt 616.type aesni_ecb_encrypt,\@function,5 617.align 16 618aesni_ecb_encrypt: 619.cfi_startproc 620 endbranch 621___ 622$code.=<<___ if ($win64); 623 lea -0x58(%rsp),%rsp 624 movaps %xmm6,(%rsp) # offload $inout4..7 625 movaps %xmm7,0x10(%rsp) 626 movaps %xmm8,0x20(%rsp) 627 movaps %xmm9,0x30(%rsp) 628.Lecb_enc_body: 629___ 630$code.=<<___; 631 and \$-16,$len # if ($len<16) 632 jz .Lecb_ret # return 633 634 mov 240($key),$rounds # key->rounds 635 $movkey ($key),$rndkey0 636 mov $key,$key_ # backup $key 637 mov $rounds,$rnds_ # backup $rounds 638 test %r8d,%r8d # 5th argument 639 jz .Lecb_decrypt 640#--------------------------- ECB ENCRYPT ------------------------------# 641 cmp \$0x80,$len # if ($len<8*16) 642 jb .Lecb_enc_tail # short input 643 644 movdqu ($inp),$inout0 # load 8 input blocks 645 movdqu 0x10($inp),$inout1 646 movdqu 0x20($inp),$inout2 647 movdqu 0x30($inp),$inout3 648 movdqu 0x40($inp),$inout4 649 movdqu 0x50($inp),$inout5 650 movdqu 0x60($inp),$inout6 651 movdqu 0x70($inp),$inout7 652 lea 0x80($inp),$inp # $inp+=8*16 653 sub \$0x80,$len # $len-=8*16 (can be zero) 654 jmp .Lecb_enc_loop8_enter 655.align 16 656.Lecb_enc_loop8: 657 movups $inout0,($out) # store 8 output blocks 658 mov $key_,$key # restore $key 659 movdqu ($inp),$inout0 # load 8 input blocks 660 mov $rnds_,$rounds # restore $rounds 661 movups $inout1,0x10($out) 662 movdqu 0x10($inp),$inout1 663 movups $inout2,0x20($out) 664 movdqu 0x20($inp),$inout2 665 movups $inout3,0x30($out) 666 movdqu 0x30($inp),$inout3 667 movups $inout4,0x40($out) 668 movdqu 0x40($inp),$inout4 669 movups $inout5,0x50($out) 670 movdqu 0x50($inp),$inout5 671 movups $inout6,0x60($out) 672 movdqu 0x60($inp),$inout6 673 movups $inout7,0x70($out) 674 lea 0x80($out),$out # $out+=8*16 675 movdqu 0x70($inp),$inout7 676 lea 0x80($inp),$inp # $inp+=8*16 677.Lecb_enc_loop8_enter: 678 679 call _aesni_encrypt8 680 681 sub \$0x80,$len 682 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 683 684 movups $inout0,($out) # store 8 output blocks 685 mov $key_,$key # restore $key 686 movups $inout1,0x10($out) 687 mov $rnds_,$rounds # restore $rounds 688 movups $inout2,0x20($out) 689 movups $inout3,0x30($out) 690 movups $inout4,0x40($out) 691 movups $inout5,0x50($out) 692 movups $inout6,0x60($out) 693 movups $inout7,0x70($out) 694 lea 0x80($out),$out # $out+=8*16 695 add \$0x80,$len # restore real remaining $len 696 jz .Lecb_ret # done if ($len==0) 697 698.Lecb_enc_tail: # $len is less than 8*16 699 movups ($inp),$inout0 700 cmp \$0x20,$len 701 jb .Lecb_enc_one 702 movups 0x10($inp),$inout1 703 je .Lecb_enc_two 704 movups 0x20($inp),$inout2 705 cmp \$0x40,$len 706 jb .Lecb_enc_three 707 movups 0x30($inp),$inout3 708 je .Lecb_enc_four 709 movups 0x40($inp),$inout4 710 cmp \$0x60,$len 711 jb .Lecb_enc_five 712 movups 0x50($inp),$inout5 713 je .Lecb_enc_six 714 movdqu 0x60($inp),$inout6 715 xorps $inout7,$inout7 716 call _aesni_encrypt8 717 movups $inout0,($out) # store 7 output blocks 718 movups $inout1,0x10($out) 719 movups $inout2,0x20($out) 720 movups $inout3,0x30($out) 721 movups $inout4,0x40($out) 722 movups $inout5,0x50($out) 723 movups $inout6,0x60($out) 724 jmp .Lecb_ret 725.align 16 726.Lecb_enc_one: 727___ 728 &aesni_generate1("enc",$key,$rounds); 729$code.=<<___; 730 movups $inout0,($out) # store one output block 731 jmp .Lecb_ret 732.align 16 733.Lecb_enc_two: 734 call _aesni_encrypt2 735 movups $inout0,($out) # store 2 output blocks 736 movups $inout1,0x10($out) 737 jmp .Lecb_ret 738.align 16 739.Lecb_enc_three: 740 call _aesni_encrypt3 741 movups $inout0,($out) # store 3 output blocks 742 movups $inout1,0x10($out) 743 movups $inout2,0x20($out) 744 jmp .Lecb_ret 745.align 16 746.Lecb_enc_four: 747 call _aesni_encrypt4 748 movups $inout0,($out) # store 4 output blocks 749 movups $inout1,0x10($out) 750 movups $inout2,0x20($out) 751 movups $inout3,0x30($out) 752 jmp .Lecb_ret 753.align 16 754.Lecb_enc_five: 755 xorps $inout5,$inout5 756 call _aesni_encrypt6 757 movups $inout0,($out) # store 5 output blocks 758 movups $inout1,0x10($out) 759 movups $inout2,0x20($out) 760 movups $inout3,0x30($out) 761 movups $inout4,0x40($out) 762 jmp .Lecb_ret 763.align 16 764.Lecb_enc_six: 765 call _aesni_encrypt6 766 movups $inout0,($out) # store 6 output blocks 767 movups $inout1,0x10($out) 768 movups $inout2,0x20($out) 769 movups $inout3,0x30($out) 770 movups $inout4,0x40($out) 771 movups $inout5,0x50($out) 772 jmp .Lecb_ret 773#--------------------------- ECB DECRYPT ------------------------------# 774.align 16 775.Lecb_decrypt: 776 cmp \$0x80,$len # if ($len<8*16) 777 jb .Lecb_dec_tail # short input 778 779 movdqu ($inp),$inout0 # load 8 input blocks 780 movdqu 0x10($inp),$inout1 781 movdqu 0x20($inp),$inout2 782 movdqu 0x30($inp),$inout3 783 movdqu 0x40($inp),$inout4 784 movdqu 0x50($inp),$inout5 785 movdqu 0x60($inp),$inout6 786 movdqu 0x70($inp),$inout7 787 lea 0x80($inp),$inp # $inp+=8*16 788 sub \$0x80,$len # $len-=8*16 (can be zero) 789 jmp .Lecb_dec_loop8_enter 790.align 16 791.Lecb_dec_loop8: 792 movups $inout0,($out) # store 8 output blocks 793 mov $key_,$key # restore $key 794 movdqu ($inp),$inout0 # load 8 input blocks 795 mov $rnds_,$rounds # restore $rounds 796 movups $inout1,0x10($out) 797 movdqu 0x10($inp),$inout1 798 movups $inout2,0x20($out) 799 movdqu 0x20($inp),$inout2 800 movups $inout3,0x30($out) 801 movdqu 0x30($inp),$inout3 802 movups $inout4,0x40($out) 803 movdqu 0x40($inp),$inout4 804 movups $inout5,0x50($out) 805 movdqu 0x50($inp),$inout5 806 movups $inout6,0x60($out) 807 movdqu 0x60($inp),$inout6 808 movups $inout7,0x70($out) 809 lea 0x80($out),$out # $out+=8*16 810 movdqu 0x70($inp),$inout7 811 lea 0x80($inp),$inp # $inp+=8*16 812.Lecb_dec_loop8_enter: 813 814 call _aesni_decrypt8 815 816 $movkey ($key_),$rndkey0 817 sub \$0x80,$len 818 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 819 820 movups $inout0,($out) # store 8 output blocks 821 pxor $inout0,$inout0 # clear register bank 822 mov $key_,$key # restore $key 823 movups $inout1,0x10($out) 824 pxor $inout1,$inout1 825 mov $rnds_,$rounds # restore $rounds 826 movups $inout2,0x20($out) 827 pxor $inout2,$inout2 828 movups $inout3,0x30($out) 829 pxor $inout3,$inout3 830 movups $inout4,0x40($out) 831 pxor $inout4,$inout4 832 movups $inout5,0x50($out) 833 pxor $inout5,$inout5 834 movups $inout6,0x60($out) 835 pxor $inout6,$inout6 836 movups $inout7,0x70($out) 837 pxor $inout7,$inout7 838 lea 0x80($out),$out # $out+=8*16 839 add \$0x80,$len # restore real remaining $len 840 jz .Lecb_ret # done if ($len==0) 841 842.Lecb_dec_tail: 843 movups ($inp),$inout0 844 cmp \$0x20,$len 845 jb .Lecb_dec_one 846 movups 0x10($inp),$inout1 847 je .Lecb_dec_two 848 movups 0x20($inp),$inout2 849 cmp \$0x40,$len 850 jb .Lecb_dec_three 851 movups 0x30($inp),$inout3 852 je .Lecb_dec_four 853 movups 0x40($inp),$inout4 854 cmp \$0x60,$len 855 jb .Lecb_dec_five 856 movups 0x50($inp),$inout5 857 je .Lecb_dec_six 858 movups 0x60($inp),$inout6 859 $movkey ($key),$rndkey0 860 xorps $inout7,$inout7 861 call _aesni_decrypt8 862 movups $inout0,($out) # store 7 output blocks 863 pxor $inout0,$inout0 # clear register bank 864 movups $inout1,0x10($out) 865 pxor $inout1,$inout1 866 movups $inout2,0x20($out) 867 pxor $inout2,$inout2 868 movups $inout3,0x30($out) 869 pxor $inout3,$inout3 870 movups $inout4,0x40($out) 871 pxor $inout4,$inout4 872 movups $inout5,0x50($out) 873 pxor $inout5,$inout5 874 movups $inout6,0x60($out) 875 pxor $inout6,$inout6 876 pxor $inout7,$inout7 877 jmp .Lecb_ret 878.align 16 879.Lecb_dec_one: 880___ 881 &aesni_generate1("dec",$key,$rounds); 882$code.=<<___; 883 movups $inout0,($out) # store one output block 884 pxor $inout0,$inout0 # clear register bank 885 jmp .Lecb_ret 886.align 16 887.Lecb_dec_two: 888 call _aesni_decrypt2 889 movups $inout0,($out) # store 2 output blocks 890 pxor $inout0,$inout0 # clear register bank 891 movups $inout1,0x10($out) 892 pxor $inout1,$inout1 893 jmp .Lecb_ret 894.align 16 895.Lecb_dec_three: 896 call _aesni_decrypt3 897 movups $inout0,($out) # store 3 output blocks 898 pxor $inout0,$inout0 # clear register bank 899 movups $inout1,0x10($out) 900 pxor $inout1,$inout1 901 movups $inout2,0x20($out) 902 pxor $inout2,$inout2 903 jmp .Lecb_ret 904.align 16 905.Lecb_dec_four: 906 call _aesni_decrypt4 907 movups $inout0,($out) # store 4 output blocks 908 pxor $inout0,$inout0 # clear register bank 909 movups $inout1,0x10($out) 910 pxor $inout1,$inout1 911 movups $inout2,0x20($out) 912 pxor $inout2,$inout2 913 movups $inout3,0x30($out) 914 pxor $inout3,$inout3 915 jmp .Lecb_ret 916.align 16 917.Lecb_dec_five: 918 xorps $inout5,$inout5 919 call _aesni_decrypt6 920 movups $inout0,($out) # store 5 output blocks 921 pxor $inout0,$inout0 # clear register bank 922 movups $inout1,0x10($out) 923 pxor $inout1,$inout1 924 movups $inout2,0x20($out) 925 pxor $inout2,$inout2 926 movups $inout3,0x30($out) 927 pxor $inout3,$inout3 928 movups $inout4,0x40($out) 929 pxor $inout4,$inout4 930 pxor $inout5,$inout5 931 jmp .Lecb_ret 932.align 16 933.Lecb_dec_six: 934 call _aesni_decrypt6 935 movups $inout0,($out) # store 6 output blocks 936 pxor $inout0,$inout0 # clear register bank 937 movups $inout1,0x10($out) 938 pxor $inout1,$inout1 939 movups $inout2,0x20($out) 940 pxor $inout2,$inout2 941 movups $inout3,0x30($out) 942 pxor $inout3,$inout3 943 movups $inout4,0x40($out) 944 pxor $inout4,$inout4 945 movups $inout5,0x50($out) 946 pxor $inout5,$inout5 947 948.Lecb_ret: 949 xorps $rndkey0,$rndkey0 # %xmm0 950 pxor $rndkey1,$rndkey1 951___ 952$code.=<<___ if ($win64); 953 movaps (%rsp),%xmm6 954 movaps %xmm0,(%rsp) # clear stack 955 movaps 0x10(%rsp),%xmm7 956 movaps %xmm0,0x10(%rsp) 957 movaps 0x20(%rsp),%xmm8 958 movaps %xmm0,0x20(%rsp) 959 movaps 0x30(%rsp),%xmm9 960 movaps %xmm0,0x30(%rsp) 961 lea 0x58(%rsp),%rsp 962.Lecb_enc_ret: 963___ 964$code.=<<___; 965 ret 966.cfi_endproc 967.size aesni_ecb_encrypt,.-aesni_ecb_encrypt 968___ 969 970{ 971###################################################################### 972# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 973# size_t blocks, const AES_KEY *key, 974# const char *ivec,char *cmac); 975# 976# Handles only complete blocks, operates on 64-bit counter and 977# does not update *ivec! Nor does it finalize CMAC value 978# (see engine/eng_aesni.c for details) 979# 980{ 981my $cmac="%r9"; # 6th argument 982 983my $increment="%xmm9"; 984my $iv="%xmm6"; 985my $bswap_mask="%xmm7"; 986 987$code.=<<___; 988.globl aesni_ccm64_encrypt_blocks 989.type aesni_ccm64_encrypt_blocks,\@function,6 990.align 16 991aesni_ccm64_encrypt_blocks: 992.cfi_startproc 993 endbranch 994___ 995$code.=<<___ if ($win64); 996 lea -0x58(%rsp),%rsp 997 movaps %xmm6,(%rsp) # $iv 998 movaps %xmm7,0x10(%rsp) # $bswap_mask 999 movaps %xmm8,0x20(%rsp) # $in0 1000 movaps %xmm9,0x30(%rsp) # $increment 1001.Lccm64_enc_body: 1002___ 1003$code.=<<___; 1004 mov 240($key),$rounds # key->rounds 1005 movdqu ($ivp),$iv 1006 movdqa .Lincrement64(%rip),$increment 1007 movdqa .Lbswap_mask(%rip),$bswap_mask 1008 1009 shl \$4,$rounds 1010 mov \$16,$rnds_ 1011 lea 0($key),$key_ 1012 movdqu ($cmac),$inout1 1013 movdqa $iv,$inout0 1014 lea 32($key,$rounds),$key # end of key schedule 1015 pshufb $bswap_mask,$iv 1016 sub %rax,%r10 # twisted $rounds 1017 jmp .Lccm64_enc_outer 1018.align 16 1019.Lccm64_enc_outer: 1020 $movkey ($key_),$rndkey0 1021 mov %r10,%rax 1022 movups ($inp),$in0 # load inp 1023 1024 xorps $rndkey0,$inout0 # counter 1025 $movkey 16($key_),$rndkey1 1026 xorps $in0,$rndkey0 1027 xorps $rndkey0,$inout1 # cmac^=inp 1028 $movkey 32($key_),$rndkey0 1029 1030.Lccm64_enc2_loop: 1031 aesenc $rndkey1,$inout0 1032 aesenc $rndkey1,$inout1 1033 $movkey ($key,%rax),$rndkey1 1034 add \$32,%rax 1035 aesenc $rndkey0,$inout0 1036 aesenc $rndkey0,$inout1 1037 $movkey -16($key,%rax),$rndkey0 1038 jnz .Lccm64_enc2_loop 1039 aesenc $rndkey1,$inout0 1040 aesenc $rndkey1,$inout1 1041 paddq $increment,$iv 1042 dec $len # $len-- ($len is in blocks) 1043 aesenclast $rndkey0,$inout0 1044 aesenclast $rndkey0,$inout1 1045 1046 lea 16($inp),$inp 1047 xorps $inout0,$in0 # inp ^= E(iv) 1048 movdqa $iv,$inout0 1049 movups $in0,($out) # save output 1050 pshufb $bswap_mask,$inout0 1051 lea 16($out),$out # $out+=16 1052 jnz .Lccm64_enc_outer # loop if ($len!=0) 1053 1054 pxor $rndkey0,$rndkey0 # clear register bank 1055 pxor $rndkey1,$rndkey1 1056 pxor $inout0,$inout0 1057 movups $inout1,($cmac) # store resulting mac 1058 pxor $inout1,$inout1 1059 pxor $in0,$in0 1060 pxor $iv,$iv 1061___ 1062$code.=<<___ if ($win64); 1063 movaps (%rsp),%xmm6 1064 movaps %xmm0,(%rsp) # clear stack 1065 movaps 0x10(%rsp),%xmm7 1066 movaps %xmm0,0x10(%rsp) 1067 movaps 0x20(%rsp),%xmm8 1068 movaps %xmm0,0x20(%rsp) 1069 movaps 0x30(%rsp),%xmm9 1070 movaps %xmm0,0x30(%rsp) 1071 lea 0x58(%rsp),%rsp 1072.Lccm64_enc_ret: 1073___ 1074$code.=<<___; 1075 ret 1076.cfi_endproc 1077.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks 1078___ 1079###################################################################### 1080$code.=<<___; 1081.globl aesni_ccm64_decrypt_blocks 1082.type aesni_ccm64_decrypt_blocks,\@function,6 1083.align 16 1084aesni_ccm64_decrypt_blocks: 1085.cfi_startproc 1086 endbranch 1087___ 1088$code.=<<___ if ($win64); 1089 lea -0x58(%rsp),%rsp 1090 movaps %xmm6,(%rsp) # $iv 1091 movaps %xmm7,0x10(%rsp) # $bswap_mask 1092 movaps %xmm8,0x20(%rsp) # $in8 1093 movaps %xmm9,0x30(%rsp) # $increment 1094.Lccm64_dec_body: 1095___ 1096$code.=<<___; 1097 mov 240($key),$rounds # key->rounds 1098 movups ($ivp),$iv 1099 movdqu ($cmac),$inout1 1100 movdqa .Lincrement64(%rip),$increment 1101 movdqa .Lbswap_mask(%rip),$bswap_mask 1102 1103 movaps $iv,$inout0 1104 mov $rounds,$rnds_ 1105 mov $key,$key_ 1106 pshufb $bswap_mask,$iv 1107___ 1108 &aesni_generate1("enc",$key,$rounds); 1109$code.=<<___; 1110 shl \$4,$rnds_ 1111 mov \$16,$rounds 1112 movups ($inp),$in0 # load inp 1113 paddq $increment,$iv 1114 lea 16($inp),$inp # $inp+=16 1115 sub %r10,%rax # twisted $rounds 1116 lea 32($key_,$rnds_),$key # end of key schedule 1117 mov %rax,%r10 1118 jmp .Lccm64_dec_outer 1119.align 16 1120.Lccm64_dec_outer: 1121 xorps $inout0,$in0 # inp ^= E(iv) 1122 movdqa $iv,$inout0 1123 movups $in0,($out) # save output 1124 lea 16($out),$out # $out+=16 1125 pshufb $bswap_mask,$inout0 1126 1127 sub \$1,$len # $len-- ($len is in blocks) 1128 jz .Lccm64_dec_break # if ($len==0) break 1129 1130 $movkey ($key_),$rndkey0 1131 mov %r10,%rax 1132 $movkey 16($key_),$rndkey1 1133 xorps $rndkey0,$in0 1134 xorps $rndkey0,$inout0 1135 xorps $in0,$inout1 # cmac^=out 1136 $movkey 32($key_),$rndkey0 1137 jmp .Lccm64_dec2_loop 1138.align 16 1139.Lccm64_dec2_loop: 1140 aesenc $rndkey1,$inout0 1141 aesenc $rndkey1,$inout1 1142 $movkey ($key,%rax),$rndkey1 1143 add \$32,%rax 1144 aesenc $rndkey0,$inout0 1145 aesenc $rndkey0,$inout1 1146 $movkey -16($key,%rax),$rndkey0 1147 jnz .Lccm64_dec2_loop 1148 movups ($inp),$in0 # load input 1149 paddq $increment,$iv 1150 aesenc $rndkey1,$inout0 1151 aesenc $rndkey1,$inout1 1152 aesenclast $rndkey0,$inout0 1153 aesenclast $rndkey0,$inout1 1154 lea 16($inp),$inp # $inp+=16 1155 jmp .Lccm64_dec_outer 1156 1157.align 16 1158.Lccm64_dec_break: 1159 #xorps $in0,$inout1 # cmac^=out 1160 mov 240($key_),$rounds 1161___ 1162 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 1163$code.=<<___; 1164 pxor $rndkey0,$rndkey0 # clear register bank 1165 pxor $rndkey1,$rndkey1 1166 pxor $inout0,$inout0 1167 movups $inout1,($cmac) # store resulting mac 1168 pxor $inout1,$inout1 1169 pxor $in0,$in0 1170 pxor $iv,$iv 1171___ 1172$code.=<<___ if ($win64); 1173 movaps (%rsp),%xmm6 1174 movaps %xmm0,(%rsp) # clear stack 1175 movaps 0x10(%rsp),%xmm7 1176 movaps %xmm0,0x10(%rsp) 1177 movaps 0x20(%rsp),%xmm8 1178 movaps %xmm0,0x20(%rsp) 1179 movaps 0x30(%rsp),%xmm9 1180 movaps %xmm0,0x30(%rsp) 1181 lea 0x58(%rsp),%rsp 1182.Lccm64_dec_ret: 1183___ 1184$code.=<<___; 1185 ret 1186.cfi_endproc 1187.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks 1188___ 1189} 1190###################################################################### 1191# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 1192# size_t blocks, const AES_KEY *key, 1193# const char *ivec); 1194# 1195# Handles only complete blocks, operates on 32-bit counter and 1196# does not update *ivec! (see crypto/modes/ctr128.c for details) 1197# 1198# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 1199# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 1200# Keywords are full unroll and modulo-schedule counter calculations 1201# with zero-round key xor. 1202{ 1203my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1204my ($key0,$ctr)=("%ebp","${ivp}d"); 1205my $frame_size = 0x80 + ($win64?160:0); 1206 1207$code.=<<___; 1208.globl aesni_ctr32_encrypt_blocks 1209.type aesni_ctr32_encrypt_blocks,\@function,5 1210.align 16 1211aesni_ctr32_encrypt_blocks: 1212.cfi_startproc 1213 endbranch 1214 cmp \$1,$len 1215 jne .Lctr32_bulk 1216 1217 # handle single block without allocating stack frame, 1218 # useful when handling edges 1219 movups ($ivp),$inout0 1220 movups ($inp),$inout1 1221 mov 240($key),%edx # key->rounds 1222___ 1223 &aesni_generate1("enc",$key,"%edx"); 1224$code.=<<___; 1225 pxor $rndkey0,$rndkey0 # clear register bank 1226 pxor $rndkey1,$rndkey1 1227 xorps $inout1,$inout0 1228 pxor $inout1,$inout1 1229 movups $inout0,($out) 1230 xorps $inout0,$inout0 1231 jmp .Lctr32_epilogue 1232 1233.align 16 1234.Lctr32_bulk: 1235 lea (%rsp),$key_ # use $key_ as frame pointer 1236.cfi_def_cfa_register $key_ 1237 push %rbp 1238.cfi_push %rbp 1239 sub \$$frame_size,%rsp 1240 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1241___ 1242$code.=<<___ if ($win64); 1243 movaps %xmm6,-0xa8($key_) # offload everything 1244 movaps %xmm7,-0x98($key_) 1245 movaps %xmm8,-0x88($key_) 1246 movaps %xmm9,-0x78($key_) 1247 movaps %xmm10,-0x68($key_) 1248 movaps %xmm11,-0x58($key_) 1249 movaps %xmm12,-0x48($key_) 1250 movaps %xmm13,-0x38($key_) 1251 movaps %xmm14,-0x28($key_) 1252 movaps %xmm15,-0x18($key_) 1253.Lctr32_body: 1254___ 1255$code.=<<___; 1256 1257 # 8 16-byte words on top of stack are counter values 1258 # xor-ed with zero-round key 1259 1260 movdqu ($ivp),$inout0 1261 movdqu ($key),$rndkey0 1262 mov 12($ivp),$ctr # counter LSB 1263 pxor $rndkey0,$inout0 1264 mov 12($key),$key0 # 0-round key LSB 1265 movdqa $inout0,0x00(%rsp) # populate counter block 1266 bswap $ctr 1267 movdqa $inout0,$inout1 1268 movdqa $inout0,$inout2 1269 movdqa $inout0,$inout3 1270 movdqa $inout0,0x40(%rsp) 1271 movdqa $inout0,0x50(%rsp) 1272 movdqa $inout0,0x60(%rsp) 1273 mov %rdx,%r10 # about to borrow %rdx 1274 movdqa $inout0,0x70(%rsp) 1275 1276 lea 1($ctr),%rax 1277 lea 2($ctr),%rdx 1278 bswap %eax 1279 bswap %edx 1280 xor $key0,%eax 1281 xor $key0,%edx 1282 pinsrd \$3,%eax,$inout1 1283 lea 3($ctr),%rax 1284 movdqa $inout1,0x10(%rsp) 1285 pinsrd \$3,%edx,$inout2 1286 bswap %eax 1287 mov %r10,%rdx # restore %rdx 1288 lea 4($ctr),%r10 1289 movdqa $inout2,0x20(%rsp) 1290 xor $key0,%eax 1291 bswap %r10d 1292 pinsrd \$3,%eax,$inout3 1293 xor $key0,%r10d 1294 movdqa $inout3,0x30(%rsp) 1295 lea 5($ctr),%r9 1296 mov %r10d,0x40+12(%rsp) 1297 bswap %r9d 1298 lea 6($ctr),%r10 1299 mov 240($key),$rounds # key->rounds 1300 xor $key0,%r9d 1301 bswap %r10d 1302 mov %r9d,0x50+12(%rsp) 1303 xor $key0,%r10d 1304 lea 7($ctr),%r9 1305 mov %r10d,0x60+12(%rsp) 1306 bswap %r9d 1307 mov OPENSSL_ia32cap_P+4(%rip),%r10d 1308 xor $key0,%r9d 1309 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1310 mov %r9d,0x70+12(%rsp) 1311 1312 $movkey 0x10($key),$rndkey1 1313 1314 movdqa 0x40(%rsp),$inout4 1315 movdqa 0x50(%rsp),$inout5 1316 1317 cmp \$8,$len # $len is in blocks 1318 jb .Lctr32_tail # short input if ($len<8) 1319 1320 sub \$6,$len # $len is biased by -6 1321 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1322 je .Lctr32_6x # [which denotes Atom Silvermont] 1323 1324 lea 0x80($key),$key # size optimization 1325 sub \$2,$len # $len is biased by -8 1326 jmp .Lctr32_loop8 1327 1328.align 16 1329.Lctr32_6x: 1330 shl \$4,$rounds 1331 mov \$48,$rnds_ 1332 bswap $key0 1333 lea 32($key,$rounds),$key # end of key schedule 1334 sub %rax,%r10 # twisted $rounds 1335 jmp .Lctr32_loop6 1336 1337.align 16 1338.Lctr32_loop6: 1339 add \$6,$ctr # next counter value 1340 $movkey -48($key,$rnds_),$rndkey0 1341 aesenc $rndkey1,$inout0 1342 mov $ctr,%eax 1343 xor $key0,%eax 1344 aesenc $rndkey1,$inout1 1345 movbe %eax,`0x00+12`(%rsp) # store next counter value 1346 lea 1($ctr),%eax 1347 aesenc $rndkey1,$inout2 1348 xor $key0,%eax 1349 movbe %eax,`0x10+12`(%rsp) 1350 aesenc $rndkey1,$inout3 1351 lea 2($ctr),%eax 1352 xor $key0,%eax 1353 aesenc $rndkey1,$inout4 1354 movbe %eax,`0x20+12`(%rsp) 1355 lea 3($ctr),%eax 1356 aesenc $rndkey1,$inout5 1357 $movkey -32($key,$rnds_),$rndkey1 1358 xor $key0,%eax 1359 1360 aesenc $rndkey0,$inout0 1361 movbe %eax,`0x30+12`(%rsp) 1362 lea 4($ctr),%eax 1363 aesenc $rndkey0,$inout1 1364 xor $key0,%eax 1365 movbe %eax,`0x40+12`(%rsp) 1366 aesenc $rndkey0,$inout2 1367 lea 5($ctr),%eax 1368 xor $key0,%eax 1369 aesenc $rndkey0,$inout3 1370 movbe %eax,`0x50+12`(%rsp) 1371 mov %r10,%rax # mov $rnds_,$rounds 1372 aesenc $rndkey0,$inout4 1373 aesenc $rndkey0,$inout5 1374 $movkey -16($key,$rnds_),$rndkey0 1375 1376 call .Lenc_loop6 1377 1378 movdqu ($inp),$inout6 # load 6 input blocks 1379 movdqu 0x10($inp),$inout7 1380 movdqu 0x20($inp),$in0 1381 movdqu 0x30($inp),$in1 1382 movdqu 0x40($inp),$in2 1383 movdqu 0x50($inp),$in3 1384 lea 0x60($inp),$inp # $inp+=6*16 1385 $movkey -64($key,$rnds_),$rndkey1 1386 pxor $inout0,$inout6 # inp^=E(ctr) 1387 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] 1388 pxor $inout1,$inout7 1389 movaps 0x10(%rsp),$inout1 1390 pxor $inout2,$in0 1391 movaps 0x20(%rsp),$inout2 1392 pxor $inout3,$in1 1393 movaps 0x30(%rsp),$inout3 1394 pxor $inout4,$in2 1395 movaps 0x40(%rsp),$inout4 1396 pxor $inout5,$in3 1397 movaps 0x50(%rsp),$inout5 1398 movdqu $inout6,($out) # store 6 output blocks 1399 movdqu $inout7,0x10($out) 1400 movdqu $in0,0x20($out) 1401 movdqu $in1,0x30($out) 1402 movdqu $in2,0x40($out) 1403 movdqu $in3,0x50($out) 1404 lea 0x60($out),$out # $out+=6*16 1405 1406 sub \$6,$len 1407 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow 1408 1409 add \$6,$len # restore real remaining $len 1410 jz .Lctr32_done # done if ($len==0) 1411 1412 lea -48($rnds_),$rounds 1413 lea -80($key,$rnds_),$key # restore $key 1414 neg $rounds 1415 shr \$4,$rounds # restore $rounds 1416 jmp .Lctr32_tail 1417 1418.align 32 1419.Lctr32_loop8: 1420 add \$8,$ctr # next counter value 1421 movdqa 0x60(%rsp),$inout6 1422 aesenc $rndkey1,$inout0 1423 mov $ctr,%r9d 1424 movdqa 0x70(%rsp),$inout7 1425 aesenc $rndkey1,$inout1 1426 bswap %r9d 1427 $movkey 0x20-0x80($key),$rndkey0 1428 aesenc $rndkey1,$inout2 1429 xor $key0,%r9d 1430 nop 1431 aesenc $rndkey1,$inout3 1432 mov %r9d,0x00+12(%rsp) # store next counter value 1433 lea 1($ctr),%r9 1434 aesenc $rndkey1,$inout4 1435 aesenc $rndkey1,$inout5 1436 aesenc $rndkey1,$inout6 1437 aesenc $rndkey1,$inout7 1438 $movkey 0x30-0x80($key),$rndkey1 1439___ 1440for($i=2;$i<8;$i++) { 1441my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 1442$code.=<<___; 1443 bswap %r9d 1444 aesenc $rndkeyx,$inout0 1445 aesenc $rndkeyx,$inout1 1446 xor $key0,%r9d 1447 .byte 0x66,0x90 1448 aesenc $rndkeyx,$inout2 1449 aesenc $rndkeyx,$inout3 1450 mov %r9d,`0x10*($i-1)`+12(%rsp) 1451 lea $i($ctr),%r9 1452 aesenc $rndkeyx,$inout4 1453 aesenc $rndkeyx,$inout5 1454 aesenc $rndkeyx,$inout6 1455 aesenc $rndkeyx,$inout7 1456 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 1457___ 1458} 1459$code.=<<___; 1460 bswap %r9d 1461 aesenc $rndkey0,$inout0 1462 aesenc $rndkey0,$inout1 1463 aesenc $rndkey0,$inout2 1464 xor $key0,%r9d 1465 movdqu 0x00($inp),$in0 # start loading input 1466 aesenc $rndkey0,$inout3 1467 mov %r9d,0x70+12(%rsp) 1468 cmp \$11,$rounds 1469 aesenc $rndkey0,$inout4 1470 aesenc $rndkey0,$inout5 1471 aesenc $rndkey0,$inout6 1472 aesenc $rndkey0,$inout7 1473 $movkey 0xa0-0x80($key),$rndkey0 1474 1475 jb .Lctr32_enc_done 1476 1477 aesenc $rndkey1,$inout0 1478 aesenc $rndkey1,$inout1 1479 aesenc $rndkey1,$inout2 1480 aesenc $rndkey1,$inout3 1481 aesenc $rndkey1,$inout4 1482 aesenc $rndkey1,$inout5 1483 aesenc $rndkey1,$inout6 1484 aesenc $rndkey1,$inout7 1485 $movkey 0xb0-0x80($key),$rndkey1 1486 1487 aesenc $rndkey0,$inout0 1488 aesenc $rndkey0,$inout1 1489 aesenc $rndkey0,$inout2 1490 aesenc $rndkey0,$inout3 1491 aesenc $rndkey0,$inout4 1492 aesenc $rndkey0,$inout5 1493 aesenc $rndkey0,$inout6 1494 aesenc $rndkey0,$inout7 1495 $movkey 0xc0-0x80($key),$rndkey0 1496 je .Lctr32_enc_done 1497 1498 aesenc $rndkey1,$inout0 1499 aesenc $rndkey1,$inout1 1500 aesenc $rndkey1,$inout2 1501 aesenc $rndkey1,$inout3 1502 aesenc $rndkey1,$inout4 1503 aesenc $rndkey1,$inout5 1504 aesenc $rndkey1,$inout6 1505 aesenc $rndkey1,$inout7 1506 $movkey 0xd0-0x80($key),$rndkey1 1507 1508 aesenc $rndkey0,$inout0 1509 aesenc $rndkey0,$inout1 1510 aesenc $rndkey0,$inout2 1511 aesenc $rndkey0,$inout3 1512 aesenc $rndkey0,$inout4 1513 aesenc $rndkey0,$inout5 1514 aesenc $rndkey0,$inout6 1515 aesenc $rndkey0,$inout7 1516 $movkey 0xe0-0x80($key),$rndkey0 1517 jmp .Lctr32_enc_done 1518 1519.align 16 1520.Lctr32_enc_done: 1521 movdqu 0x10($inp),$in1 1522 pxor $rndkey0,$in0 # input^=round[last] 1523 movdqu 0x20($inp),$in2 1524 pxor $rndkey0,$in1 1525 movdqu 0x30($inp),$in3 1526 pxor $rndkey0,$in2 1527 movdqu 0x40($inp),$in4 1528 pxor $rndkey0,$in3 1529 movdqu 0x50($inp),$in5 1530 pxor $rndkey0,$in4 1531 pxor $rndkey0,$in5 1532 aesenc $rndkey1,$inout0 1533 aesenc $rndkey1,$inout1 1534 aesenc $rndkey1,$inout2 1535 aesenc $rndkey1,$inout3 1536 aesenc $rndkey1,$inout4 1537 aesenc $rndkey1,$inout5 1538 aesenc $rndkey1,$inout6 1539 aesenc $rndkey1,$inout7 1540 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] 1541 lea 0x80($inp),$inp # $inp+=8*16 1542 1543 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] 1544 pxor $rndkey0,$rndkey1 # borrowed $rndkey 1545 movdqu 0x70-0x80($inp),$in0 1546 aesenclast $in1,$inout1 1547 pxor $rndkey0,$in0 1548 movdqa 0x00(%rsp),$in1 # load next counter block 1549 aesenclast $in2,$inout2 1550 aesenclast $in3,$inout3 1551 movdqa 0x10(%rsp),$in2 1552 movdqa 0x20(%rsp),$in3 1553 aesenclast $in4,$inout4 1554 aesenclast $in5,$inout5 1555 movdqa 0x30(%rsp),$in4 1556 movdqa 0x40(%rsp),$in5 1557 aesenclast $rndkey1,$inout6 1558 movdqa 0x50(%rsp),$rndkey0 1559 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key 1560 aesenclast $in0,$inout7 1561 1562 movups $inout0,($out) # store 8 output blocks 1563 movdqa $in1,$inout0 1564 movups $inout1,0x10($out) 1565 movdqa $in2,$inout1 1566 movups $inout2,0x20($out) 1567 movdqa $in3,$inout2 1568 movups $inout3,0x30($out) 1569 movdqa $in4,$inout3 1570 movups $inout4,0x40($out) 1571 movdqa $in5,$inout4 1572 movups $inout5,0x50($out) 1573 movdqa $rndkey0,$inout5 1574 movups $inout6,0x60($out) 1575 movups $inout7,0x70($out) 1576 lea 0x80($out),$out # $out+=8*16 1577 1578 sub \$8,$len 1579 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow 1580 1581 add \$8,$len # restore real remaining $len 1582 jz .Lctr32_done # done if ($len==0) 1583 lea -0x80($key),$key 1584 1585.Lctr32_tail: 1586 # note that at this point $inout0..5 are populated with 1587 # counter values xor-ed with 0-round key 1588 lea 16($key),$key 1589 cmp \$4,$len 1590 jb .Lctr32_loop3 1591 je .Lctr32_loop4 1592 1593 # if ($len>4) compute 7 E(counter) 1594 shl \$4,$rounds 1595 movdqa 0x60(%rsp),$inout6 1596 pxor $inout7,$inout7 1597 1598 $movkey 16($key),$rndkey0 1599 aesenc $rndkey1,$inout0 1600 aesenc $rndkey1,$inout1 1601 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter 1602 neg %rax 1603 aesenc $rndkey1,$inout2 1604 add \$16,%rax # prepare for .Lenc_loop8_enter 1605 movups ($inp),$in0 1606 aesenc $rndkey1,$inout3 1607 aesenc $rndkey1,$inout4 1608 movups 0x10($inp),$in1 # pre-load input 1609 movups 0x20($inp),$in2 1610 aesenc $rndkey1,$inout5 1611 aesenc $rndkey1,$inout6 1612 1613 call .Lenc_loop8_enter 1614 1615 movdqu 0x30($inp),$in3 1616 pxor $in0,$inout0 1617 movdqu 0x40($inp),$in0 1618 pxor $in1,$inout1 1619 movdqu $inout0,($out) # store output 1620 pxor $in2,$inout2 1621 movdqu $inout1,0x10($out) 1622 pxor $in3,$inout3 1623 movdqu $inout2,0x20($out) 1624 pxor $in0,$inout4 1625 movdqu $inout3,0x30($out) 1626 movdqu $inout4,0x40($out) 1627 cmp \$6,$len 1628 jb .Lctr32_done # $len was 5, stop store 1629 1630 movups 0x50($inp),$in1 1631 xorps $in1,$inout5 1632 movups $inout5,0x50($out) 1633 je .Lctr32_done # $len was 6, stop store 1634 1635 movups 0x60($inp),$in2 1636 xorps $in2,$inout6 1637 movups $inout6,0x60($out) 1638 jmp .Lctr32_done # $len was 7, stop store 1639 1640.align 32 1641.Lctr32_loop4: 1642 aesenc $rndkey1,$inout0 1643 lea 16($key),$key 1644 dec $rounds 1645 aesenc $rndkey1,$inout1 1646 aesenc $rndkey1,$inout2 1647 aesenc $rndkey1,$inout3 1648 $movkey ($key),$rndkey1 1649 jnz .Lctr32_loop4 1650 aesenclast $rndkey1,$inout0 1651 aesenclast $rndkey1,$inout1 1652 movups ($inp),$in0 # load input 1653 movups 0x10($inp),$in1 1654 aesenclast $rndkey1,$inout2 1655 aesenclast $rndkey1,$inout3 1656 movups 0x20($inp),$in2 1657 movups 0x30($inp),$in3 1658 1659 xorps $in0,$inout0 1660 movups $inout0,($out) # store output 1661 xorps $in1,$inout1 1662 movups $inout1,0x10($out) 1663 pxor $in2,$inout2 1664 movdqu $inout2,0x20($out) 1665 pxor $in3,$inout3 1666 movdqu $inout3,0x30($out) 1667 jmp .Lctr32_done # $len was 4, stop store 1668 1669.align 32 1670.Lctr32_loop3: 1671 aesenc $rndkey1,$inout0 1672 lea 16($key),$key 1673 dec $rounds 1674 aesenc $rndkey1,$inout1 1675 aesenc $rndkey1,$inout2 1676 $movkey ($key),$rndkey1 1677 jnz .Lctr32_loop3 1678 aesenclast $rndkey1,$inout0 1679 aesenclast $rndkey1,$inout1 1680 aesenclast $rndkey1,$inout2 1681 1682 movups ($inp),$in0 # load input 1683 xorps $in0,$inout0 1684 movups $inout0,($out) # store output 1685 cmp \$2,$len 1686 jb .Lctr32_done # $len was 1, stop store 1687 1688 movups 0x10($inp),$in1 1689 xorps $in1,$inout1 1690 movups $inout1,0x10($out) 1691 je .Lctr32_done # $len was 2, stop store 1692 1693 movups 0x20($inp),$in2 1694 xorps $in2,$inout2 1695 movups $inout2,0x20($out) # $len was 3, stop store 1696 1697.Lctr32_done: 1698 xorps %xmm0,%xmm0 # clear register bank 1699 xor $key0,$key0 1700 pxor %xmm1,%xmm1 1701 pxor %xmm2,%xmm2 1702 pxor %xmm3,%xmm3 1703 pxor %xmm4,%xmm4 1704 pxor %xmm5,%xmm5 1705___ 1706$code.=<<___ if (!$win64); 1707 pxor %xmm6,%xmm6 1708 pxor %xmm7,%xmm7 1709 movaps %xmm0,0x00(%rsp) # clear stack 1710 pxor %xmm8,%xmm8 1711 movaps %xmm0,0x10(%rsp) 1712 pxor %xmm9,%xmm9 1713 movaps %xmm0,0x20(%rsp) 1714 pxor %xmm10,%xmm10 1715 movaps %xmm0,0x30(%rsp) 1716 pxor %xmm11,%xmm11 1717 movaps %xmm0,0x40(%rsp) 1718 pxor %xmm12,%xmm12 1719 movaps %xmm0,0x50(%rsp) 1720 pxor %xmm13,%xmm13 1721 movaps %xmm0,0x60(%rsp) 1722 pxor %xmm14,%xmm14 1723 movaps %xmm0,0x70(%rsp) 1724 pxor %xmm15,%xmm15 1725___ 1726$code.=<<___ if ($win64); 1727 movaps -0xa8($key_),%xmm6 1728 movaps %xmm0,-0xa8($key_) # clear stack 1729 movaps -0x98($key_),%xmm7 1730 movaps %xmm0,-0x98($key_) 1731 movaps -0x88($key_),%xmm8 1732 movaps %xmm0,-0x88($key_) 1733 movaps -0x78($key_),%xmm9 1734 movaps %xmm0,-0x78($key_) 1735 movaps -0x68($key_),%xmm10 1736 movaps %xmm0,-0x68($key_) 1737 movaps -0x58($key_),%xmm11 1738 movaps %xmm0,-0x58($key_) 1739 movaps -0x48($key_),%xmm12 1740 movaps %xmm0,-0x48($key_) 1741 movaps -0x38($key_),%xmm13 1742 movaps %xmm0,-0x38($key_) 1743 movaps -0x28($key_),%xmm14 1744 movaps %xmm0,-0x28($key_) 1745 movaps -0x18($key_),%xmm15 1746 movaps %xmm0,-0x18($key_) 1747 movaps %xmm0,0x00(%rsp) 1748 movaps %xmm0,0x10(%rsp) 1749 movaps %xmm0,0x20(%rsp) 1750 movaps %xmm0,0x30(%rsp) 1751 movaps %xmm0,0x40(%rsp) 1752 movaps %xmm0,0x50(%rsp) 1753 movaps %xmm0,0x60(%rsp) 1754 movaps %xmm0,0x70(%rsp) 1755___ 1756$code.=<<___; 1757 mov -8($key_),%rbp 1758.cfi_restore %rbp 1759 lea ($key_),%rsp 1760.cfi_def_cfa_register %rsp 1761.Lctr32_epilogue: 1762 ret 1763.cfi_endproc 1764.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks 1765___ 1766} 1767 1768###################################################################### 1769# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1770# const AES_KEY *key1, const AES_KEY *key2 1771# const unsigned char iv[16]); 1772# 1773{ 1774my @tweak=map("%xmm$_",(10..15)); 1775my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1776my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1777my $frame_size = 0x70 + ($win64?160:0); 1778my $key_ = "%rbp"; # override so that we can use %r11 as FP 1779 1780$code.=<<___; 1781.globl aesni_xts_encrypt 1782.type aesni_xts_encrypt,\@function,6 1783.align 16 1784aesni_xts_encrypt: 1785.cfi_startproc 1786 endbranch 1787 lea (%rsp),%r11 # frame pointer 1788.cfi_def_cfa_register %r11 1789 push %rbp 1790.cfi_push %rbp 1791 sub \$$frame_size,%rsp 1792 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1793___ 1794$code.=<<___ if ($win64); 1795 movaps %xmm6,-0xa8(%r11) # offload everything 1796 movaps %xmm7,-0x98(%r11) 1797 movaps %xmm8,-0x88(%r11) 1798 movaps %xmm9,-0x78(%r11) 1799 movaps %xmm10,-0x68(%r11) 1800 movaps %xmm11,-0x58(%r11) 1801 movaps %xmm12,-0x48(%r11) 1802 movaps %xmm13,-0x38(%r11) 1803 movaps %xmm14,-0x28(%r11) 1804 movaps %xmm15,-0x18(%r11) 1805.Lxts_enc_body: 1806___ 1807$code.=<<___; 1808 movups ($ivp),$inout0 # load clear-text tweak 1809 mov 240(%r8),$rounds # key2->rounds 1810 mov 240($key),$rnds_ # key1->rounds 1811___ 1812 # generate the tweak 1813 &aesni_generate1("enc",$key2,$rounds,$inout0); 1814$code.=<<___; 1815 $movkey ($key),$rndkey0 # zero round key 1816 mov $key,$key_ # backup $key 1817 mov $rnds_,$rounds # backup $rounds 1818 shl \$4,$rnds_ 1819 mov $len,$len_ # backup $len 1820 and \$-16,$len 1821 1822 $movkey 16($key,$rnds_),$rndkey1 # last round key 1823 1824 movdqa .Lxts_magic(%rip),$twmask 1825 movdqa $inout0,@tweak[5] 1826 pshufd \$0x5f,$inout0,$twres 1827 pxor $rndkey0,$rndkey1 1828___ 1829 # alternative tweak calculation algorithm is based on suggestions 1830 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1831 # and should help in the future... 1832 for ($i=0;$i<4;$i++) { 1833 $code.=<<___; 1834 movdqa $twres,$twtmp 1835 paddd $twres,$twres 1836 movdqa @tweak[5],@tweak[$i] 1837 psrad \$31,$twtmp # broadcast upper bits 1838 paddq @tweak[5],@tweak[5] 1839 pand $twmask,$twtmp 1840 pxor $rndkey0,@tweak[$i] 1841 pxor $twtmp,@tweak[5] 1842___ 1843 } 1844$code.=<<___; 1845 movdqa @tweak[5],@tweak[4] 1846 psrad \$31,$twres 1847 paddq @tweak[5],@tweak[5] 1848 pand $twmask,$twres 1849 pxor $rndkey0,@tweak[4] 1850 pxor $twres,@tweak[5] 1851 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1852 1853 sub \$16*6,$len 1854 jc .Lxts_enc_short # if $len-=6*16 borrowed 1855 1856 mov \$16+96,$rounds 1857 lea 32($key_,$rnds_),$key # end of key schedule 1858 sub %r10,%rax # twisted $rounds 1859 $movkey 16($key_),$rndkey1 1860 mov %rax,%r10 # backup twisted $rounds 1861 lea .Lxts_magic(%rip),%r8 1862 jmp .Lxts_enc_grandloop 1863 1864.align 32 1865.Lxts_enc_grandloop: 1866 movdqu `16*0`($inp),$inout0 # load input 1867 movdqa $rndkey0,$twmask 1868 movdqu `16*1`($inp),$inout1 1869 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1870 movdqu `16*2`($inp),$inout2 1871 pxor @tweak[1],$inout1 1872 aesenc $rndkey1,$inout0 1873 movdqu `16*3`($inp),$inout3 1874 pxor @tweak[2],$inout2 1875 aesenc $rndkey1,$inout1 1876 movdqu `16*4`($inp),$inout4 1877 pxor @tweak[3],$inout3 1878 aesenc $rndkey1,$inout2 1879 movdqu `16*5`($inp),$inout5 1880 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1881 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1882 pxor @tweak[4],$inout4 1883 aesenc $rndkey1,$inout3 1884 $movkey 32($key_),$rndkey0 1885 lea `16*6`($inp),$inp 1886 pxor $twmask,$inout5 1887 1888 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1889 aesenc $rndkey1,$inout4 1890 pxor $twres,@tweak[1] 1891 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1892 aesenc $rndkey1,$inout5 1893 $movkey 48($key_),$rndkey1 1894 pxor $twres,@tweak[2] 1895 1896 aesenc $rndkey0,$inout0 1897 pxor $twres,@tweak[3] 1898 movdqa @tweak[1],`16*1`(%rsp) 1899 aesenc $rndkey0,$inout1 1900 pxor $twres,@tweak[4] 1901 movdqa @tweak[2],`16*2`(%rsp) 1902 aesenc $rndkey0,$inout2 1903 aesenc $rndkey0,$inout3 1904 pxor $twres,$twmask 1905 movdqa @tweak[4],`16*4`(%rsp) 1906 aesenc $rndkey0,$inout4 1907 aesenc $rndkey0,$inout5 1908 $movkey 64($key_),$rndkey0 1909 movdqa $twmask,`16*5`(%rsp) 1910 pshufd \$0x5f,@tweak[5],$twres 1911 jmp .Lxts_enc_loop6 1912.align 32 1913.Lxts_enc_loop6: 1914 aesenc $rndkey1,$inout0 1915 aesenc $rndkey1,$inout1 1916 aesenc $rndkey1,$inout2 1917 aesenc $rndkey1,$inout3 1918 aesenc $rndkey1,$inout4 1919 aesenc $rndkey1,$inout5 1920 $movkey -64($key,%rax),$rndkey1 1921 add \$32,%rax 1922 1923 aesenc $rndkey0,$inout0 1924 aesenc $rndkey0,$inout1 1925 aesenc $rndkey0,$inout2 1926 aesenc $rndkey0,$inout3 1927 aesenc $rndkey0,$inout4 1928 aesenc $rndkey0,$inout5 1929 $movkey -80($key,%rax),$rndkey0 1930 jnz .Lxts_enc_loop6 1931 1932 movdqa (%r8),$twmask # start calculating next tweak 1933 movdqa $twres,$twtmp 1934 paddd $twres,$twres 1935 aesenc $rndkey1,$inout0 1936 paddq @tweak[5],@tweak[5] 1937 psrad \$31,$twtmp 1938 aesenc $rndkey1,$inout1 1939 pand $twmask,$twtmp 1940 $movkey ($key_),@tweak[0] # load round[0] 1941 aesenc $rndkey1,$inout2 1942 aesenc $rndkey1,$inout3 1943 aesenc $rndkey1,$inout4 1944 pxor $twtmp,@tweak[5] 1945 movaps @tweak[0],@tweak[1] # copy round[0] 1946 aesenc $rndkey1,$inout5 1947 $movkey -64($key),$rndkey1 1948 1949 movdqa $twres,$twtmp 1950 aesenc $rndkey0,$inout0 1951 paddd $twres,$twres 1952 pxor @tweak[5],@tweak[0] 1953 aesenc $rndkey0,$inout1 1954 psrad \$31,$twtmp 1955 paddq @tweak[5],@tweak[5] 1956 aesenc $rndkey0,$inout2 1957 aesenc $rndkey0,$inout3 1958 pand $twmask,$twtmp 1959 movaps @tweak[1],@tweak[2] 1960 aesenc $rndkey0,$inout4 1961 pxor $twtmp,@tweak[5] 1962 movdqa $twres,$twtmp 1963 aesenc $rndkey0,$inout5 1964 $movkey -48($key),$rndkey0 1965 1966 paddd $twres,$twres 1967 aesenc $rndkey1,$inout0 1968 pxor @tweak[5],@tweak[1] 1969 psrad \$31,$twtmp 1970 aesenc $rndkey1,$inout1 1971 paddq @tweak[5],@tweak[5] 1972 pand $twmask,$twtmp 1973 aesenc $rndkey1,$inout2 1974 aesenc $rndkey1,$inout3 1975 movdqa @tweak[3],`16*3`(%rsp) 1976 pxor $twtmp,@tweak[5] 1977 aesenc $rndkey1,$inout4 1978 movaps @tweak[2],@tweak[3] 1979 movdqa $twres,$twtmp 1980 aesenc $rndkey1,$inout5 1981 $movkey -32($key),$rndkey1 1982 1983 paddd $twres,$twres 1984 aesenc $rndkey0,$inout0 1985 pxor @tweak[5],@tweak[2] 1986 psrad \$31,$twtmp 1987 aesenc $rndkey0,$inout1 1988 paddq @tweak[5],@tweak[5] 1989 pand $twmask,$twtmp 1990 aesenc $rndkey0,$inout2 1991 aesenc $rndkey0,$inout3 1992 aesenc $rndkey0,$inout4 1993 pxor $twtmp,@tweak[5] 1994 movaps @tweak[3],@tweak[4] 1995 aesenc $rndkey0,$inout5 1996 1997 movdqa $twres,$rndkey0 1998 paddd $twres,$twres 1999 aesenc $rndkey1,$inout0 2000 pxor @tweak[5],@tweak[3] 2001 psrad \$31,$rndkey0 2002 aesenc $rndkey1,$inout1 2003 paddq @tweak[5],@tweak[5] 2004 pand $twmask,$rndkey0 2005 aesenc $rndkey1,$inout2 2006 aesenc $rndkey1,$inout3 2007 pxor $rndkey0,@tweak[5] 2008 $movkey ($key_),$rndkey0 2009 aesenc $rndkey1,$inout4 2010 aesenc $rndkey1,$inout5 2011 $movkey 16($key_),$rndkey1 2012 2013 pxor @tweak[5],@tweak[4] 2014 aesenclast `16*0`(%rsp),$inout0 2015 psrad \$31,$twres 2016 paddq @tweak[5],@tweak[5] 2017 aesenclast `16*1`(%rsp),$inout1 2018 aesenclast `16*2`(%rsp),$inout2 2019 pand $twmask,$twres 2020 mov %r10,%rax # restore $rounds 2021 aesenclast `16*3`(%rsp),$inout3 2022 aesenclast `16*4`(%rsp),$inout4 2023 aesenclast `16*5`(%rsp),$inout5 2024 pxor $twres,@tweak[5] 2025 2026 lea `16*6`($out),$out # $out+=6*16 2027 movups $inout0,`-16*6`($out) # store 6 output blocks 2028 movups $inout1,`-16*5`($out) 2029 movups $inout2,`-16*4`($out) 2030 movups $inout3,`-16*3`($out) 2031 movups $inout4,`-16*2`($out) 2032 movups $inout5,`-16*1`($out) 2033 sub \$16*6,$len 2034 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2035 2036 mov \$16+96,$rounds 2037 sub $rnds_,$rounds 2038 mov $key_,$key # restore $key 2039 shr \$4,$rounds # restore original value 2040 2041.Lxts_enc_short: 2042 # at the point @tweak[0..5] are populated with tweak values 2043 mov $rounds,$rnds_ # backup $rounds 2044 pxor $rndkey0,@tweak[0] 2045 add \$16*6,$len # restore real remaining $len 2046 jz .Lxts_enc_done # done if ($len==0) 2047 2048 pxor $rndkey0,@tweak[1] 2049 cmp \$0x20,$len 2050 jb .Lxts_enc_one # $len is 1*16 2051 pxor $rndkey0,@tweak[2] 2052 je .Lxts_enc_two # $len is 2*16 2053 2054 pxor $rndkey0,@tweak[3] 2055 cmp \$0x40,$len 2056 jb .Lxts_enc_three # $len is 3*16 2057 pxor $rndkey0,@tweak[4] 2058 je .Lxts_enc_four # $len is 4*16 2059 2060 movdqu ($inp),$inout0 # $len is 5*16 2061 movdqu 16*1($inp),$inout1 2062 movdqu 16*2($inp),$inout2 2063 pxor @tweak[0],$inout0 2064 movdqu 16*3($inp),$inout3 2065 pxor @tweak[1],$inout1 2066 movdqu 16*4($inp),$inout4 2067 lea 16*5($inp),$inp # $inp+=5*16 2068 pxor @tweak[2],$inout2 2069 pxor @tweak[3],$inout3 2070 pxor @tweak[4],$inout4 2071 pxor $inout5,$inout5 2072 2073 call _aesni_encrypt6 2074 2075 xorps @tweak[0],$inout0 2076 movdqa @tweak[5],@tweak[0] 2077 xorps @tweak[1],$inout1 2078 xorps @tweak[2],$inout2 2079 movdqu $inout0,($out) # store 5 output blocks 2080 xorps @tweak[3],$inout3 2081 movdqu $inout1,16*1($out) 2082 xorps @tweak[4],$inout4 2083 movdqu $inout2,16*2($out) 2084 movdqu $inout3,16*3($out) 2085 movdqu $inout4,16*4($out) 2086 lea 16*5($out),$out # $out+=5*16 2087 jmp .Lxts_enc_done 2088 2089.align 16 2090.Lxts_enc_one: 2091 movups ($inp),$inout0 2092 lea 16*1($inp),$inp # inp+=1*16 2093 xorps @tweak[0],$inout0 2094___ 2095 &aesni_generate1("enc",$key,$rounds); 2096$code.=<<___; 2097 xorps @tweak[0],$inout0 2098 movdqa @tweak[1],@tweak[0] 2099 movups $inout0,($out) # store one output block 2100 lea 16*1($out),$out # $out+=1*16 2101 jmp .Lxts_enc_done 2102 2103.align 16 2104.Lxts_enc_two: 2105 movups ($inp),$inout0 2106 movups 16($inp),$inout1 2107 lea 32($inp),$inp # $inp+=2*16 2108 xorps @tweak[0],$inout0 2109 xorps @tweak[1],$inout1 2110 2111 call _aesni_encrypt2 2112 2113 xorps @tweak[0],$inout0 2114 movdqa @tweak[2],@tweak[0] 2115 xorps @tweak[1],$inout1 2116 movups $inout0,($out) # store 2 output blocks 2117 movups $inout1,16*1($out) 2118 lea 16*2($out),$out # $out+=2*16 2119 jmp .Lxts_enc_done 2120 2121.align 16 2122.Lxts_enc_three: 2123 movups ($inp),$inout0 2124 movups 16*1($inp),$inout1 2125 movups 16*2($inp),$inout2 2126 lea 16*3($inp),$inp # $inp+=3*16 2127 xorps @tweak[0],$inout0 2128 xorps @tweak[1],$inout1 2129 xorps @tweak[2],$inout2 2130 2131 call _aesni_encrypt3 2132 2133 xorps @tweak[0],$inout0 2134 movdqa @tweak[3],@tweak[0] 2135 xorps @tweak[1],$inout1 2136 xorps @tweak[2],$inout2 2137 movups $inout0,($out) # store 3 output blocks 2138 movups $inout1,16*1($out) 2139 movups $inout2,16*2($out) 2140 lea 16*3($out),$out # $out+=3*16 2141 jmp .Lxts_enc_done 2142 2143.align 16 2144.Lxts_enc_four: 2145 movups ($inp),$inout0 2146 movups 16*1($inp),$inout1 2147 movups 16*2($inp),$inout2 2148 xorps @tweak[0],$inout0 2149 movups 16*3($inp),$inout3 2150 lea 16*4($inp),$inp # $inp+=4*16 2151 xorps @tweak[1],$inout1 2152 xorps @tweak[2],$inout2 2153 xorps @tweak[3],$inout3 2154 2155 call _aesni_encrypt4 2156 2157 pxor @tweak[0],$inout0 2158 movdqa @tweak[4],@tweak[0] 2159 pxor @tweak[1],$inout1 2160 pxor @tweak[2],$inout2 2161 movdqu $inout0,($out) # store 4 output blocks 2162 pxor @tweak[3],$inout3 2163 movdqu $inout1,16*1($out) 2164 movdqu $inout2,16*2($out) 2165 movdqu $inout3,16*3($out) 2166 lea 16*4($out),$out # $out+=4*16 2167 jmp .Lxts_enc_done 2168 2169.align 16 2170.Lxts_enc_done: 2171 and \$15,$len_ # see if $len%16 is 0 2172 jz .Lxts_enc_ret 2173 mov $len_,$len 2174 2175.Lxts_enc_steal: 2176 movzb ($inp),%eax # borrow $rounds ... 2177 movzb -16($out),%ecx # ... and $key 2178 lea 1($inp),$inp 2179 mov %al,-16($out) 2180 mov %cl,0($out) 2181 lea 1($out),$out 2182 sub \$1,$len 2183 jnz .Lxts_enc_steal 2184 2185 sub $len_,$out # rewind $out 2186 mov $key_,$key # restore $key 2187 mov $rnds_,$rounds # restore $rounds 2188 2189 movups -16($out),$inout0 2190 xorps @tweak[0],$inout0 2191___ 2192 &aesni_generate1("enc",$key,$rounds); 2193$code.=<<___; 2194 xorps @tweak[0],$inout0 2195 movups $inout0,-16($out) 2196 2197.Lxts_enc_ret: 2198 xorps %xmm0,%xmm0 # clear register bank 2199 pxor %xmm1,%xmm1 2200 pxor %xmm2,%xmm2 2201 pxor %xmm3,%xmm3 2202 pxor %xmm4,%xmm4 2203 pxor %xmm5,%xmm5 2204___ 2205$code.=<<___ if (!$win64); 2206 pxor %xmm6,%xmm6 2207 pxor %xmm7,%xmm7 2208 movaps %xmm0,0x00(%rsp) # clear stack 2209 pxor %xmm8,%xmm8 2210 movaps %xmm0,0x10(%rsp) 2211 pxor %xmm9,%xmm9 2212 movaps %xmm0,0x20(%rsp) 2213 pxor %xmm10,%xmm10 2214 movaps %xmm0,0x30(%rsp) 2215 pxor %xmm11,%xmm11 2216 movaps %xmm0,0x40(%rsp) 2217 pxor %xmm12,%xmm12 2218 movaps %xmm0,0x50(%rsp) 2219 pxor %xmm13,%xmm13 2220 movaps %xmm0,0x60(%rsp) 2221 pxor %xmm14,%xmm14 2222 pxor %xmm15,%xmm15 2223___ 2224$code.=<<___ if ($win64); 2225 movaps -0xa8(%r11),%xmm6 2226 movaps %xmm0,-0xa8(%r11) # clear stack 2227 movaps -0x98(%r11),%xmm7 2228 movaps %xmm0,-0x98(%r11) 2229 movaps -0x88(%r11),%xmm8 2230 movaps %xmm0,-0x88(%r11) 2231 movaps -0x78(%r11),%xmm9 2232 movaps %xmm0,-0x78(%r11) 2233 movaps -0x68(%r11),%xmm10 2234 movaps %xmm0,-0x68(%r11) 2235 movaps -0x58(%r11),%xmm11 2236 movaps %xmm0,-0x58(%r11) 2237 movaps -0x48(%r11),%xmm12 2238 movaps %xmm0,-0x48(%r11) 2239 movaps -0x38(%r11),%xmm13 2240 movaps %xmm0,-0x38(%r11) 2241 movaps -0x28(%r11),%xmm14 2242 movaps %xmm0,-0x28(%r11) 2243 movaps -0x18(%r11),%xmm15 2244 movaps %xmm0,-0x18(%r11) 2245 movaps %xmm0,0x00(%rsp) 2246 movaps %xmm0,0x10(%rsp) 2247 movaps %xmm0,0x20(%rsp) 2248 movaps %xmm0,0x30(%rsp) 2249 movaps %xmm0,0x40(%rsp) 2250 movaps %xmm0,0x50(%rsp) 2251 movaps %xmm0,0x60(%rsp) 2252___ 2253$code.=<<___; 2254 mov -8(%r11),%rbp 2255.cfi_restore %rbp 2256 lea (%r11),%rsp 2257.cfi_def_cfa_register %rsp 2258.Lxts_enc_epilogue: 2259 ret 2260.cfi_endproc 2261.size aesni_xts_encrypt,.-aesni_xts_encrypt 2262___ 2263 2264$code.=<<___; 2265.globl aesni_xts_decrypt 2266.type aesni_xts_decrypt,\@function,6 2267.align 16 2268aesni_xts_decrypt: 2269.cfi_startproc 2270 endbranch 2271 lea (%rsp),%r11 # frame pointer 2272.cfi_def_cfa_register %r11 2273 push %rbp 2274.cfi_push %rbp 2275 sub \$$frame_size,%rsp 2276 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2277___ 2278$code.=<<___ if ($win64); 2279 movaps %xmm6,-0xa8(%r11) # offload everything 2280 movaps %xmm7,-0x98(%r11) 2281 movaps %xmm8,-0x88(%r11) 2282 movaps %xmm9,-0x78(%r11) 2283 movaps %xmm10,-0x68(%r11) 2284 movaps %xmm11,-0x58(%r11) 2285 movaps %xmm12,-0x48(%r11) 2286 movaps %xmm13,-0x38(%r11) 2287 movaps %xmm14,-0x28(%r11) 2288 movaps %xmm15,-0x18(%r11) 2289.Lxts_dec_body: 2290___ 2291$code.=<<___; 2292 movups ($ivp),$inout0 # load clear-text tweak 2293 mov 240($key2),$rounds # key2->rounds 2294 mov 240($key),$rnds_ # key1->rounds 2295___ 2296 # generate the tweak 2297 &aesni_generate1("enc",$key2,$rounds,$inout0); 2298$code.=<<___; 2299 xor %eax,%eax # if ($len%16) len-=16; 2300 test \$15,$len 2301 setnz %al 2302 shl \$4,%rax 2303 sub %rax,$len 2304 2305 $movkey ($key),$rndkey0 # zero round key 2306 mov $key,$key_ # backup $key 2307 mov $rnds_,$rounds # backup $rounds 2308 shl \$4,$rnds_ 2309 mov $len,$len_ # backup $len 2310 and \$-16,$len 2311 2312 $movkey 16($key,$rnds_),$rndkey1 # last round key 2313 2314 movdqa .Lxts_magic(%rip),$twmask 2315 movdqa $inout0,@tweak[5] 2316 pshufd \$0x5f,$inout0,$twres 2317 pxor $rndkey0,$rndkey1 2318___ 2319 for ($i=0;$i<4;$i++) { 2320 $code.=<<___; 2321 movdqa $twres,$twtmp 2322 paddd $twres,$twres 2323 movdqa @tweak[5],@tweak[$i] 2324 psrad \$31,$twtmp # broadcast upper bits 2325 paddq @tweak[5],@tweak[5] 2326 pand $twmask,$twtmp 2327 pxor $rndkey0,@tweak[$i] 2328 pxor $twtmp,@tweak[5] 2329___ 2330 } 2331$code.=<<___; 2332 movdqa @tweak[5],@tweak[4] 2333 psrad \$31,$twres 2334 paddq @tweak[5],@tweak[5] 2335 pand $twmask,$twres 2336 pxor $rndkey0,@tweak[4] 2337 pxor $twres,@tweak[5] 2338 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2339 2340 sub \$16*6,$len 2341 jc .Lxts_dec_short # if $len-=6*16 borrowed 2342 2343 mov \$16+96,$rounds 2344 lea 32($key_,$rnds_),$key # end of key schedule 2345 sub %r10,%rax # twisted $rounds 2346 $movkey 16($key_),$rndkey1 2347 mov %rax,%r10 # backup twisted $rounds 2348 lea .Lxts_magic(%rip),%r8 2349 jmp .Lxts_dec_grandloop 2350 2351.align 32 2352.Lxts_dec_grandloop: 2353 movdqu `16*0`($inp),$inout0 # load input 2354 movdqa $rndkey0,$twmask 2355 movdqu `16*1`($inp),$inout1 2356 pxor @tweak[0],$inout0 # input^=tweak^round[0] 2357 movdqu `16*2`($inp),$inout2 2358 pxor @tweak[1],$inout1 2359 aesdec $rndkey1,$inout0 2360 movdqu `16*3`($inp),$inout3 2361 pxor @tweak[2],$inout2 2362 aesdec $rndkey1,$inout1 2363 movdqu `16*4`($inp),$inout4 2364 pxor @tweak[3],$inout3 2365 aesdec $rndkey1,$inout2 2366 movdqu `16*5`($inp),$inout5 2367 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2368 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2369 pxor @tweak[4],$inout4 2370 aesdec $rndkey1,$inout3 2371 $movkey 32($key_),$rndkey0 2372 lea `16*6`($inp),$inp 2373 pxor $twmask,$inout5 2374 2375 pxor $twres,@tweak[0] # calculate tweaks^round[last] 2376 aesdec $rndkey1,$inout4 2377 pxor $twres,@tweak[1] 2378 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2379 aesdec $rndkey1,$inout5 2380 $movkey 48($key_),$rndkey1 2381 pxor $twres,@tweak[2] 2382 2383 aesdec $rndkey0,$inout0 2384 pxor $twres,@tweak[3] 2385 movdqa @tweak[1],`16*1`(%rsp) 2386 aesdec $rndkey0,$inout1 2387 pxor $twres,@tweak[4] 2388 movdqa @tweak[2],`16*2`(%rsp) 2389 aesdec $rndkey0,$inout2 2390 aesdec $rndkey0,$inout3 2391 pxor $twres,$twmask 2392 movdqa @tweak[4],`16*4`(%rsp) 2393 aesdec $rndkey0,$inout4 2394 aesdec $rndkey0,$inout5 2395 $movkey 64($key_),$rndkey0 2396 movdqa $twmask,`16*5`(%rsp) 2397 pshufd \$0x5f,@tweak[5],$twres 2398 jmp .Lxts_dec_loop6 2399.align 32 2400.Lxts_dec_loop6: 2401 aesdec $rndkey1,$inout0 2402 aesdec $rndkey1,$inout1 2403 aesdec $rndkey1,$inout2 2404 aesdec $rndkey1,$inout3 2405 aesdec $rndkey1,$inout4 2406 aesdec $rndkey1,$inout5 2407 $movkey -64($key,%rax),$rndkey1 2408 add \$32,%rax 2409 2410 aesdec $rndkey0,$inout0 2411 aesdec $rndkey0,$inout1 2412 aesdec $rndkey0,$inout2 2413 aesdec $rndkey0,$inout3 2414 aesdec $rndkey0,$inout4 2415 aesdec $rndkey0,$inout5 2416 $movkey -80($key,%rax),$rndkey0 2417 jnz .Lxts_dec_loop6 2418 2419 movdqa (%r8),$twmask # start calculating next tweak 2420 movdqa $twres,$twtmp 2421 paddd $twres,$twres 2422 aesdec $rndkey1,$inout0 2423 paddq @tweak[5],@tweak[5] 2424 psrad \$31,$twtmp 2425 aesdec $rndkey1,$inout1 2426 pand $twmask,$twtmp 2427 $movkey ($key_),@tweak[0] # load round[0] 2428 aesdec $rndkey1,$inout2 2429 aesdec $rndkey1,$inout3 2430 aesdec $rndkey1,$inout4 2431 pxor $twtmp,@tweak[5] 2432 movaps @tweak[0],@tweak[1] # copy round[0] 2433 aesdec $rndkey1,$inout5 2434 $movkey -64($key),$rndkey1 2435 2436 movdqa $twres,$twtmp 2437 aesdec $rndkey0,$inout0 2438 paddd $twres,$twres 2439 pxor @tweak[5],@tweak[0] 2440 aesdec $rndkey0,$inout1 2441 psrad \$31,$twtmp 2442 paddq @tweak[5],@tweak[5] 2443 aesdec $rndkey0,$inout2 2444 aesdec $rndkey0,$inout3 2445 pand $twmask,$twtmp 2446 movaps @tweak[1],@tweak[2] 2447 aesdec $rndkey0,$inout4 2448 pxor $twtmp,@tweak[5] 2449 movdqa $twres,$twtmp 2450 aesdec $rndkey0,$inout5 2451 $movkey -48($key),$rndkey0 2452 2453 paddd $twres,$twres 2454 aesdec $rndkey1,$inout0 2455 pxor @tweak[5],@tweak[1] 2456 psrad \$31,$twtmp 2457 aesdec $rndkey1,$inout1 2458 paddq @tweak[5],@tweak[5] 2459 pand $twmask,$twtmp 2460 aesdec $rndkey1,$inout2 2461 aesdec $rndkey1,$inout3 2462 movdqa @tweak[3],`16*3`(%rsp) 2463 pxor $twtmp,@tweak[5] 2464 aesdec $rndkey1,$inout4 2465 movaps @tweak[2],@tweak[3] 2466 movdqa $twres,$twtmp 2467 aesdec $rndkey1,$inout5 2468 $movkey -32($key),$rndkey1 2469 2470 paddd $twres,$twres 2471 aesdec $rndkey0,$inout0 2472 pxor @tweak[5],@tweak[2] 2473 psrad \$31,$twtmp 2474 aesdec $rndkey0,$inout1 2475 paddq @tweak[5],@tweak[5] 2476 pand $twmask,$twtmp 2477 aesdec $rndkey0,$inout2 2478 aesdec $rndkey0,$inout3 2479 aesdec $rndkey0,$inout4 2480 pxor $twtmp,@tweak[5] 2481 movaps @tweak[3],@tweak[4] 2482 aesdec $rndkey0,$inout5 2483 2484 movdqa $twres,$rndkey0 2485 paddd $twres,$twres 2486 aesdec $rndkey1,$inout0 2487 pxor @tweak[5],@tweak[3] 2488 psrad \$31,$rndkey0 2489 aesdec $rndkey1,$inout1 2490 paddq @tweak[5],@tweak[5] 2491 pand $twmask,$rndkey0 2492 aesdec $rndkey1,$inout2 2493 aesdec $rndkey1,$inout3 2494 pxor $rndkey0,@tweak[5] 2495 $movkey ($key_),$rndkey0 2496 aesdec $rndkey1,$inout4 2497 aesdec $rndkey1,$inout5 2498 $movkey 16($key_),$rndkey1 2499 2500 pxor @tweak[5],@tweak[4] 2501 aesdeclast `16*0`(%rsp),$inout0 2502 psrad \$31,$twres 2503 paddq @tweak[5],@tweak[5] 2504 aesdeclast `16*1`(%rsp),$inout1 2505 aesdeclast `16*2`(%rsp),$inout2 2506 pand $twmask,$twres 2507 mov %r10,%rax # restore $rounds 2508 aesdeclast `16*3`(%rsp),$inout3 2509 aesdeclast `16*4`(%rsp),$inout4 2510 aesdeclast `16*5`(%rsp),$inout5 2511 pxor $twres,@tweak[5] 2512 2513 lea `16*6`($out),$out # $out+=6*16 2514 movups $inout0,`-16*6`($out) # store 6 output blocks 2515 movups $inout1,`-16*5`($out) 2516 movups $inout2,`-16*4`($out) 2517 movups $inout3,`-16*3`($out) 2518 movups $inout4,`-16*2`($out) 2519 movups $inout5,`-16*1`($out) 2520 sub \$16*6,$len 2521 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2522 2523 mov \$16+96,$rounds 2524 sub $rnds_,$rounds 2525 mov $key_,$key # restore $key 2526 shr \$4,$rounds # restore original value 2527 2528.Lxts_dec_short: 2529 # at the point @tweak[0..5] are populated with tweak values 2530 mov $rounds,$rnds_ # backup $rounds 2531 pxor $rndkey0,@tweak[0] 2532 pxor $rndkey0,@tweak[1] 2533 add \$16*6,$len # restore real remaining $len 2534 jz .Lxts_dec_done # done if ($len==0) 2535 2536 pxor $rndkey0,@tweak[2] 2537 cmp \$0x20,$len 2538 jb .Lxts_dec_one # $len is 1*16 2539 pxor $rndkey0,@tweak[3] 2540 je .Lxts_dec_two # $len is 2*16 2541 2542 pxor $rndkey0,@tweak[4] 2543 cmp \$0x40,$len 2544 jb .Lxts_dec_three # $len is 3*16 2545 je .Lxts_dec_four # $len is 4*16 2546 2547 movdqu ($inp),$inout0 # $len is 5*16 2548 movdqu 16*1($inp),$inout1 2549 movdqu 16*2($inp),$inout2 2550 pxor @tweak[0],$inout0 2551 movdqu 16*3($inp),$inout3 2552 pxor @tweak[1],$inout1 2553 movdqu 16*4($inp),$inout4 2554 lea 16*5($inp),$inp # $inp+=5*16 2555 pxor @tweak[2],$inout2 2556 pxor @tweak[3],$inout3 2557 pxor @tweak[4],$inout4 2558 2559 call _aesni_decrypt6 2560 2561 xorps @tweak[0],$inout0 2562 xorps @tweak[1],$inout1 2563 xorps @tweak[2],$inout2 2564 movdqu $inout0,($out) # store 5 output blocks 2565 xorps @tweak[3],$inout3 2566 movdqu $inout1,16*1($out) 2567 xorps @tweak[4],$inout4 2568 movdqu $inout2,16*2($out) 2569 pxor $twtmp,$twtmp 2570 movdqu $inout3,16*3($out) 2571 pcmpgtd @tweak[5],$twtmp 2572 movdqu $inout4,16*4($out) 2573 lea 16*5($out),$out # $out+=5*16 2574 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2575 and \$15,$len_ 2576 jz .Lxts_dec_ret 2577 2578 movdqa @tweak[5],@tweak[0] 2579 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2580 pand $twmask,@tweak[1] # isolate carry and residue 2581 pxor @tweak[5],@tweak[1] 2582 jmp .Lxts_dec_done2 2583 2584.align 16 2585.Lxts_dec_one: 2586 movups ($inp),$inout0 2587 lea 16*1($inp),$inp # $inp+=1*16 2588 xorps @tweak[0],$inout0 2589___ 2590 &aesni_generate1("dec",$key,$rounds); 2591$code.=<<___; 2592 xorps @tweak[0],$inout0 2593 movdqa @tweak[1],@tweak[0] 2594 movups $inout0,($out) # store one output block 2595 movdqa @tweak[2],@tweak[1] 2596 lea 16*1($out),$out # $out+=1*16 2597 jmp .Lxts_dec_done 2598 2599.align 16 2600.Lxts_dec_two: 2601 movups ($inp),$inout0 2602 movups 16($inp),$inout1 2603 lea 32($inp),$inp # $inp+=2*16 2604 xorps @tweak[0],$inout0 2605 xorps @tweak[1],$inout1 2606 2607 call _aesni_decrypt2 2608 2609 xorps @tweak[0],$inout0 2610 movdqa @tweak[2],@tweak[0] 2611 xorps @tweak[1],$inout1 2612 movdqa @tweak[3],@tweak[1] 2613 movups $inout0,($out) # store 2 output blocks 2614 movups $inout1,16*1($out) 2615 lea 16*2($out),$out # $out+=2*16 2616 jmp .Lxts_dec_done 2617 2618.align 16 2619.Lxts_dec_three: 2620 movups ($inp),$inout0 2621 movups 16*1($inp),$inout1 2622 movups 16*2($inp),$inout2 2623 lea 16*3($inp),$inp # $inp+=3*16 2624 xorps @tweak[0],$inout0 2625 xorps @tweak[1],$inout1 2626 xorps @tweak[2],$inout2 2627 2628 call _aesni_decrypt3 2629 2630 xorps @tweak[0],$inout0 2631 movdqa @tweak[3],@tweak[0] 2632 xorps @tweak[1],$inout1 2633 movdqa @tweak[4],@tweak[1] 2634 xorps @tweak[2],$inout2 2635 movups $inout0,($out) # store 3 output blocks 2636 movups $inout1,16*1($out) 2637 movups $inout2,16*2($out) 2638 lea 16*3($out),$out # $out+=3*16 2639 jmp .Lxts_dec_done 2640 2641.align 16 2642.Lxts_dec_four: 2643 movups ($inp),$inout0 2644 movups 16*1($inp),$inout1 2645 movups 16*2($inp),$inout2 2646 xorps @tweak[0],$inout0 2647 movups 16*3($inp),$inout3 2648 lea 16*4($inp),$inp # $inp+=4*16 2649 xorps @tweak[1],$inout1 2650 xorps @tweak[2],$inout2 2651 xorps @tweak[3],$inout3 2652 2653 call _aesni_decrypt4 2654 2655 pxor @tweak[0],$inout0 2656 movdqa @tweak[4],@tweak[0] 2657 pxor @tweak[1],$inout1 2658 movdqa @tweak[5],@tweak[1] 2659 pxor @tweak[2],$inout2 2660 movdqu $inout0,($out) # store 4 output blocks 2661 pxor @tweak[3],$inout3 2662 movdqu $inout1,16*1($out) 2663 movdqu $inout2,16*2($out) 2664 movdqu $inout3,16*3($out) 2665 lea 16*4($out),$out # $out+=4*16 2666 jmp .Lxts_dec_done 2667 2668.align 16 2669.Lxts_dec_done: 2670 and \$15,$len_ # see if $len%16 is 0 2671 jz .Lxts_dec_ret 2672.Lxts_dec_done2: 2673 mov $len_,$len 2674 mov $key_,$key # restore $key 2675 mov $rnds_,$rounds # restore $rounds 2676 2677 movups ($inp),$inout0 2678 xorps @tweak[1],$inout0 2679___ 2680 &aesni_generate1("dec",$key,$rounds); 2681$code.=<<___; 2682 xorps @tweak[1],$inout0 2683 movups $inout0,($out) 2684 2685.Lxts_dec_steal: 2686 movzb 16($inp),%eax # borrow $rounds ... 2687 movzb ($out),%ecx # ... and $key 2688 lea 1($inp),$inp 2689 mov %al,($out) 2690 mov %cl,16($out) 2691 lea 1($out),$out 2692 sub \$1,$len 2693 jnz .Lxts_dec_steal 2694 2695 sub $len_,$out # rewind $out 2696 mov $key_,$key # restore $key 2697 mov $rnds_,$rounds # restore $rounds 2698 2699 movups ($out),$inout0 2700 xorps @tweak[0],$inout0 2701___ 2702 &aesni_generate1("dec",$key,$rounds); 2703$code.=<<___; 2704 xorps @tweak[0],$inout0 2705 movups $inout0,($out) 2706 2707.Lxts_dec_ret: 2708 xorps %xmm0,%xmm0 # clear register bank 2709 pxor %xmm1,%xmm1 2710 pxor %xmm2,%xmm2 2711 pxor %xmm3,%xmm3 2712 pxor %xmm4,%xmm4 2713 pxor %xmm5,%xmm5 2714___ 2715$code.=<<___ if (!$win64); 2716 pxor %xmm6,%xmm6 2717 pxor %xmm7,%xmm7 2718 movaps %xmm0,0x00(%rsp) # clear stack 2719 pxor %xmm8,%xmm8 2720 movaps %xmm0,0x10(%rsp) 2721 pxor %xmm9,%xmm9 2722 movaps %xmm0,0x20(%rsp) 2723 pxor %xmm10,%xmm10 2724 movaps %xmm0,0x30(%rsp) 2725 pxor %xmm11,%xmm11 2726 movaps %xmm0,0x40(%rsp) 2727 pxor %xmm12,%xmm12 2728 movaps %xmm0,0x50(%rsp) 2729 pxor %xmm13,%xmm13 2730 movaps %xmm0,0x60(%rsp) 2731 pxor %xmm14,%xmm14 2732 pxor %xmm15,%xmm15 2733___ 2734$code.=<<___ if ($win64); 2735 movaps -0xa8(%r11),%xmm6 2736 movaps %xmm0,-0xa8(%r11) # clear stack 2737 movaps -0x98(%r11),%xmm7 2738 movaps %xmm0,-0x98(%r11) 2739 movaps -0x88(%r11),%xmm8 2740 movaps %xmm0,-0x88(%r11) 2741 movaps -0x78(%r11),%xmm9 2742 movaps %xmm0,-0x78(%r11) 2743 movaps -0x68(%r11),%xmm10 2744 movaps %xmm0,-0x68(%r11) 2745 movaps -0x58(%r11),%xmm11 2746 movaps %xmm0,-0x58(%r11) 2747 movaps -0x48(%r11),%xmm12 2748 movaps %xmm0,-0x48(%r11) 2749 movaps -0x38(%r11),%xmm13 2750 movaps %xmm0,-0x38(%r11) 2751 movaps -0x28(%r11),%xmm14 2752 movaps %xmm0,-0x28(%r11) 2753 movaps -0x18(%r11),%xmm15 2754 movaps %xmm0,-0x18(%r11) 2755 movaps %xmm0,0x00(%rsp) 2756 movaps %xmm0,0x10(%rsp) 2757 movaps %xmm0,0x20(%rsp) 2758 movaps %xmm0,0x30(%rsp) 2759 movaps %xmm0,0x40(%rsp) 2760 movaps %xmm0,0x50(%rsp) 2761 movaps %xmm0,0x60(%rsp) 2762___ 2763$code.=<<___; 2764 mov -8(%r11),%rbp 2765.cfi_restore %rbp 2766 lea (%r11),%rsp 2767.cfi_def_cfa_register %rsp 2768.Lxts_dec_epilogue: 2769 ret 2770.cfi_endproc 2771.size aesni_xts_decrypt,.-aesni_xts_decrypt 2772___ 2773} 2774 2775###################################################################### 2776# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 2777# const AES_KEY *key, unsigned int start_block_num, 2778# unsigned char offset_i[16], const unsigned char L_[][16], 2779# unsigned char checksum[16]); 2780# 2781{ 2782my @offset=map("%xmm$_",(10..15)); 2783my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); 2784my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments 2785my ($L_p,$checksum_p) = ("%rbx","%rbp"); 2786my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); 2787my $seventh_arg = $win64 ? 56 : 8; 2788my $blocks = $len; 2789 2790$code.=<<___; 2791.globl aesni_ocb_encrypt 2792.type aesni_ocb_encrypt,\@function,6 2793.align 32 2794aesni_ocb_encrypt: 2795.cfi_startproc 2796 endbranch 2797 lea (%rsp),%rax 2798 push %rbx 2799.cfi_push %rbx 2800 push %rbp 2801.cfi_push %rbp 2802 push %r12 2803.cfi_push %r12 2804 push %r13 2805.cfi_push %r13 2806 push %r14 2807.cfi_push %r14 2808___ 2809$code.=<<___ if ($win64); 2810 lea -0xa0(%rsp),%rsp 2811 movaps %xmm6,0x00(%rsp) # offload everything 2812 movaps %xmm7,0x10(%rsp) 2813 movaps %xmm8,0x20(%rsp) 2814 movaps %xmm9,0x30(%rsp) 2815 movaps %xmm10,0x40(%rsp) 2816 movaps %xmm11,0x50(%rsp) 2817 movaps %xmm12,0x60(%rsp) 2818 movaps %xmm13,0x70(%rsp) 2819 movaps %xmm14,0x80(%rsp) 2820 movaps %xmm15,0x90(%rsp) 2821.Locb_enc_body: 2822___ 2823$code.=<<___; 2824 mov $seventh_arg(%rax),$L_p # 7th argument 2825 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 2826 2827 mov 240($key),$rnds_ 2828 mov $key,$key_ 2829 shl \$4,$rnds_ 2830 $movkey ($key),$rndkey0l # round[0] 2831 $movkey 16($key,$rnds_),$rndkey1 # round[last] 2832 2833 movdqu ($offset_p),@offset[5] # load last offset_i 2834 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 2835 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 2836 2837 mov \$16+32,$rounds 2838 lea 32($key_,$rnds_),$key 2839 $movkey 16($key_),$rndkey1 # round[1] 2840 sub %r10,%rax # twisted $rounds 2841 mov %rax,%r10 # backup twisted $rounds 2842 2843 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 2844 movdqu ($checksum_p),$checksum # load checksum 2845 2846 test \$1,$block_num # is first block number odd? 2847 jnz .Locb_enc_odd 2848 2849 bsf $block_num,$i1 2850 add \$1,$block_num 2851 shl \$4,$i1 2852 movdqu ($L_p,$i1),$inout5 # borrow 2853 movdqu ($inp),$inout0 2854 lea 16($inp),$inp 2855 2856 call __ocb_encrypt1 2857 2858 movdqa $inout5,@offset[5] 2859 movups $inout0,($out) 2860 lea 16($out),$out 2861 sub \$1,$blocks 2862 jz .Locb_enc_done 2863 2864.Locb_enc_odd: 2865 lea 1($block_num),$i1 # even-numbered blocks 2866 lea 3($block_num),$i3 2867 lea 5($block_num),$i5 2868 lea 6($block_num),$block_num 2869 bsf $i1,$i1 # ntz(block) 2870 bsf $i3,$i3 2871 bsf $i5,$i5 2872 shl \$4,$i1 # ntz(block) -> table offset 2873 shl \$4,$i3 2874 shl \$4,$i5 2875 2876 sub \$6,$blocks 2877 jc .Locb_enc_short 2878 jmp .Locb_enc_grandloop 2879 2880.align 32 2881.Locb_enc_grandloop: 2882 movdqu `16*0`($inp),$inout0 # load input 2883 movdqu `16*1`($inp),$inout1 2884 movdqu `16*2`($inp),$inout2 2885 movdqu `16*3`($inp),$inout3 2886 movdqu `16*4`($inp),$inout4 2887 movdqu `16*5`($inp),$inout5 2888 lea `16*6`($inp),$inp 2889 2890 call __ocb_encrypt6 2891 2892 movups $inout0,`16*0`($out) # store output 2893 movups $inout1,`16*1`($out) 2894 movups $inout2,`16*2`($out) 2895 movups $inout3,`16*3`($out) 2896 movups $inout4,`16*4`($out) 2897 movups $inout5,`16*5`($out) 2898 lea `16*6`($out),$out 2899 sub \$6,$blocks 2900 jnc .Locb_enc_grandloop 2901 2902.Locb_enc_short: 2903 add \$6,$blocks 2904 jz .Locb_enc_done 2905 2906 movdqu `16*0`($inp),$inout0 2907 cmp \$2,$blocks 2908 jb .Locb_enc_one 2909 movdqu `16*1`($inp),$inout1 2910 je .Locb_enc_two 2911 2912 movdqu `16*2`($inp),$inout2 2913 cmp \$4,$blocks 2914 jb .Locb_enc_three 2915 movdqu `16*3`($inp),$inout3 2916 je .Locb_enc_four 2917 2918 movdqu `16*4`($inp),$inout4 2919 pxor $inout5,$inout5 2920 2921 call __ocb_encrypt6 2922 2923 movdqa @offset[4],@offset[5] 2924 movups $inout0,`16*0`($out) 2925 movups $inout1,`16*1`($out) 2926 movups $inout2,`16*2`($out) 2927 movups $inout3,`16*3`($out) 2928 movups $inout4,`16*4`($out) 2929 2930 jmp .Locb_enc_done 2931 2932.align 16 2933.Locb_enc_one: 2934 movdqa @offset[0],$inout5 # borrow 2935 2936 call __ocb_encrypt1 2937 2938 movdqa $inout5,@offset[5] 2939 movups $inout0,`16*0`($out) 2940 jmp .Locb_enc_done 2941 2942.align 16 2943.Locb_enc_two: 2944 pxor $inout2,$inout2 2945 pxor $inout3,$inout3 2946 2947 call __ocb_encrypt4 2948 2949 movdqa @offset[1],@offset[5] 2950 movups $inout0,`16*0`($out) 2951 movups $inout1,`16*1`($out) 2952 2953 jmp .Locb_enc_done 2954 2955.align 16 2956.Locb_enc_three: 2957 pxor $inout3,$inout3 2958 2959 call __ocb_encrypt4 2960 2961 movdqa @offset[2],@offset[5] 2962 movups $inout0,`16*0`($out) 2963 movups $inout1,`16*1`($out) 2964 movups $inout2,`16*2`($out) 2965 2966 jmp .Locb_enc_done 2967 2968.align 16 2969.Locb_enc_four: 2970 call __ocb_encrypt4 2971 2972 movdqa @offset[3],@offset[5] 2973 movups $inout0,`16*0`($out) 2974 movups $inout1,`16*1`($out) 2975 movups $inout2,`16*2`($out) 2976 movups $inout3,`16*3`($out) 2977 2978.Locb_enc_done: 2979 pxor $rndkey0,@offset[5] # "remove" round[last] 2980 movdqu $checksum,($checksum_p) # store checksum 2981 movdqu @offset[5],($offset_p) # store last offset_i 2982 2983 xorps %xmm0,%xmm0 # clear register bank 2984 pxor %xmm1,%xmm1 2985 pxor %xmm2,%xmm2 2986 pxor %xmm3,%xmm3 2987 pxor %xmm4,%xmm4 2988 pxor %xmm5,%xmm5 2989___ 2990$code.=<<___ if (!$win64); 2991 pxor %xmm6,%xmm6 2992 pxor %xmm7,%xmm7 2993 pxor %xmm8,%xmm8 2994 pxor %xmm9,%xmm9 2995 pxor %xmm10,%xmm10 2996 pxor %xmm11,%xmm11 2997 pxor %xmm12,%xmm12 2998 pxor %xmm13,%xmm13 2999 pxor %xmm14,%xmm14 3000 pxor %xmm15,%xmm15 3001 lea 0x28(%rsp),%rax 3002.cfi_def_cfa %rax,8 3003___ 3004$code.=<<___ if ($win64); 3005 movaps 0x00(%rsp),%xmm6 3006 movaps %xmm0,0x00(%rsp) # clear stack 3007 movaps 0x10(%rsp),%xmm7 3008 movaps %xmm0,0x10(%rsp) 3009 movaps 0x20(%rsp),%xmm8 3010 movaps %xmm0,0x20(%rsp) 3011 movaps 0x30(%rsp),%xmm9 3012 movaps %xmm0,0x30(%rsp) 3013 movaps 0x40(%rsp),%xmm10 3014 movaps %xmm0,0x40(%rsp) 3015 movaps 0x50(%rsp),%xmm11 3016 movaps %xmm0,0x50(%rsp) 3017 movaps 0x60(%rsp),%xmm12 3018 movaps %xmm0,0x60(%rsp) 3019 movaps 0x70(%rsp),%xmm13 3020 movaps %xmm0,0x70(%rsp) 3021 movaps 0x80(%rsp),%xmm14 3022 movaps %xmm0,0x80(%rsp) 3023 movaps 0x90(%rsp),%xmm15 3024 movaps %xmm0,0x90(%rsp) 3025 lea 0xa0+0x28(%rsp),%rax 3026.Locb_enc_pop: 3027___ 3028$code.=<<___; 3029 mov -40(%rax),%r14 3030.cfi_restore %r14 3031 mov -32(%rax),%r13 3032.cfi_restore %r13 3033 mov -24(%rax),%r12 3034.cfi_restore %r12 3035 mov -16(%rax),%rbp 3036.cfi_restore %rbp 3037 mov -8(%rax),%rbx 3038.cfi_restore %rbx 3039 lea (%rax),%rsp 3040.cfi_def_cfa_register %rsp 3041.Locb_enc_epilogue: 3042 ret 3043.cfi_endproc 3044.size aesni_ocb_encrypt,.-aesni_ocb_encrypt 3045 3046.type __ocb_encrypt6,\@abi-omnipotent 3047.align 32 3048__ocb_encrypt6: 3049.cfi_startproc 3050 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3051 movdqu ($L_p,$i1),@offset[1] 3052 movdqa @offset[0],@offset[2] 3053 movdqu ($L_p,$i3),@offset[3] 3054 movdqa @offset[0],@offset[4] 3055 pxor @offset[5],@offset[0] 3056 movdqu ($L_p,$i5),@offset[5] 3057 pxor @offset[0],@offset[1] 3058 pxor $inout0,$checksum # accumulate checksum 3059 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3060 pxor @offset[1],@offset[2] 3061 pxor $inout1,$checksum 3062 pxor @offset[1],$inout1 3063 pxor @offset[2],@offset[3] 3064 pxor $inout2,$checksum 3065 pxor @offset[2],$inout2 3066 pxor @offset[3],@offset[4] 3067 pxor $inout3,$checksum 3068 pxor @offset[3],$inout3 3069 pxor @offset[4],@offset[5] 3070 pxor $inout4,$checksum 3071 pxor @offset[4],$inout4 3072 pxor $inout5,$checksum 3073 pxor @offset[5],$inout5 3074 $movkey 32($key_),$rndkey0 3075 3076 lea 1($block_num),$i1 # even-numbered blocks 3077 lea 3($block_num),$i3 3078 lea 5($block_num),$i5 3079 add \$6,$block_num 3080 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3081 bsf $i1,$i1 # ntz(block) 3082 bsf $i3,$i3 3083 bsf $i5,$i5 3084 3085 aesenc $rndkey1,$inout0 3086 aesenc $rndkey1,$inout1 3087 aesenc $rndkey1,$inout2 3088 aesenc $rndkey1,$inout3 3089 pxor $rndkey0l,@offset[1] 3090 pxor $rndkey0l,@offset[2] 3091 aesenc $rndkey1,$inout4 3092 pxor $rndkey0l,@offset[3] 3093 pxor $rndkey0l,@offset[4] 3094 aesenc $rndkey1,$inout5 3095 $movkey 48($key_),$rndkey1 3096 pxor $rndkey0l,@offset[5] 3097 3098 aesenc $rndkey0,$inout0 3099 aesenc $rndkey0,$inout1 3100 aesenc $rndkey0,$inout2 3101 aesenc $rndkey0,$inout3 3102 aesenc $rndkey0,$inout4 3103 aesenc $rndkey0,$inout5 3104 $movkey 64($key_),$rndkey0 3105 shl \$4,$i1 # ntz(block) -> table offset 3106 shl \$4,$i3 3107 jmp .Locb_enc_loop6 3108 3109.align 32 3110.Locb_enc_loop6: 3111 aesenc $rndkey1,$inout0 3112 aesenc $rndkey1,$inout1 3113 aesenc $rndkey1,$inout2 3114 aesenc $rndkey1,$inout3 3115 aesenc $rndkey1,$inout4 3116 aesenc $rndkey1,$inout5 3117 $movkey ($key,%rax),$rndkey1 3118 add \$32,%rax 3119 3120 aesenc $rndkey0,$inout0 3121 aesenc $rndkey0,$inout1 3122 aesenc $rndkey0,$inout2 3123 aesenc $rndkey0,$inout3 3124 aesenc $rndkey0,$inout4 3125 aesenc $rndkey0,$inout5 3126 $movkey -16($key,%rax),$rndkey0 3127 jnz .Locb_enc_loop6 3128 3129 aesenc $rndkey1,$inout0 3130 aesenc $rndkey1,$inout1 3131 aesenc $rndkey1,$inout2 3132 aesenc $rndkey1,$inout3 3133 aesenc $rndkey1,$inout4 3134 aesenc $rndkey1,$inout5 3135 $movkey 16($key_),$rndkey1 3136 shl \$4,$i5 3137 3138 aesenclast @offset[0],$inout0 3139 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3140 mov %r10,%rax # restore twisted rounds 3141 aesenclast @offset[1],$inout1 3142 aesenclast @offset[2],$inout2 3143 aesenclast @offset[3],$inout3 3144 aesenclast @offset[4],$inout4 3145 aesenclast @offset[5],$inout5 3146 ret 3147.cfi_endproc 3148.size __ocb_encrypt6,.-__ocb_encrypt6 3149 3150.type __ocb_encrypt4,\@abi-omnipotent 3151.align 32 3152__ocb_encrypt4: 3153.cfi_startproc 3154 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3155 movdqu ($L_p,$i1),@offset[1] 3156 movdqa @offset[0],@offset[2] 3157 movdqu ($L_p,$i3),@offset[3] 3158 pxor @offset[5],@offset[0] 3159 pxor @offset[0],@offset[1] 3160 pxor $inout0,$checksum # accumulate checksum 3161 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3162 pxor @offset[1],@offset[2] 3163 pxor $inout1,$checksum 3164 pxor @offset[1],$inout1 3165 pxor @offset[2],@offset[3] 3166 pxor $inout2,$checksum 3167 pxor @offset[2],$inout2 3168 pxor $inout3,$checksum 3169 pxor @offset[3],$inout3 3170 $movkey 32($key_),$rndkey0 3171 3172 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3173 pxor $rndkey0l,@offset[1] 3174 pxor $rndkey0l,@offset[2] 3175 pxor $rndkey0l,@offset[3] 3176 3177 aesenc $rndkey1,$inout0 3178 aesenc $rndkey1,$inout1 3179 aesenc $rndkey1,$inout2 3180 aesenc $rndkey1,$inout3 3181 $movkey 48($key_),$rndkey1 3182 3183 aesenc $rndkey0,$inout0 3184 aesenc $rndkey0,$inout1 3185 aesenc $rndkey0,$inout2 3186 aesenc $rndkey0,$inout3 3187 $movkey 64($key_),$rndkey0 3188 jmp .Locb_enc_loop4 3189 3190.align 32 3191.Locb_enc_loop4: 3192 aesenc $rndkey1,$inout0 3193 aesenc $rndkey1,$inout1 3194 aesenc $rndkey1,$inout2 3195 aesenc $rndkey1,$inout3 3196 $movkey ($key,%rax),$rndkey1 3197 add \$32,%rax 3198 3199 aesenc $rndkey0,$inout0 3200 aesenc $rndkey0,$inout1 3201 aesenc $rndkey0,$inout2 3202 aesenc $rndkey0,$inout3 3203 $movkey -16($key,%rax),$rndkey0 3204 jnz .Locb_enc_loop4 3205 3206 aesenc $rndkey1,$inout0 3207 aesenc $rndkey1,$inout1 3208 aesenc $rndkey1,$inout2 3209 aesenc $rndkey1,$inout3 3210 $movkey 16($key_),$rndkey1 3211 mov %r10,%rax # restore twisted rounds 3212 3213 aesenclast @offset[0],$inout0 3214 aesenclast @offset[1],$inout1 3215 aesenclast @offset[2],$inout2 3216 aesenclast @offset[3],$inout3 3217 ret 3218.cfi_endproc 3219.size __ocb_encrypt4,.-__ocb_encrypt4 3220 3221.type __ocb_encrypt1,\@abi-omnipotent 3222.align 32 3223__ocb_encrypt1: 3224.cfi_startproc 3225 pxor @offset[5],$inout5 # offset_i 3226 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3227 pxor $inout0,$checksum # accumulate checksum 3228 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3229 $movkey 32($key_),$rndkey0 3230 3231 aesenc $rndkey1,$inout0 3232 $movkey 48($key_),$rndkey1 3233 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3234 3235 aesenc $rndkey0,$inout0 3236 $movkey 64($key_),$rndkey0 3237 jmp .Locb_enc_loop1 3238 3239.align 32 3240.Locb_enc_loop1: 3241 aesenc $rndkey1,$inout0 3242 $movkey ($key,%rax),$rndkey1 3243 add \$32,%rax 3244 3245 aesenc $rndkey0,$inout0 3246 $movkey -16($key,%rax),$rndkey0 3247 jnz .Locb_enc_loop1 3248 3249 aesenc $rndkey1,$inout0 3250 $movkey 16($key_),$rndkey1 # redundant in tail 3251 mov %r10,%rax # restore twisted rounds 3252 3253 aesenclast $inout5,$inout0 3254 ret 3255.cfi_endproc 3256.size __ocb_encrypt1,.-__ocb_encrypt1 3257 3258.globl aesni_ocb_decrypt 3259.type aesni_ocb_decrypt,\@function,6 3260.align 32 3261aesni_ocb_decrypt: 3262.cfi_startproc 3263 endbranch 3264 lea (%rsp),%rax 3265 push %rbx 3266.cfi_push %rbx 3267 push %rbp 3268.cfi_push %rbp 3269 push %r12 3270.cfi_push %r12 3271 push %r13 3272.cfi_push %r13 3273 push %r14 3274.cfi_push %r14 3275___ 3276$code.=<<___ if ($win64); 3277 lea -0xa0(%rsp),%rsp 3278 movaps %xmm6,0x00(%rsp) # offload everything 3279 movaps %xmm7,0x10(%rsp) 3280 movaps %xmm8,0x20(%rsp) 3281 movaps %xmm9,0x30(%rsp) 3282 movaps %xmm10,0x40(%rsp) 3283 movaps %xmm11,0x50(%rsp) 3284 movaps %xmm12,0x60(%rsp) 3285 movaps %xmm13,0x70(%rsp) 3286 movaps %xmm14,0x80(%rsp) 3287 movaps %xmm15,0x90(%rsp) 3288.Locb_dec_body: 3289___ 3290$code.=<<___; 3291 mov $seventh_arg(%rax),$L_p # 7th argument 3292 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 3293 3294 mov 240($key),$rnds_ 3295 mov $key,$key_ 3296 shl \$4,$rnds_ 3297 $movkey ($key),$rndkey0l # round[0] 3298 $movkey 16($key,$rnds_),$rndkey1 # round[last] 3299 3300 movdqu ($offset_p),@offset[5] # load last offset_i 3301 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 3302 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 3303 3304 mov \$16+32,$rounds 3305 lea 32($key_,$rnds_),$key 3306 $movkey 16($key_),$rndkey1 # round[1] 3307 sub %r10,%rax # twisted $rounds 3308 mov %rax,%r10 # backup twisted $rounds 3309 3310 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3311 movdqu ($checksum_p),$checksum # load checksum 3312 3313 test \$1,$block_num # is first block number odd? 3314 jnz .Locb_dec_odd 3315 3316 bsf $block_num,$i1 3317 add \$1,$block_num 3318 shl \$4,$i1 3319 movdqu ($L_p,$i1),$inout5 # borrow 3320 movdqu ($inp),$inout0 3321 lea 16($inp),$inp 3322 3323 call __ocb_decrypt1 3324 3325 movdqa $inout5,@offset[5] 3326 movups $inout0,($out) 3327 xorps $inout0,$checksum # accumulate checksum 3328 lea 16($out),$out 3329 sub \$1,$blocks 3330 jz .Locb_dec_done 3331 3332.Locb_dec_odd: 3333 lea 1($block_num),$i1 # even-numbered blocks 3334 lea 3($block_num),$i3 3335 lea 5($block_num),$i5 3336 lea 6($block_num),$block_num 3337 bsf $i1,$i1 # ntz(block) 3338 bsf $i3,$i3 3339 bsf $i5,$i5 3340 shl \$4,$i1 # ntz(block) -> table offset 3341 shl \$4,$i3 3342 shl \$4,$i5 3343 3344 sub \$6,$blocks 3345 jc .Locb_dec_short 3346 jmp .Locb_dec_grandloop 3347 3348.align 32 3349.Locb_dec_grandloop: 3350 movdqu `16*0`($inp),$inout0 # load input 3351 movdqu `16*1`($inp),$inout1 3352 movdqu `16*2`($inp),$inout2 3353 movdqu `16*3`($inp),$inout3 3354 movdqu `16*4`($inp),$inout4 3355 movdqu `16*5`($inp),$inout5 3356 lea `16*6`($inp),$inp 3357 3358 call __ocb_decrypt6 3359 3360 movups $inout0,`16*0`($out) # store output 3361 pxor $inout0,$checksum # accumulate checksum 3362 movups $inout1,`16*1`($out) 3363 pxor $inout1,$checksum 3364 movups $inout2,`16*2`($out) 3365 pxor $inout2,$checksum 3366 movups $inout3,`16*3`($out) 3367 pxor $inout3,$checksum 3368 movups $inout4,`16*4`($out) 3369 pxor $inout4,$checksum 3370 movups $inout5,`16*5`($out) 3371 pxor $inout5,$checksum 3372 lea `16*6`($out),$out 3373 sub \$6,$blocks 3374 jnc .Locb_dec_grandloop 3375 3376.Locb_dec_short: 3377 add \$6,$blocks 3378 jz .Locb_dec_done 3379 3380 movdqu `16*0`($inp),$inout0 3381 cmp \$2,$blocks 3382 jb .Locb_dec_one 3383 movdqu `16*1`($inp),$inout1 3384 je .Locb_dec_two 3385 3386 movdqu `16*2`($inp),$inout2 3387 cmp \$4,$blocks 3388 jb .Locb_dec_three 3389 movdqu `16*3`($inp),$inout3 3390 je .Locb_dec_four 3391 3392 movdqu `16*4`($inp),$inout4 3393 pxor $inout5,$inout5 3394 3395 call __ocb_decrypt6 3396 3397 movdqa @offset[4],@offset[5] 3398 movups $inout0,`16*0`($out) # store output 3399 pxor $inout0,$checksum # accumulate checksum 3400 movups $inout1,`16*1`($out) 3401 pxor $inout1,$checksum 3402 movups $inout2,`16*2`($out) 3403 pxor $inout2,$checksum 3404 movups $inout3,`16*3`($out) 3405 pxor $inout3,$checksum 3406 movups $inout4,`16*4`($out) 3407 pxor $inout4,$checksum 3408 3409 jmp .Locb_dec_done 3410 3411.align 16 3412.Locb_dec_one: 3413 movdqa @offset[0],$inout5 # borrow 3414 3415 call __ocb_decrypt1 3416 3417 movdqa $inout5,@offset[5] 3418 movups $inout0,`16*0`($out) # store output 3419 xorps $inout0,$checksum # accumulate checksum 3420 jmp .Locb_dec_done 3421 3422.align 16 3423.Locb_dec_two: 3424 pxor $inout2,$inout2 3425 pxor $inout3,$inout3 3426 3427 call __ocb_decrypt4 3428 3429 movdqa @offset[1],@offset[5] 3430 movups $inout0,`16*0`($out) # store output 3431 xorps $inout0,$checksum # accumulate checksum 3432 movups $inout1,`16*1`($out) 3433 xorps $inout1,$checksum 3434 3435 jmp .Locb_dec_done 3436 3437.align 16 3438.Locb_dec_three: 3439 pxor $inout3,$inout3 3440 3441 call __ocb_decrypt4 3442 3443 movdqa @offset[2],@offset[5] 3444 movups $inout0,`16*0`($out) # store output 3445 xorps $inout0,$checksum # accumulate checksum 3446 movups $inout1,`16*1`($out) 3447 xorps $inout1,$checksum 3448 movups $inout2,`16*2`($out) 3449 xorps $inout2,$checksum 3450 3451 jmp .Locb_dec_done 3452 3453.align 16 3454.Locb_dec_four: 3455 call __ocb_decrypt4 3456 3457 movdqa @offset[3],@offset[5] 3458 movups $inout0,`16*0`($out) # store output 3459 pxor $inout0,$checksum # accumulate checksum 3460 movups $inout1,`16*1`($out) 3461 pxor $inout1,$checksum 3462 movups $inout2,`16*2`($out) 3463 pxor $inout2,$checksum 3464 movups $inout3,`16*3`($out) 3465 pxor $inout3,$checksum 3466 3467.Locb_dec_done: 3468 pxor $rndkey0,@offset[5] # "remove" round[last] 3469 movdqu $checksum,($checksum_p) # store checksum 3470 movdqu @offset[5],($offset_p) # store last offset_i 3471 3472 xorps %xmm0,%xmm0 # clear register bank 3473 pxor %xmm1,%xmm1 3474 pxor %xmm2,%xmm2 3475 pxor %xmm3,%xmm3 3476 pxor %xmm4,%xmm4 3477 pxor %xmm5,%xmm5 3478___ 3479$code.=<<___ if (!$win64); 3480 pxor %xmm6,%xmm6 3481 pxor %xmm7,%xmm7 3482 pxor %xmm8,%xmm8 3483 pxor %xmm9,%xmm9 3484 pxor %xmm10,%xmm10 3485 pxor %xmm11,%xmm11 3486 pxor %xmm12,%xmm12 3487 pxor %xmm13,%xmm13 3488 pxor %xmm14,%xmm14 3489 pxor %xmm15,%xmm15 3490 lea 0x28(%rsp),%rax 3491.cfi_def_cfa %rax,8 3492___ 3493$code.=<<___ if ($win64); 3494 movaps 0x00(%rsp),%xmm6 3495 movaps %xmm0,0x00(%rsp) # clear stack 3496 movaps 0x10(%rsp),%xmm7 3497 movaps %xmm0,0x10(%rsp) 3498 movaps 0x20(%rsp),%xmm8 3499 movaps %xmm0,0x20(%rsp) 3500 movaps 0x30(%rsp),%xmm9 3501 movaps %xmm0,0x30(%rsp) 3502 movaps 0x40(%rsp),%xmm10 3503 movaps %xmm0,0x40(%rsp) 3504 movaps 0x50(%rsp),%xmm11 3505 movaps %xmm0,0x50(%rsp) 3506 movaps 0x60(%rsp),%xmm12 3507 movaps %xmm0,0x60(%rsp) 3508 movaps 0x70(%rsp),%xmm13 3509 movaps %xmm0,0x70(%rsp) 3510 movaps 0x80(%rsp),%xmm14 3511 movaps %xmm0,0x80(%rsp) 3512 movaps 0x90(%rsp),%xmm15 3513 movaps %xmm0,0x90(%rsp) 3514 lea 0xa0+0x28(%rsp),%rax 3515.Locb_dec_pop: 3516___ 3517$code.=<<___; 3518 mov -40(%rax),%r14 3519.cfi_restore %r14 3520 mov -32(%rax),%r13 3521.cfi_restore %r13 3522 mov -24(%rax),%r12 3523.cfi_restore %r12 3524 mov -16(%rax),%rbp 3525.cfi_restore %rbp 3526 mov -8(%rax),%rbx 3527.cfi_restore %rbx 3528 lea (%rax),%rsp 3529.cfi_def_cfa_register %rsp 3530.Locb_dec_epilogue: 3531 ret 3532.cfi_endproc 3533.size aesni_ocb_decrypt,.-aesni_ocb_decrypt 3534 3535.type __ocb_decrypt6,\@abi-omnipotent 3536.align 32 3537__ocb_decrypt6: 3538.cfi_startproc 3539 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3540 movdqu ($L_p,$i1),@offset[1] 3541 movdqa @offset[0],@offset[2] 3542 movdqu ($L_p,$i3),@offset[3] 3543 movdqa @offset[0],@offset[4] 3544 pxor @offset[5],@offset[0] 3545 movdqu ($L_p,$i5),@offset[5] 3546 pxor @offset[0],@offset[1] 3547 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3548 pxor @offset[1],@offset[2] 3549 pxor @offset[1],$inout1 3550 pxor @offset[2],@offset[3] 3551 pxor @offset[2],$inout2 3552 pxor @offset[3],@offset[4] 3553 pxor @offset[3],$inout3 3554 pxor @offset[4],@offset[5] 3555 pxor @offset[4],$inout4 3556 pxor @offset[5],$inout5 3557 $movkey 32($key_),$rndkey0 3558 3559 lea 1($block_num),$i1 # even-numbered blocks 3560 lea 3($block_num),$i3 3561 lea 5($block_num),$i5 3562 add \$6,$block_num 3563 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3564 bsf $i1,$i1 # ntz(block) 3565 bsf $i3,$i3 3566 bsf $i5,$i5 3567 3568 aesdec $rndkey1,$inout0 3569 aesdec $rndkey1,$inout1 3570 aesdec $rndkey1,$inout2 3571 aesdec $rndkey1,$inout3 3572 pxor $rndkey0l,@offset[1] 3573 pxor $rndkey0l,@offset[2] 3574 aesdec $rndkey1,$inout4 3575 pxor $rndkey0l,@offset[3] 3576 pxor $rndkey0l,@offset[4] 3577 aesdec $rndkey1,$inout5 3578 $movkey 48($key_),$rndkey1 3579 pxor $rndkey0l,@offset[5] 3580 3581 aesdec $rndkey0,$inout0 3582 aesdec $rndkey0,$inout1 3583 aesdec $rndkey0,$inout2 3584 aesdec $rndkey0,$inout3 3585 aesdec $rndkey0,$inout4 3586 aesdec $rndkey0,$inout5 3587 $movkey 64($key_),$rndkey0 3588 shl \$4,$i1 # ntz(block) -> table offset 3589 shl \$4,$i3 3590 jmp .Locb_dec_loop6 3591 3592.align 32 3593.Locb_dec_loop6: 3594 aesdec $rndkey1,$inout0 3595 aesdec $rndkey1,$inout1 3596 aesdec $rndkey1,$inout2 3597 aesdec $rndkey1,$inout3 3598 aesdec $rndkey1,$inout4 3599 aesdec $rndkey1,$inout5 3600 $movkey ($key,%rax),$rndkey1 3601 add \$32,%rax 3602 3603 aesdec $rndkey0,$inout0 3604 aesdec $rndkey0,$inout1 3605 aesdec $rndkey0,$inout2 3606 aesdec $rndkey0,$inout3 3607 aesdec $rndkey0,$inout4 3608 aesdec $rndkey0,$inout5 3609 $movkey -16($key,%rax),$rndkey0 3610 jnz .Locb_dec_loop6 3611 3612 aesdec $rndkey1,$inout0 3613 aesdec $rndkey1,$inout1 3614 aesdec $rndkey1,$inout2 3615 aesdec $rndkey1,$inout3 3616 aesdec $rndkey1,$inout4 3617 aesdec $rndkey1,$inout5 3618 $movkey 16($key_),$rndkey1 3619 shl \$4,$i5 3620 3621 aesdeclast @offset[0],$inout0 3622 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3623 mov %r10,%rax # restore twisted rounds 3624 aesdeclast @offset[1],$inout1 3625 aesdeclast @offset[2],$inout2 3626 aesdeclast @offset[3],$inout3 3627 aesdeclast @offset[4],$inout4 3628 aesdeclast @offset[5],$inout5 3629 ret 3630.cfi_endproc 3631.size __ocb_decrypt6,.-__ocb_decrypt6 3632 3633.type __ocb_decrypt4,\@abi-omnipotent 3634.align 32 3635__ocb_decrypt4: 3636.cfi_startproc 3637 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3638 movdqu ($L_p,$i1),@offset[1] 3639 movdqa @offset[0],@offset[2] 3640 movdqu ($L_p,$i3),@offset[3] 3641 pxor @offset[5],@offset[0] 3642 pxor @offset[0],@offset[1] 3643 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3644 pxor @offset[1],@offset[2] 3645 pxor @offset[1],$inout1 3646 pxor @offset[2],@offset[3] 3647 pxor @offset[2],$inout2 3648 pxor @offset[3],$inout3 3649 $movkey 32($key_),$rndkey0 3650 3651 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3652 pxor $rndkey0l,@offset[1] 3653 pxor $rndkey0l,@offset[2] 3654 pxor $rndkey0l,@offset[3] 3655 3656 aesdec $rndkey1,$inout0 3657 aesdec $rndkey1,$inout1 3658 aesdec $rndkey1,$inout2 3659 aesdec $rndkey1,$inout3 3660 $movkey 48($key_),$rndkey1 3661 3662 aesdec $rndkey0,$inout0 3663 aesdec $rndkey0,$inout1 3664 aesdec $rndkey0,$inout2 3665 aesdec $rndkey0,$inout3 3666 $movkey 64($key_),$rndkey0 3667 jmp .Locb_dec_loop4 3668 3669.align 32 3670.Locb_dec_loop4: 3671 aesdec $rndkey1,$inout0 3672 aesdec $rndkey1,$inout1 3673 aesdec $rndkey1,$inout2 3674 aesdec $rndkey1,$inout3 3675 $movkey ($key,%rax),$rndkey1 3676 add \$32,%rax 3677 3678 aesdec $rndkey0,$inout0 3679 aesdec $rndkey0,$inout1 3680 aesdec $rndkey0,$inout2 3681 aesdec $rndkey0,$inout3 3682 $movkey -16($key,%rax),$rndkey0 3683 jnz .Locb_dec_loop4 3684 3685 aesdec $rndkey1,$inout0 3686 aesdec $rndkey1,$inout1 3687 aesdec $rndkey1,$inout2 3688 aesdec $rndkey1,$inout3 3689 $movkey 16($key_),$rndkey1 3690 mov %r10,%rax # restore twisted rounds 3691 3692 aesdeclast @offset[0],$inout0 3693 aesdeclast @offset[1],$inout1 3694 aesdeclast @offset[2],$inout2 3695 aesdeclast @offset[3],$inout3 3696 ret 3697.cfi_endproc 3698.size __ocb_decrypt4,.-__ocb_decrypt4 3699 3700.type __ocb_decrypt1,\@abi-omnipotent 3701.align 32 3702__ocb_decrypt1: 3703.cfi_startproc 3704 pxor @offset[5],$inout5 # offset_i 3705 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3706 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3707 $movkey 32($key_),$rndkey0 3708 3709 aesdec $rndkey1,$inout0 3710 $movkey 48($key_),$rndkey1 3711 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3712 3713 aesdec $rndkey0,$inout0 3714 $movkey 64($key_),$rndkey0 3715 jmp .Locb_dec_loop1 3716 3717.align 32 3718.Locb_dec_loop1: 3719 aesdec $rndkey1,$inout0 3720 $movkey ($key,%rax),$rndkey1 3721 add \$32,%rax 3722 3723 aesdec $rndkey0,$inout0 3724 $movkey -16($key,%rax),$rndkey0 3725 jnz .Locb_dec_loop1 3726 3727 aesdec $rndkey1,$inout0 3728 $movkey 16($key_),$rndkey1 # redundant in tail 3729 mov %r10,%rax # restore twisted rounds 3730 3731 aesdeclast $inout5,$inout0 3732 ret 3733.cfi_endproc 3734.size __ocb_decrypt1,.-__ocb_decrypt1 3735___ 3736} }} 3737 3738######################################################################## 3739# void $PREFIX_cbc_encrypt (const void *inp, void *out, 3740# size_t length, const AES_KEY *key, 3741# unsigned char *ivp,const int enc); 3742{ 3743my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 3744my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 3745 3746$code.=<<___; 3747.globl ${PREFIX}_cbc_encrypt 3748.type ${PREFIX}_cbc_encrypt,\@function,6 3749.align 16 3750${PREFIX}_cbc_encrypt: 3751.cfi_startproc 3752 endbranch 3753 test $len,$len # check length 3754 jz .Lcbc_ret 3755 3756 mov 240($key),$rnds_ # key->rounds 3757 mov $key,$key_ # backup $key 3758 test %r9d,%r9d # 6th argument 3759 jz .Lcbc_decrypt 3760#--------------------------- CBC ENCRYPT ------------------------------# 3761 movups ($ivp),$inout0 # load iv as initial state 3762 mov $rnds_,$rounds 3763 cmp \$16,$len 3764 jb .Lcbc_enc_tail 3765 sub \$16,$len 3766 jmp .Lcbc_enc_loop 3767.align 16 3768.Lcbc_enc_loop: 3769 movups ($inp),$inout1 # load input 3770 lea 16($inp),$inp 3771 #xorps $inout1,$inout0 3772___ 3773 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3774$code.=<<___; 3775 mov $rnds_,$rounds # restore $rounds 3776 mov $key_,$key # restore $key 3777 movups $inout0,0($out) # store output 3778 lea 16($out),$out 3779 sub \$16,$len 3780 jnc .Lcbc_enc_loop 3781 add \$16,$len 3782 jnz .Lcbc_enc_tail 3783 pxor $rndkey0,$rndkey0 # clear register bank 3784 pxor $rndkey1,$rndkey1 3785 movups $inout0,($ivp) 3786 pxor $inout0,$inout0 3787 pxor $inout1,$inout1 3788 jmp .Lcbc_ret 3789 3790.Lcbc_enc_tail: 3791 mov $len,%rcx # zaps $key 3792 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3793 .long 0x9066A4F3 # rep movsb 3794 mov \$16,%ecx # zero tail 3795 sub $len,%rcx 3796 xor %eax,%eax 3797 .long 0x9066AAF3 # rep stosb 3798 lea -16(%rdi),%rdi # rewind $out by 1 block 3799 mov $rnds_,$rounds # restore $rounds 3800 mov %rdi,%rsi # $inp and $out are the same 3801 mov $key_,$key # restore $key 3802 xor $len,$len # len=16 3803 jmp .Lcbc_enc_loop # one more spin 3804#--------------------------- CBC DECRYPT ------------------------------# 3805.align 16 3806.Lcbc_decrypt: 3807 cmp \$16,$len 3808 jne .Lcbc_decrypt_bulk 3809 3810 # handle single block without allocating stack frame, 3811 # useful in ciphertext stealing mode 3812 movdqu ($inp),$inout0 # load input 3813 movdqu ($ivp),$inout1 # load iv 3814 movdqa $inout0,$inout2 # future iv 3815___ 3816 &aesni_generate1("dec",$key,$rnds_); 3817$code.=<<___; 3818 pxor $rndkey0,$rndkey0 # clear register bank 3819 pxor $rndkey1,$rndkey1 3820 movdqu $inout2,($ivp) # store iv 3821 xorps $inout1,$inout0 # ^=iv 3822 pxor $inout1,$inout1 3823 movups $inout0,($out) # store output 3824 pxor $inout0,$inout0 3825 jmp .Lcbc_ret 3826.align 16 3827.Lcbc_decrypt_bulk: 3828 lea (%rsp),%r11 # frame pointer 3829.cfi_def_cfa_register %r11 3830 push %rbp 3831.cfi_push %rbp 3832 sub \$$frame_size,%rsp 3833 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 3834___ 3835$code.=<<___ if ($win64); 3836 movaps %xmm6,0x10(%rsp) 3837 movaps %xmm7,0x20(%rsp) 3838 movaps %xmm8,0x30(%rsp) 3839 movaps %xmm9,0x40(%rsp) 3840 movaps %xmm10,0x50(%rsp) 3841 movaps %xmm11,0x60(%rsp) 3842 movaps %xmm12,0x70(%rsp) 3843 movaps %xmm13,0x80(%rsp) 3844 movaps %xmm14,0x90(%rsp) 3845 movaps %xmm15,0xa0(%rsp) 3846.Lcbc_decrypt_body: 3847___ 3848 3849my $inp_=$key_="%rbp"; # reassign $key_ 3850 3851$code.=<<___; 3852 mov $key,$key_ # [re-]backup $key [after reassignment] 3853 movups ($ivp),$iv 3854 mov $rnds_,$rounds 3855 cmp \$0x50,$len 3856 jbe .Lcbc_dec_tail 3857 3858 $movkey ($key),$rndkey0 3859 movdqu 0x00($inp),$inout0 # load input 3860 movdqu 0x10($inp),$inout1 3861 movdqa $inout0,$in0 3862 movdqu 0x20($inp),$inout2 3863 movdqa $inout1,$in1 3864 movdqu 0x30($inp),$inout3 3865 movdqa $inout2,$in2 3866 movdqu 0x40($inp),$inout4 3867 movdqa $inout3,$in3 3868 movdqu 0x50($inp),$inout5 3869 movdqa $inout4,$in4 3870 mov OPENSSL_ia32cap_P+4(%rip),%r9d 3871 cmp \$0x70,$len 3872 jbe .Lcbc_dec_six_or_seven 3873 3874 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3875 sub \$0x50,$len # $len is biased by -5*16 3876 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3877 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3878 sub \$0x20,$len # $len is biased by -7*16 3879 lea 0x70($key),$key # size optimization 3880 jmp .Lcbc_dec_loop8_enter 3881.align 16 3882.Lcbc_dec_loop8: 3883 movups $inout7,($out) 3884 lea 0x10($out),$out 3885.Lcbc_dec_loop8_enter: 3886 movdqu 0x60($inp),$inout6 3887 pxor $rndkey0,$inout0 3888 movdqu 0x70($inp),$inout7 3889 pxor $rndkey0,$inout1 3890 $movkey 0x10-0x70($key),$rndkey1 3891 pxor $rndkey0,$inout2 3892 mov \$-1,$inp_ 3893 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 3894 pxor $rndkey0,$inout3 3895 pxor $rndkey0,$inout4 3896 pxor $rndkey0,$inout5 3897 pxor $rndkey0,$inout6 3898 3899 aesdec $rndkey1,$inout0 3900 pxor $rndkey0,$inout7 3901 $movkey 0x20-0x70($key),$rndkey0 3902 aesdec $rndkey1,$inout1 3903 aesdec $rndkey1,$inout2 3904 aesdec $rndkey1,$inout3 3905 aesdec $rndkey1,$inout4 3906 aesdec $rndkey1,$inout5 3907 aesdec $rndkey1,$inout6 3908 adc \$0,$inp_ 3909 and \$128,$inp_ 3910 aesdec $rndkey1,$inout7 3911 add $inp,$inp_ 3912 $movkey 0x30-0x70($key),$rndkey1 3913___ 3914for($i=1;$i<12;$i++) { 3915my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3916$code.=<<___ if ($i==7); 3917 cmp \$11,$rounds 3918___ 3919$code.=<<___; 3920 aesdec $rndkeyx,$inout0 3921 aesdec $rndkeyx,$inout1 3922 aesdec $rndkeyx,$inout2 3923 aesdec $rndkeyx,$inout3 3924 aesdec $rndkeyx,$inout4 3925 aesdec $rndkeyx,$inout5 3926 aesdec $rndkeyx,$inout6 3927 aesdec $rndkeyx,$inout7 3928 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3929___ 3930$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3931 nop 3932___ 3933$code.=<<___ if ($i==7); 3934 jb .Lcbc_dec_done 3935___ 3936$code.=<<___ if ($i==9); 3937 je .Lcbc_dec_done 3938___ 3939$code.=<<___ if ($i==11); 3940 jmp .Lcbc_dec_done 3941___ 3942} 3943$code.=<<___; 3944.align 16 3945.Lcbc_dec_done: 3946 aesdec $rndkey1,$inout0 3947 aesdec $rndkey1,$inout1 3948 pxor $rndkey0,$iv 3949 pxor $rndkey0,$in0 3950 aesdec $rndkey1,$inout2 3951 aesdec $rndkey1,$inout3 3952 pxor $rndkey0,$in1 3953 pxor $rndkey0,$in2 3954 aesdec $rndkey1,$inout4 3955 aesdec $rndkey1,$inout5 3956 pxor $rndkey0,$in3 3957 pxor $rndkey0,$in4 3958 aesdec $rndkey1,$inout6 3959 aesdec $rndkey1,$inout7 3960 movdqu 0x50($inp),$rndkey1 3961 3962 aesdeclast $iv,$inout0 3963 movdqu 0x60($inp),$iv # borrow $iv 3964 pxor $rndkey0,$rndkey1 3965 aesdeclast $in0,$inout1 3966 pxor $rndkey0,$iv 3967 movdqu 0x70($inp),$rndkey0 # next IV 3968 aesdeclast $in1,$inout2 3969 lea 0x80($inp),$inp 3970 movdqu 0x00($inp_),$in0 3971 aesdeclast $in2,$inout3 3972 aesdeclast $in3,$inout4 3973 movdqu 0x10($inp_),$in1 3974 movdqu 0x20($inp_),$in2 3975 aesdeclast $in4,$inout5 3976 aesdeclast $rndkey1,$inout6 3977 movdqu 0x30($inp_),$in3 3978 movdqu 0x40($inp_),$in4 3979 aesdeclast $iv,$inout7 3980 movdqa $rndkey0,$iv # return $iv 3981 movdqu 0x50($inp_),$rndkey1 3982 $movkey -0x70($key),$rndkey0 3983 3984 movups $inout0,($out) # store output 3985 movdqa $in0,$inout0 3986 movups $inout1,0x10($out) 3987 movdqa $in1,$inout1 3988 movups $inout2,0x20($out) 3989 movdqa $in2,$inout2 3990 movups $inout3,0x30($out) 3991 movdqa $in3,$inout3 3992 movups $inout4,0x40($out) 3993 movdqa $in4,$inout4 3994 movups $inout5,0x50($out) 3995 movdqa $rndkey1,$inout5 3996 movups $inout6,0x60($out) 3997 lea 0x70($out),$out 3998 3999 sub \$0x80,$len 4000 ja .Lcbc_dec_loop8 4001 4002 movaps $inout7,$inout0 4003 lea -0x70($key),$key 4004 add \$0x70,$len 4005 jle .Lcbc_dec_clear_tail_collected 4006 movups $inout7,($out) 4007 lea 0x10($out),$out 4008 cmp \$0x50,$len 4009 jbe .Lcbc_dec_tail 4010 4011 movaps $in0,$inout0 4012.Lcbc_dec_six_or_seven: 4013 cmp \$0x60,$len 4014 ja .Lcbc_dec_seven 4015 4016 movaps $inout5,$inout6 4017 call _aesni_decrypt6 4018 pxor $iv,$inout0 # ^= IV 4019 movaps $inout6,$iv 4020 pxor $in0,$inout1 4021 movdqu $inout0,($out) 4022 pxor $in1,$inout2 4023 movdqu $inout1,0x10($out) 4024 pxor $inout1,$inout1 # clear register bank 4025 pxor $in2,$inout3 4026 movdqu $inout2,0x20($out) 4027 pxor $inout2,$inout2 4028 pxor $in3,$inout4 4029 movdqu $inout3,0x30($out) 4030 pxor $inout3,$inout3 4031 pxor $in4,$inout5 4032 movdqu $inout4,0x40($out) 4033 pxor $inout4,$inout4 4034 lea 0x50($out),$out 4035 movdqa $inout5,$inout0 4036 pxor $inout5,$inout5 4037 jmp .Lcbc_dec_tail_collected 4038 4039.align 16 4040.Lcbc_dec_seven: 4041 movups 0x60($inp),$inout6 4042 xorps $inout7,$inout7 4043 call _aesni_decrypt8 4044 movups 0x50($inp),$inout7 4045 pxor $iv,$inout0 # ^= IV 4046 movups 0x60($inp),$iv 4047 pxor $in0,$inout1 4048 movdqu $inout0,($out) 4049 pxor $in1,$inout2 4050 movdqu $inout1,0x10($out) 4051 pxor $inout1,$inout1 # clear register bank 4052 pxor $in2,$inout3 4053 movdqu $inout2,0x20($out) 4054 pxor $inout2,$inout2 4055 pxor $in3,$inout4 4056 movdqu $inout3,0x30($out) 4057 pxor $inout3,$inout3 4058 pxor $in4,$inout5 4059 movdqu $inout4,0x40($out) 4060 pxor $inout4,$inout4 4061 pxor $inout7,$inout6 4062 movdqu $inout5,0x50($out) 4063 pxor $inout5,$inout5 4064 lea 0x60($out),$out 4065 movdqa $inout6,$inout0 4066 pxor $inout6,$inout6 4067 pxor $inout7,$inout7 4068 jmp .Lcbc_dec_tail_collected 4069 4070.align 16 4071.Lcbc_dec_loop6: 4072 movups $inout5,($out) 4073 lea 0x10($out),$out 4074 movdqu 0x00($inp),$inout0 # load input 4075 movdqu 0x10($inp),$inout1 4076 movdqa $inout0,$in0 4077 movdqu 0x20($inp),$inout2 4078 movdqa $inout1,$in1 4079 movdqu 0x30($inp),$inout3 4080 movdqa $inout2,$in2 4081 movdqu 0x40($inp),$inout4 4082 movdqa $inout3,$in3 4083 movdqu 0x50($inp),$inout5 4084 movdqa $inout4,$in4 4085.Lcbc_dec_loop6_enter: 4086 lea 0x60($inp),$inp 4087 movdqa $inout5,$inout6 4088 4089 call _aesni_decrypt6 4090 4091 pxor $iv,$inout0 # ^= IV 4092 movdqa $inout6,$iv 4093 pxor $in0,$inout1 4094 movdqu $inout0,($out) 4095 pxor $in1,$inout2 4096 movdqu $inout1,0x10($out) 4097 pxor $in2,$inout3 4098 movdqu $inout2,0x20($out) 4099 pxor $in3,$inout4 4100 mov $key_,$key 4101 movdqu $inout3,0x30($out) 4102 pxor $in4,$inout5 4103 mov $rnds_,$rounds 4104 movdqu $inout4,0x40($out) 4105 lea 0x50($out),$out 4106 sub \$0x60,$len 4107 ja .Lcbc_dec_loop6 4108 4109 movdqa $inout5,$inout0 4110 add \$0x50,$len 4111 jle .Lcbc_dec_clear_tail_collected 4112 movups $inout5,($out) 4113 lea 0x10($out),$out 4114 4115.Lcbc_dec_tail: 4116 movups ($inp),$inout0 4117 sub \$0x10,$len 4118 jbe .Lcbc_dec_one # $len is 1*16 or less 4119 4120 movups 0x10($inp),$inout1 4121 movaps $inout0,$in0 4122 sub \$0x10,$len 4123 jbe .Lcbc_dec_two # $len is 2*16 or less 4124 4125 movups 0x20($inp),$inout2 4126 movaps $inout1,$in1 4127 sub \$0x10,$len 4128 jbe .Lcbc_dec_three # $len is 3*16 or less 4129 4130 movups 0x30($inp),$inout3 4131 movaps $inout2,$in2 4132 sub \$0x10,$len 4133 jbe .Lcbc_dec_four # $len is 4*16 or less 4134 4135 movups 0x40($inp),$inout4 # $len is 5*16 or less 4136 movaps $inout3,$in3 4137 movaps $inout4,$in4 4138 xorps $inout5,$inout5 4139 call _aesni_decrypt6 4140 pxor $iv,$inout0 4141 movaps $in4,$iv 4142 pxor $in0,$inout1 4143 movdqu $inout0,($out) 4144 pxor $in1,$inout2 4145 movdqu $inout1,0x10($out) 4146 pxor $inout1,$inout1 # clear register bank 4147 pxor $in2,$inout3 4148 movdqu $inout2,0x20($out) 4149 pxor $inout2,$inout2 4150 pxor $in3,$inout4 4151 movdqu $inout3,0x30($out) 4152 pxor $inout3,$inout3 4153 lea 0x40($out),$out 4154 movdqa $inout4,$inout0 4155 pxor $inout4,$inout4 4156 pxor $inout5,$inout5 4157 sub \$0x10,$len 4158 jmp .Lcbc_dec_tail_collected 4159 4160.align 16 4161.Lcbc_dec_one: 4162 movaps $inout0,$in0 4163___ 4164 &aesni_generate1("dec",$key,$rounds); 4165$code.=<<___; 4166 xorps $iv,$inout0 4167 movaps $in0,$iv 4168 jmp .Lcbc_dec_tail_collected 4169.align 16 4170.Lcbc_dec_two: 4171 movaps $inout1,$in1 4172 call _aesni_decrypt2 4173 pxor $iv,$inout0 4174 movaps $in1,$iv 4175 pxor $in0,$inout1 4176 movdqu $inout0,($out) 4177 movdqa $inout1,$inout0 4178 pxor $inout1,$inout1 # clear register bank 4179 lea 0x10($out),$out 4180 jmp .Lcbc_dec_tail_collected 4181.align 16 4182.Lcbc_dec_three: 4183 movaps $inout2,$in2 4184 call _aesni_decrypt3 4185 pxor $iv,$inout0 4186 movaps $in2,$iv 4187 pxor $in0,$inout1 4188 movdqu $inout0,($out) 4189 pxor $in1,$inout2 4190 movdqu $inout1,0x10($out) 4191 pxor $inout1,$inout1 # clear register bank 4192 movdqa $inout2,$inout0 4193 pxor $inout2,$inout2 4194 lea 0x20($out),$out 4195 jmp .Lcbc_dec_tail_collected 4196.align 16 4197.Lcbc_dec_four: 4198 movaps $inout3,$in3 4199 call _aesni_decrypt4 4200 pxor $iv,$inout0 4201 movaps $in3,$iv 4202 pxor $in0,$inout1 4203 movdqu $inout0,($out) 4204 pxor $in1,$inout2 4205 movdqu $inout1,0x10($out) 4206 pxor $inout1,$inout1 # clear register bank 4207 pxor $in2,$inout3 4208 movdqu $inout2,0x20($out) 4209 pxor $inout2,$inout2 4210 movdqa $inout3,$inout0 4211 pxor $inout3,$inout3 4212 lea 0x30($out),$out 4213 jmp .Lcbc_dec_tail_collected 4214 4215.align 16 4216.Lcbc_dec_clear_tail_collected: 4217 pxor $inout1,$inout1 # clear register bank 4218 pxor $inout2,$inout2 4219 pxor $inout3,$inout3 4220___ 4221$code.=<<___ if (!$win64); 4222 pxor $inout4,$inout4 # %xmm6..9 4223 pxor $inout5,$inout5 4224 pxor $inout6,$inout6 4225 pxor $inout7,$inout7 4226___ 4227$code.=<<___; 4228.Lcbc_dec_tail_collected: 4229 movups $iv,($ivp) 4230 and \$15,$len 4231 jnz .Lcbc_dec_tail_partial 4232 movups $inout0,($out) 4233 pxor $inout0,$inout0 4234 jmp .Lcbc_dec_ret 4235.align 16 4236.Lcbc_dec_tail_partial: 4237 movaps $inout0,(%rsp) 4238 pxor $inout0,$inout0 4239 mov \$16,%rcx 4240 mov $out,%rdi 4241 sub $len,%rcx 4242 lea (%rsp),%rsi 4243 .long 0x9066A4F3 # rep movsb 4244 movdqa $inout0,(%rsp) 4245 4246.Lcbc_dec_ret: 4247 xorps $rndkey0,$rndkey0 # %xmm0 4248 pxor $rndkey1,$rndkey1 4249___ 4250$code.=<<___ if ($win64); 4251 movaps 0x10(%rsp),%xmm6 4252 movaps %xmm0,0x10(%rsp) # clear stack 4253 movaps 0x20(%rsp),%xmm7 4254 movaps %xmm0,0x20(%rsp) 4255 movaps 0x30(%rsp),%xmm8 4256 movaps %xmm0,0x30(%rsp) 4257 movaps 0x40(%rsp),%xmm9 4258 movaps %xmm0,0x40(%rsp) 4259 movaps 0x50(%rsp),%xmm10 4260 movaps %xmm0,0x50(%rsp) 4261 movaps 0x60(%rsp),%xmm11 4262 movaps %xmm0,0x60(%rsp) 4263 movaps 0x70(%rsp),%xmm12 4264 movaps %xmm0,0x70(%rsp) 4265 movaps 0x80(%rsp),%xmm13 4266 movaps %xmm0,0x80(%rsp) 4267 movaps 0x90(%rsp),%xmm14 4268 movaps %xmm0,0x90(%rsp) 4269 movaps 0xa0(%rsp),%xmm15 4270 movaps %xmm0,0xa0(%rsp) 4271___ 4272$code.=<<___; 4273 mov -8(%r11),%rbp 4274.cfi_restore %rbp 4275 lea (%r11),%rsp 4276.cfi_def_cfa_register %rsp 4277.Lcbc_ret: 4278 ret 4279.cfi_endproc 4280.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 4281___ 4282} 4283# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 4284# int bits, AES_KEY *key) 4285# 4286# input: $inp user-supplied key 4287# $bits $inp length in bits 4288# $key pointer to key schedule 4289# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4290# *$key key schedule 4291# 4292{ my ($inp,$bits,$key) = @_4args; 4293 $bits =~ s/%r/%e/; 4294 4295$code.=<<___; 4296.globl ${PREFIX}_set_decrypt_key 4297.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 4298.align 16 4299${PREFIX}_set_decrypt_key: 4300.cfi_startproc 4301 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4302.cfi_adjust_cfa_offset 8 4303 call __aesni_set_encrypt_key 4304 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 4305 test %eax,%eax 4306 jnz .Ldec_key_ret 4307 lea 16($key,$bits),$inp # points at the end of key schedule 4308 4309 $movkey ($key),%xmm0 # just swap 4310 $movkey ($inp),%xmm1 4311 $movkey %xmm0,($inp) 4312 $movkey %xmm1,($key) 4313 lea 16($key),$key 4314 lea -16($inp),$inp 4315 4316.Ldec_key_inverse: 4317 $movkey ($key),%xmm0 # swap and inverse 4318 $movkey ($inp),%xmm1 4319 aesimc %xmm0,%xmm0 4320 aesimc %xmm1,%xmm1 4321 lea 16($key),$key 4322 lea -16($inp),$inp 4323 $movkey %xmm0,16($inp) 4324 $movkey %xmm1,-16($key) 4325 cmp $key,$inp 4326 ja .Ldec_key_inverse 4327 4328 $movkey ($key),%xmm0 # inverse middle 4329 aesimc %xmm0,%xmm0 4330 pxor %xmm1,%xmm1 4331 $movkey %xmm0,($inp) 4332 pxor %xmm0,%xmm0 4333.Ldec_key_ret: 4334 add \$8,%rsp 4335.cfi_adjust_cfa_offset -8 4336 ret 4337.cfi_endproc 4338.LSEH_end_set_decrypt_key: 4339.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 4340___ 4341 4342# This is based on submission from Intel by 4343# Huang Ying 4344# Vinodh Gopal 4345# Kahraman Akdemir 4346# 4347# Aggressively optimized in respect to aeskeygenassist's critical path 4348# and is contained in %xmm0-5 to meet Win64 ABI requirement. 4349# 4350# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 4351# int bits, AES_KEY * const key); 4352# 4353# input: $inp user-supplied key 4354# $bits $inp length in bits 4355# $key pointer to key schedule 4356# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4357# $bits rounds-1 (used in aesni_set_decrypt_key) 4358# *$key key schedule 4359# $key pointer to key schedule (used in 4360# aesni_set_decrypt_key) 4361# 4362# Subroutine is frame-less, which means that only volatile registers 4363# are used. Note that it's declared "abi-omnipotent", which means that 4364# amount of volatile registers is smaller on Windows. 4365# 4366$code.=<<___; 4367.globl ${PREFIX}_set_encrypt_key 4368.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 4369.align 16 4370${PREFIX}_set_encrypt_key: 4371__aesni_set_encrypt_key: 4372.cfi_startproc 4373 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4374.cfi_adjust_cfa_offset 8 4375 mov \$-1,%rax 4376 test $inp,$inp 4377 jz .Lenc_key_ret 4378 test $key,$key 4379 jz .Lenc_key_ret 4380 4381 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits 4382 movups ($inp),%xmm0 # pull first 128 bits of *userKey 4383 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 4384 and OPENSSL_ia32cap_P+4(%rip),%r10d 4385 lea 16($key),%rax # %rax is used as modifiable copy of $key 4386 cmp \$256,$bits 4387 je .L14rounds 4388 cmp \$192,$bits 4389 je .L12rounds 4390 cmp \$128,$bits 4391 jne .Lbad_keybits 4392 4393.L10rounds: 4394 mov \$9,$bits # 10 rounds for 128-bit key 4395 cmp \$`1<<28`,%r10d # AVX, bit no XOP 4396 je .L10rounds_alt 4397 4398 $movkey %xmm0,($key) # round 0 4399 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 4400 call .Lkey_expansion_128_cold 4401 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 4402 call .Lkey_expansion_128 4403 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 4404 call .Lkey_expansion_128 4405 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 4406 call .Lkey_expansion_128 4407 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 4408 call .Lkey_expansion_128 4409 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 4410 call .Lkey_expansion_128 4411 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4412 call .Lkey_expansion_128 4413 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4414 call .Lkey_expansion_128 4415 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4416 call .Lkey_expansion_128 4417 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4418 call .Lkey_expansion_128 4419 $movkey %xmm0,(%rax) 4420 mov $bits,80(%rax) # 240(%rdx) 4421 xor %eax,%eax 4422 jmp .Lenc_key_ret 4423 4424.align 16 4425.L10rounds_alt: 4426 movdqa .Lkey_rotate(%rip),%xmm5 4427 mov \$8,%r10d 4428 movdqa .Lkey_rcon1(%rip),%xmm4 4429 movdqa %xmm0,%xmm2 4430 movdqu %xmm0,($key) 4431 jmp .Loop_key128 4432 4433.align 16 4434.Loop_key128: 4435 pshufb %xmm5,%xmm0 4436 aesenclast %xmm4,%xmm0 4437 pslld \$1,%xmm4 4438 lea 16(%rax),%rax 4439 4440 movdqa %xmm2,%xmm3 4441 pslldq \$4,%xmm2 4442 pxor %xmm2,%xmm3 4443 pslldq \$4,%xmm2 4444 pxor %xmm2,%xmm3 4445 pslldq \$4,%xmm2 4446 pxor %xmm3,%xmm2 4447 4448 pxor %xmm2,%xmm0 4449 movdqu %xmm0,-16(%rax) 4450 movdqa %xmm0,%xmm2 4451 4452 dec %r10d 4453 jnz .Loop_key128 4454 4455 movdqa .Lkey_rcon1b(%rip),%xmm4 4456 4457 pshufb %xmm5,%xmm0 4458 aesenclast %xmm4,%xmm0 4459 pslld \$1,%xmm4 4460 4461 movdqa %xmm2,%xmm3 4462 pslldq \$4,%xmm2 4463 pxor %xmm2,%xmm3 4464 pslldq \$4,%xmm2 4465 pxor %xmm2,%xmm3 4466 pslldq \$4,%xmm2 4467 pxor %xmm3,%xmm2 4468 4469 pxor %xmm2,%xmm0 4470 movdqu %xmm0,(%rax) 4471 4472 movdqa %xmm0,%xmm2 4473 pshufb %xmm5,%xmm0 4474 aesenclast %xmm4,%xmm0 4475 4476 movdqa %xmm2,%xmm3 4477 pslldq \$4,%xmm2 4478 pxor %xmm2,%xmm3 4479 pslldq \$4,%xmm2 4480 pxor %xmm2,%xmm3 4481 pslldq \$4,%xmm2 4482 pxor %xmm3,%xmm2 4483 4484 pxor %xmm2,%xmm0 4485 movdqu %xmm0,16(%rax) 4486 4487 mov $bits,96(%rax) # 240($key) 4488 xor %eax,%eax 4489 jmp .Lenc_key_ret 4490 4491.align 16 4492.L12rounds: 4493 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 4494 mov \$11,$bits # 12 rounds for 192 4495 cmp \$`1<<28`,%r10d # AVX, but no XOP 4496 je .L12rounds_alt 4497 4498 $movkey %xmm0,($key) # round 0 4499 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4500 call .Lkey_expansion_192a_cold 4501 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4502 call .Lkey_expansion_192b 4503 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4504 call .Lkey_expansion_192a 4505 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4506 call .Lkey_expansion_192b 4507 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4508 call .Lkey_expansion_192a 4509 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4510 call .Lkey_expansion_192b 4511 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4512 call .Lkey_expansion_192a 4513 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4514 call .Lkey_expansion_192b 4515 $movkey %xmm0,(%rax) 4516 mov $bits,48(%rax) # 240(%rdx) 4517 xor %rax, %rax 4518 jmp .Lenc_key_ret 4519 4520.align 16 4521.L12rounds_alt: 4522 movdqa .Lkey_rotate192(%rip),%xmm5 4523 movdqa .Lkey_rcon1(%rip),%xmm4 4524 mov \$8,%r10d 4525 movdqu %xmm0,($key) 4526 jmp .Loop_key192 4527 4528.align 16 4529.Loop_key192: 4530 movq %xmm2,0(%rax) 4531 movdqa %xmm2,%xmm1 4532 pshufb %xmm5,%xmm2 4533 aesenclast %xmm4,%xmm2 4534 pslld \$1, %xmm4 4535 lea 24(%rax),%rax 4536 4537 movdqa %xmm0,%xmm3 4538 pslldq \$4,%xmm0 4539 pxor %xmm0,%xmm3 4540 pslldq \$4,%xmm0 4541 pxor %xmm0,%xmm3 4542 pslldq \$4,%xmm0 4543 pxor %xmm3,%xmm0 4544 4545 pshufd \$0xff,%xmm0,%xmm3 4546 pxor %xmm1,%xmm3 4547 pslldq \$4,%xmm1 4548 pxor %xmm1,%xmm3 4549 4550 pxor %xmm2,%xmm0 4551 pxor %xmm3,%xmm2 4552 movdqu %xmm0,-16(%rax) 4553 4554 dec %r10d 4555 jnz .Loop_key192 4556 4557 mov $bits,32(%rax) # 240($key) 4558 xor %eax,%eax 4559 jmp .Lenc_key_ret 4560 4561.align 16 4562.L14rounds: 4563 movups 16($inp),%xmm2 # remaining half of *userKey 4564 mov \$13,$bits # 14 rounds for 256 4565 lea 16(%rax),%rax 4566 cmp \$`1<<28`,%r10d # AVX, but no XOP 4567 je .L14rounds_alt 4568 4569 $movkey %xmm0,($key) # round 0 4570 $movkey %xmm2,16($key) # round 1 4571 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 4572 call .Lkey_expansion_256a_cold 4573 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4574 call .Lkey_expansion_256b 4575 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4576 call .Lkey_expansion_256a 4577 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4578 call .Lkey_expansion_256b 4579 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4580 call .Lkey_expansion_256a 4581 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4582 call .Lkey_expansion_256b 4583 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4584 call .Lkey_expansion_256a 4585 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4586 call .Lkey_expansion_256b 4587 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4588 call .Lkey_expansion_256a 4589 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4590 call .Lkey_expansion_256b 4591 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4592 call .Lkey_expansion_256a 4593 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4594 call .Lkey_expansion_256b 4595 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4596 call .Lkey_expansion_256a 4597 $movkey %xmm0,(%rax) 4598 mov $bits,16(%rax) # 240(%rdx) 4599 xor %rax,%rax 4600 jmp .Lenc_key_ret 4601 4602.align 16 4603.L14rounds_alt: 4604 movdqa .Lkey_rotate(%rip),%xmm5 4605 movdqa .Lkey_rcon1(%rip),%xmm4 4606 mov \$7,%r10d 4607 movdqu %xmm0,0($key) 4608 movdqa %xmm2,%xmm1 4609 movdqu %xmm2,16($key) 4610 jmp .Loop_key256 4611 4612.align 16 4613.Loop_key256: 4614 pshufb %xmm5,%xmm2 4615 aesenclast %xmm4,%xmm2 4616 4617 movdqa %xmm0,%xmm3 4618 pslldq \$4,%xmm0 4619 pxor %xmm0,%xmm3 4620 pslldq \$4,%xmm0 4621 pxor %xmm0,%xmm3 4622 pslldq \$4,%xmm0 4623 pxor %xmm3,%xmm0 4624 pslld \$1,%xmm4 4625 4626 pxor %xmm2,%xmm0 4627 movdqu %xmm0,(%rax) 4628 4629 dec %r10d 4630 jz .Ldone_key256 4631 4632 pshufd \$0xff,%xmm0,%xmm2 4633 pxor %xmm3,%xmm3 4634 aesenclast %xmm3,%xmm2 4635 4636 movdqa %xmm1,%xmm3 4637 pslldq \$4,%xmm1 4638 pxor %xmm1,%xmm3 4639 pslldq \$4,%xmm1 4640 pxor %xmm1,%xmm3 4641 pslldq \$4,%xmm1 4642 pxor %xmm3,%xmm1 4643 4644 pxor %xmm1,%xmm2 4645 movdqu %xmm2,16(%rax) 4646 lea 32(%rax),%rax 4647 movdqa %xmm2,%xmm1 4648 4649 jmp .Loop_key256 4650 4651.Ldone_key256: 4652 mov $bits,16(%rax) # 240($key) 4653 xor %eax,%eax 4654 jmp .Lenc_key_ret 4655 4656.align 16 4657.Lbad_keybits: 4658 mov \$-2,%rax 4659.Lenc_key_ret: 4660 pxor %xmm0,%xmm0 4661 pxor %xmm1,%xmm1 4662 pxor %xmm2,%xmm2 4663 pxor %xmm3,%xmm3 4664 pxor %xmm4,%xmm4 4665 pxor %xmm5,%xmm5 4666 add \$8,%rsp 4667.cfi_adjust_cfa_offset -8 4668 ret 4669.LSEH_end_set_encrypt_key: 4670 4671.align 16 4672.Lkey_expansion_128: 4673 $movkey %xmm0,(%rax) 4674 lea 16(%rax),%rax 4675.Lkey_expansion_128_cold: 4676 shufps \$0b00010000,%xmm0,%xmm4 4677 xorps %xmm4, %xmm0 4678 shufps \$0b10001100,%xmm0,%xmm4 4679 xorps %xmm4, %xmm0 4680 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4681 xorps %xmm1,%xmm0 4682 ret 4683 4684.align 16 4685.Lkey_expansion_192a: 4686 $movkey %xmm0,(%rax) 4687 lea 16(%rax),%rax 4688.Lkey_expansion_192a_cold: 4689 movaps %xmm2, %xmm5 4690.Lkey_expansion_192b_warm: 4691 shufps \$0b00010000,%xmm0,%xmm4 4692 movdqa %xmm2,%xmm3 4693 xorps %xmm4,%xmm0 4694 shufps \$0b10001100,%xmm0,%xmm4 4695 pslldq \$4,%xmm3 4696 xorps %xmm4,%xmm0 4697 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4698 pxor %xmm3,%xmm2 4699 pxor %xmm1,%xmm0 4700 pshufd \$0b11111111,%xmm0,%xmm3 4701 pxor %xmm3,%xmm2 4702 ret 4703 4704.align 16 4705.Lkey_expansion_192b: 4706 movaps %xmm0,%xmm3 4707 shufps \$0b01000100,%xmm0,%xmm5 4708 $movkey %xmm5,(%rax) 4709 shufps \$0b01001110,%xmm2,%xmm3 4710 $movkey %xmm3,16(%rax) 4711 lea 32(%rax),%rax 4712 jmp .Lkey_expansion_192b_warm 4713 4714.align 16 4715.Lkey_expansion_256a: 4716 $movkey %xmm2,(%rax) 4717 lea 16(%rax),%rax 4718.Lkey_expansion_256a_cold: 4719 shufps \$0b00010000,%xmm0,%xmm4 4720 xorps %xmm4,%xmm0 4721 shufps \$0b10001100,%xmm0,%xmm4 4722 xorps %xmm4,%xmm0 4723 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4724 xorps %xmm1,%xmm0 4725 ret 4726 4727.align 16 4728.Lkey_expansion_256b: 4729 $movkey %xmm0,(%rax) 4730 lea 16(%rax),%rax 4731 4732 shufps \$0b00010000,%xmm2,%xmm4 4733 xorps %xmm4,%xmm2 4734 shufps \$0b10001100,%xmm2,%xmm4 4735 xorps %xmm4,%xmm2 4736 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4737 xorps %xmm1,%xmm2 4738 ret 4739.cfi_endproc 4740.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4741.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4742___ 4743} 4744 4745$code.=<<___; 4746.section .rodata align=64 4747.align 64 4748.Lbswap_mask: 4749 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4750.Lincrement32: 4751 .long 6,6,6,0 4752.Lincrement64: 4753 .long 1,0,0,0 4754.Lxts_magic: 4755 .long 0x87,0,1,0 4756.Lincrement1: 4757 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4758.Lkey_rotate: 4759 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4760.Lkey_rotate192: 4761 .long 0x04070605,0x04070605,0x04070605,0x04070605 4762.Lkey_rcon1: 4763 .long 1,1,1,1 4764.Lkey_rcon1b: 4765 .long 0x1b,0x1b,0x1b,0x1b 4766 4767.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4768.align 64 4769.previous 4770___ 4771 4772# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4773# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4774if ($win64) { 4775$rec="%rcx"; 4776$frame="%rdx"; 4777$context="%r8"; 4778$disp="%r9"; 4779 4780$code.=<<___; 4781.extern __imp_RtlVirtualUnwind 4782___ 4783$code.=<<___ if ($PREFIX eq "aesni"); 4784.type ecb_ccm64_se_handler,\@abi-omnipotent 4785.align 16 4786ecb_ccm64_se_handler: 4787 push %rsi 4788 push %rdi 4789 push %rbx 4790 push %rbp 4791 push %r12 4792 push %r13 4793 push %r14 4794 push %r15 4795 pushfq 4796 sub \$64,%rsp 4797 4798 mov 120($context),%rax # pull context->Rax 4799 mov 248($context),%rbx # pull context->Rip 4800 4801 mov 8($disp),%rsi # disp->ImageBase 4802 mov 56($disp),%r11 # disp->HandlerData 4803 4804 mov 0(%r11),%r10d # HandlerData[0] 4805 lea (%rsi,%r10),%r10 # prologue label 4806 cmp %r10,%rbx # context->Rip<prologue label 4807 jb .Lcommon_seh_tail 4808 4809 mov 152($context),%rax # pull context->Rsp 4810 4811 mov 4(%r11),%r10d # HandlerData[1] 4812 lea (%rsi,%r10),%r10 # epilogue label 4813 cmp %r10,%rbx # context->Rip>=epilogue label 4814 jae .Lcommon_seh_tail 4815 4816 lea 0(%rax),%rsi # %xmm save area 4817 lea 512($context),%rdi # &context.Xmm6 4818 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4819 .long 0xa548f3fc # cld; rep movsq 4820 lea 0x58(%rax),%rax # adjust stack pointer 4821 4822 jmp .Lcommon_seh_tail 4823.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler 4824 4825.type ctr_xts_se_handler,\@abi-omnipotent 4826.align 16 4827ctr_xts_se_handler: 4828 push %rsi 4829 push %rdi 4830 push %rbx 4831 push %rbp 4832 push %r12 4833 push %r13 4834 push %r14 4835 push %r15 4836 pushfq 4837 sub \$64,%rsp 4838 4839 mov 120($context),%rax # pull context->Rax 4840 mov 248($context),%rbx # pull context->Rip 4841 4842 mov 8($disp),%rsi # disp->ImageBase 4843 mov 56($disp),%r11 # disp->HandlerData 4844 4845 mov 0(%r11),%r10d # HandlerData[0] 4846 lea (%rsi,%r10),%r10 # prologue label 4847 cmp %r10,%rbx # context->Rip<prologue label 4848 jb .Lcommon_seh_tail 4849 4850 mov 152($context),%rax # pull context->Rsp 4851 4852 mov 4(%r11),%r10d # HandlerData[1] 4853 lea (%rsi,%r10),%r10 # epilogue label 4854 cmp %r10,%rbx # context->Rip>=epilogue label 4855 jae .Lcommon_seh_tail 4856 4857 mov 208($context),%rax # pull context->R11 4858 4859 lea -0xa8(%rax),%rsi # %xmm save area 4860 lea 512($context),%rdi # & context.Xmm6 4861 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4862 .long 0xa548f3fc # cld; rep movsq 4863 4864 mov -8(%rax),%rbp # restore saved %rbp 4865 mov %rbp,160($context) # restore context->Rbp 4866 jmp .Lcommon_seh_tail 4867.size ctr_xts_se_handler,.-ctr_xts_se_handler 4868 4869.type ocb_se_handler,\@abi-omnipotent 4870.align 16 4871ocb_se_handler: 4872 push %rsi 4873 push %rdi 4874 push %rbx 4875 push %rbp 4876 push %r12 4877 push %r13 4878 push %r14 4879 push %r15 4880 pushfq 4881 sub \$64,%rsp 4882 4883 mov 120($context),%rax # pull context->Rax 4884 mov 248($context),%rbx # pull context->Rip 4885 4886 mov 8($disp),%rsi # disp->ImageBase 4887 mov 56($disp),%r11 # disp->HandlerData 4888 4889 mov 0(%r11),%r10d # HandlerData[0] 4890 lea (%rsi,%r10),%r10 # prologue label 4891 cmp %r10,%rbx # context->Rip<prologue label 4892 jb .Lcommon_seh_tail 4893 4894 mov 4(%r11),%r10d # HandlerData[1] 4895 lea (%rsi,%r10),%r10 # epilogue label 4896 cmp %r10,%rbx # context->Rip>=epilogue label 4897 jae .Lcommon_seh_tail 4898 4899 mov 8(%r11),%r10d # HandlerData[2] 4900 lea (%rsi,%r10),%r10 4901 cmp %r10,%rbx # context->Rip>=pop label 4902 jae .Locb_no_xmm 4903 4904 mov 152($context),%rax # pull context->Rsp 4905 4906 lea (%rax),%rsi # %xmm save area 4907 lea 512($context),%rdi # & context.Xmm6 4908 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4909 .long 0xa548f3fc # cld; rep movsq 4910 lea 0xa0+0x28(%rax),%rax 4911 4912.Locb_no_xmm: 4913 mov -8(%rax),%rbx 4914 mov -16(%rax),%rbp 4915 mov -24(%rax),%r12 4916 mov -32(%rax),%r13 4917 mov -40(%rax),%r14 4918 4919 mov %rbx,144($context) # restore context->Rbx 4920 mov %rbp,160($context) # restore context->Rbp 4921 mov %r12,216($context) # restore context->R12 4922 mov %r13,224($context) # restore context->R13 4923 mov %r14,232($context) # restore context->R14 4924 4925 jmp .Lcommon_seh_tail 4926.size ocb_se_handler,.-ocb_se_handler 4927___ 4928$code.=<<___; 4929.type cbc_se_handler,\@abi-omnipotent 4930.align 16 4931cbc_se_handler: 4932 push %rsi 4933 push %rdi 4934 push %rbx 4935 push %rbp 4936 push %r12 4937 push %r13 4938 push %r14 4939 push %r15 4940 pushfq 4941 sub \$64,%rsp 4942 4943 mov 152($context),%rax # pull context->Rsp 4944 mov 248($context),%rbx # pull context->Rip 4945 4946 lea .Lcbc_decrypt_bulk(%rip),%r10 4947 cmp %r10,%rbx # context->Rip<"prologue" label 4948 jb .Lcommon_seh_tail 4949 4950 mov 120($context),%rax # pull context->Rax 4951 4952 lea .Lcbc_decrypt_body(%rip),%r10 4953 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4954 jb .Lcommon_seh_tail 4955 4956 mov 152($context),%rax # pull context->Rsp 4957 4958 lea .Lcbc_ret(%rip),%r10 4959 cmp %r10,%rbx # context->Rip>="epilogue" label 4960 jae .Lcommon_seh_tail 4961 4962 lea 16(%rax),%rsi # %xmm save area 4963 lea 512($context),%rdi # &context.Xmm6 4964 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4965 .long 0xa548f3fc # cld; rep movsq 4966 4967 mov 208($context),%rax # pull context->R11 4968 4969 mov -8(%rax),%rbp # restore saved %rbp 4970 mov %rbp,160($context) # restore context->Rbp 4971 4972.Lcommon_seh_tail: 4973 mov 8(%rax),%rdi 4974 mov 16(%rax),%rsi 4975 mov %rax,152($context) # restore context->Rsp 4976 mov %rsi,168($context) # restore context->Rsi 4977 mov %rdi,176($context) # restore context->Rdi 4978 4979 mov 40($disp),%rdi # disp->ContextRecord 4980 mov $context,%rsi # context 4981 mov \$154,%ecx # sizeof(CONTEXT) 4982 .long 0xa548f3fc # cld; rep movsq 4983 4984 mov $disp,%rsi 4985 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4986 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4987 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4988 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4989 mov 40(%rsi),%r10 # disp->ContextRecord 4990 lea 56(%rsi),%r11 # &disp->HandlerData 4991 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4992 mov %r10,32(%rsp) # arg5 4993 mov %r11,40(%rsp) # arg6 4994 mov %r12,48(%rsp) # arg7 4995 mov %rcx,56(%rsp) # arg8, (NULL) 4996 call *__imp_RtlVirtualUnwind(%rip) 4997 4998 mov \$1,%eax # ExceptionContinueSearch 4999 add \$64,%rsp 5000 popfq 5001 pop %r15 5002 pop %r14 5003 pop %r13 5004 pop %r12 5005 pop %rbp 5006 pop %rbx 5007 pop %rdi 5008 pop %rsi 5009 ret 5010.size cbc_se_handler,.-cbc_se_handler 5011 5012.section .pdata 5013.align 4 5014___ 5015$code.=<<___ if ($PREFIX eq "aesni"); 5016 .rva .LSEH_begin_aesni_ecb_encrypt 5017 .rva .LSEH_end_aesni_ecb_encrypt 5018 .rva .LSEH_info_ecb 5019 5020 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks 5021 .rva .LSEH_end_aesni_ccm64_encrypt_blocks 5022 .rva .LSEH_info_ccm64_enc 5023 5024 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks 5025 .rva .LSEH_end_aesni_ccm64_decrypt_blocks 5026 .rva .LSEH_info_ccm64_dec 5027 5028 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks 5029 .rva .LSEH_end_aesni_ctr32_encrypt_blocks 5030 .rva .LSEH_info_ctr32 5031 5032 .rva .LSEH_begin_aesni_xts_encrypt 5033 .rva .LSEH_end_aesni_xts_encrypt 5034 .rva .LSEH_info_xts_enc 5035 5036 .rva .LSEH_begin_aesni_xts_decrypt 5037 .rva .LSEH_end_aesni_xts_decrypt 5038 .rva .LSEH_info_xts_dec 5039 5040 .rva .LSEH_begin_aesni_ocb_encrypt 5041 .rva .LSEH_end_aesni_ocb_encrypt 5042 .rva .LSEH_info_ocb_enc 5043 5044 .rva .LSEH_begin_aesni_ocb_decrypt 5045 .rva .LSEH_end_aesni_ocb_decrypt 5046 .rva .LSEH_info_ocb_dec 5047___ 5048$code.=<<___; 5049 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5050 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5051 .rva .LSEH_info_cbc 5052 5053 .rva ${PREFIX}_set_decrypt_key 5054 .rva .LSEH_end_set_decrypt_key 5055 .rva .LSEH_info_key 5056 5057 .rva ${PREFIX}_set_encrypt_key 5058 .rva .LSEH_end_set_encrypt_key 5059 .rva .LSEH_info_key 5060.section .xdata 5061.align 8 5062___ 5063$code.=<<___ if ($PREFIX eq "aesni"); 5064.LSEH_info_ecb: 5065 .byte 9,0,0,0 5066 .rva ecb_ccm64_se_handler 5067 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5068.LSEH_info_ccm64_enc: 5069 .byte 9,0,0,0 5070 .rva ecb_ccm64_se_handler 5071 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] 5072.LSEH_info_ccm64_dec: 5073 .byte 9,0,0,0 5074 .rva ecb_ccm64_se_handler 5075 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] 5076.LSEH_info_ctr32: 5077 .byte 9,0,0,0 5078 .rva ctr_xts_se_handler 5079 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5080.LSEH_info_xts_enc: 5081 .byte 9,0,0,0 5082 .rva ctr_xts_se_handler 5083 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 5084.LSEH_info_xts_dec: 5085 .byte 9,0,0,0 5086 .rva ctr_xts_se_handler 5087 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 5088.LSEH_info_ocb_enc: 5089 .byte 9,0,0,0 5090 .rva ocb_se_handler 5091 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[] 5092 .rva .Locb_enc_pop 5093 .long 0 5094.LSEH_info_ocb_dec: 5095 .byte 9,0,0,0 5096 .rva ocb_se_handler 5097 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[] 5098 .rva .Locb_dec_pop 5099 .long 0 5100___ 5101$code.=<<___; 5102.LSEH_info_cbc: 5103 .byte 9,0,0,0 5104 .rva cbc_se_handler 5105.LSEH_info_key: 5106 .byte 0x01,0x04,0x01,0x00 5107 .byte 0x04,0x02,0x00,0x00 # sub rsp,8 5108___ 5109} 5110 5111sub rex { 5112 local *opcode=shift; 5113 my ($dst,$src)=@_; 5114 my $rex=0; 5115 5116 $rex|=0x04 if($dst>=8); 5117 $rex|=0x01 if($src>=8); 5118 push @opcode,$rex|0x40 if($rex); 5119} 5120 5121sub aesni { 5122 my $line=shift; 5123 my @opcode=(0x66); 5124 5125 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5126 rex(\@opcode,$4,$3); 5127 push @opcode,0x0f,0x3a,0xdf; 5128 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M 5129 my $c=$2; 5130 push @opcode,$c=~/^0/?oct($c):$c; 5131 return ".byte\t".join(',',@opcode); 5132 } 5133 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5134 my %opcodelet = ( 5135 "aesimc" => 0xdb, 5136 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5137 "aesdec" => 0xde, "aesdeclast" => 0xdf 5138 ); 5139 return undef if (!defined($opcodelet{$1})); 5140 rex(\@opcode,$3,$2); 5141 push @opcode,0x0f,0x38,$opcodelet{$1}; 5142 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 5143 return ".byte\t".join(',',@opcode); 5144 } 5145 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { 5146 my %opcodelet = ( 5147 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5148 "aesdec" => 0xde, "aesdeclast" => 0xdf 5149 ); 5150 return undef if (!defined($opcodelet{$1})); 5151 my $off = $2; 5152 push @opcode,0x44 if ($3>=8); 5153 push @opcode,0x0f,0x38,$opcodelet{$1}; 5154 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M 5155 push @opcode,($off=~/^0/?oct($off):$off)&0xff; 5156 return ".byte\t".join(',',@opcode); 5157 } 5158 return $line; 5159} 5160 5161sub movbe { 5162 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; 5163} 5164 5165$code =~ s/\`([^\`]*)\`/eval($1)/gem; 5166$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; 5167#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact 5168$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; 5169 5170print $code; 5171 5172close STDOUT or die "error closing STDOUT: $!"; 5173