#! /usr/bin/env perl
# Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# What started as a transliteration of the original code to "perlasm"
# has undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - above was possible thanks to a mixcolumns() modification that
#   allows feeding its output back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of a "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.88		+11%
# Atom		17.1		16.4		+4%
# Silvermont	-		12.9
# Goldmont	-		8.85
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing of input, as in Emilia's CTR implementation, is
#	performed. However, the CTR calculations account for no more
#	than 1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the OpenSSL interface
# relies on per-invocation on-the-fly conversion. This naturally has an
# impact on performance, especially for short inputs. Conversion time
# in CPU cycles and its ratio to the CPU cycles spent in the 8x block
# function is:
#
#			conversion	conversion/8x block
# Core 2		240		0.22
# Nehalem		180		0.20
# Atom			430		0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than the ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure.
# Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2	9.98
# Nehalem	7.80
# Atom		17.9
# Silvermont	14.0
# Goldmont	10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-byte-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
# <appro@openssl.org>

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
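
# For illustration, a scalar model (kept out of code generation) of what
# the sequence above computes per bit position: GF(4) multiplication in
# normal-basis representation, i.e. result = (x0&y0 ^ t, x1&y1 ^ t) with
# t = (x0^x1)&(y0^y1).  The loop replays the instructions on single bits
# and checks them against that formula for all 16 operand combinations.
if (0) {	# flip 0 to 1 to run the self-check stand-alone
    for my $v (0..15) {
	my ($x0,$x1,$y0,$y1) = map { ($v>>$_)&1 } (3,2,1,0);
	my $t0 = ($y0 ^ $y1) & $x0;		# movdqa/pxor/pand
	my ($a0,$a1) = ($x0 ^ $x1, $x1 & $y0);	# pxor/pand
	$a0 &= $y1;				# pand
	$a0 ^= $a1;				# pxor
	$a1 ^= $t0;				# pxor
	my $t = ($x0 ^ $x1) & ($y0 ^ $y1);	# normal-basis reference
	die "Mul_GF4 mismatch" if $a0 != (($x0 & $y0) ^ $t)
			       or $a1 != (($x1 & $y1) ^ $t);
    }
}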

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
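
# The routines above amount to inversion in GF(2^8) computed in a
# tower-field representation.  As a stand-alone illustration (this block
# is never executed), composing a plain GF(2^8) inversion modulo 0x11b
# with the AES affine map reproduces the S-box, e.g. S(0x00)=0x63 and
# S(0x53)=0xed:
if (0) {	# flip 0 to 1 to run stand-alone
    my $gmul = sub { my ($a,$b,$p)=(@_,0);
	for (0..7) {
	    $p ^= $a if $b & 1;
	    $a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0);
	    $b >>= 1;
	}
	$p;
    };
    my $inv = sub { my $x = shift; return 0 if !$x;
	my $r = 1; $r = $gmul->($r,$x) for (1..254);	# x^254 = x^-1
	$r;
    };
    my $sbox = sub { my $x = $inv->(shift); my $y = $x ^ 0x63;
	$y ^= ((($x << $_) | ($x >> (8-$_))) & 0xff) for (1..4);
	$y;
    };
    die "S-box check failed" unless $sbox->(0x00)==0x63 && $sbox->(0x53)==0xed;
}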

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}

sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	movdqa	@t[6], @x[2]
	movdqa	@t[1], @x[7]
	movdqa	@x[6], @x[4]
	movdqa	@t[3], @x[6]
___
}
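
# For reference, the same transform in plain (non-bitsliced) form: every
# output byte of a column is 02*a0 ^ 03*a1 ^ a2 ^ a3 over GF(2^8).  In
# the bit-sliced code above the rotations come for free via pshufd, and
# the multiplication by 02 becomes a shift of bit planes in which planes
# 0,1,3,4 pick up plane 7 (the 0x1b reduction).  Illustrative check (not
# executed) against the classic column {db,13,53,45} -> {8e,4d,a1,bc}:
if (0) {	# flip 0 to 1 to run stand-alone
    my $xtime  = sub { my $a = shift; (($a<<1) ^ (($a & 0x80) ? 0x1b : 0)) & 0xff };
    my $mixcol = sub { my @a = @_;
	map { my $i = $_;
	      $xtime->($a[$i]) ^ $xtime->($a[($i+1)%4]) ^ $a[($i+1)%4]
			       ^ $a[($i+2)%4] ^ $a[($i+3)%4];
	} (0..3);
    };
    my @c = $mixcol->(0xdb,0x13,0x53,0x45);
    die "MixColumns check failed" unless "@c" eq "@{[0x8e,0x4d,0xa1,0xbc]}";
}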

sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
	my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing a pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	pxor	@t[6], @x[0]
	pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	pxor	@t[7], @x[1]
	pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	pxor	@t[7], @x[2]
	pxor	@t[6], @x[3]
	pxor	@t[6], @x[4]
	pxor	@t[3], @x[5]
	pxor	@t[4], @x[6]
	pxor	@t[7], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
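
# Sanity check of the factorization above (for illustration only, never
# executed): row j of the product of two circulant matrices is the
# convolution c[j] = XOR_k a[k]*b[(j-k) mod 4] over GF(2^8), and with
# a=(02,03,01,01), b=(05,00,04,00) it must come out as (0e,0b,0d,09):
if (0) {	# flip 0 to 1 to run stand-alone
    my $gmul = sub { my ($a,$b,$p)=(@_,0);
	for (0..7) { $p ^= $a if $b & 1;
		     $a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0); $b >>= 1; }
	$p;
    };
    my @a = (0x02,0x03,0x01,0x01);	# MixColumns row
    my @b = (0x05,0x00,0x04,0x00);	# the extra factor
    my @c = map { my $j = $_; my $r = 0;
		  $r ^= $gmul->($a[$_], $b[($j-$_) % 4]) for (0..3); $r } (0..3);
    die "factorization check failed" unless "@c" eq "@{[0x0e,0x0b,0x0d,0x09]}";
}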

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
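
# A scalar model of the swapmove primitive the three passes above are
# built from (illustration only, never executed): for every bit position
# p set in $mask, bit p of $a is exchanged with bit p+$n of $b.  Applied
# with masks 0x55../0x33../0x0f.. and shifts 1/2/4 this transposes an
# 8x8 bit matrix, which is what converts byte-order data to bit-sliced
# form and back.
if (0) {	# flip 0 to 1 to run stand-alone
    my $swapmove = sub { my ($a,$b,$n,$mask) = @_;
	my $t = (($b >> $n) ^ $a) & $mask;	# movdqa/psrlq/pxor/pand
	(($a ^ $t), ($b ^ ($t << $n)));		# pxor/psllq/pxor
    };
    my ($a,$b) = (0xa3, 0x5c);
    my ($A,$B) = $swapmove->($a,$b,1,0x55);
    for my $p (0,2,4,6) {	# even positions carry the swapped bits
	die "swapmove check failed"
	    unless (($A>>$p)&1) == (($b>>($p+1))&1)
	       and (($B>>($p+1))&1) == (($a>>$p)&1);
    }
}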

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
.cfi_startproc
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.cfi_endproc
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}

if (0 && !$win64) {	# following four functions are an unsupported
			# interface, used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	?
				("%rcx","%rdx","%r8","%r9","%r10","%r11d")
			:	("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp),%rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3],
 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lecb_enc_epilogue:
	ret
.cfi_endproc
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
.cfi_startproc
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp),%rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb
 .Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	jb	.Lecb_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lecb_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lecb_dec_epilogue:
	ret
.cfi_endproc
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
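
# The CBC code below decrypts eight blocks at a time and applies the
# chaining XOR afterwards, re-loading the ciphertext that doubles as the
# chaining value: P[i] = D(C[i]) ^ C[i-1], with C[-1] being the IV.  A
# scalar model of that dataflow (illustration only, with a trivial
# stand-in for the block cipher):
if (0) {	# flip 0 to 1 to run stand-alone
	my $D  = sub { $_[0] ^ 0x5a };	# stand-in for one-block AES decryption
	my $iv = 0x99;
	my @c  = (0x11, 0x22, 0x33);	# "ciphertext" blocks
	my @p  = map { $D->($c[$_]) ^ ($_ ? $c[$_-1] : $iv) } (0..$#c);
	# block i depends only on C[i] and C[i-1], so all blocks can be
	# decrypted in parallel -- which is what the 8x interleaving
	# exploits; only the last C has to be carried over as next IV.
}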
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	ossl_bsaes_cbc_encrypt
.type	ossl_bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
ossl_bsaes_cbc_encrypt:
.cfi_startproc
	endbranch
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lcbc_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lcbc_dec_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl	ossl_bsaes_ctr32_encrypt_blocks
.type	ossl_bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
ossl_bsaes_ctr32_encrypt_blocks:
.cfi_startproc
	endbranch
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[0]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[2]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[4]
	pshufb	@XMM[8], @XMM[5]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
	lea	.LBS0(%rip), %r11	# constants table
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lctr_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lctr_enc_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
___
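
# Each XTS tweak below is obtained from the previous one by doubling in
# GF(2^128) modulo x^128+x^7+x^2+x+1: shift left by one bit and, if a
# bit falls off the top, fold it back in as 0x87.  The pcmpgtd/pshufd/
# paddq/pand sequence around .Lxts_magic is a two-lane SSE2 rendition of
# just that.  A scalar model on two 64-bit halves (illustration only,
# assumes a 64-bit perl):
if (0) {	# flip 0 to 1 to run stand-alone
	my ($lo,$hi) = (0x0123456789abcdef, 0x8000000000000000);
	my $carry = ($hi >> 63) & 1;			# bit 127
	$hi = (($hi << 1) | (($lo >> 63) & 1)) & 0xffffffffffffffff;
	$lo = ($lo << 1) & 0xffffffffffffffff;
	$lo ^= 0x87 if $carry;				# fold carry back in
	printf("tweak*2 = %016x%016x\n", $hi, $lo);
}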
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

$code.=<<___;
.globl	ossl_bsaes_xts_encrypt
.type	ossl_bsaes_xts_encrypt,\@abi-omnipotent
.align	16
ossl_bsaes_xts_encrypt:
.cfi_startproc
	endbranch
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

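	# The five instructions above double the tweak in GF(2^128) with
	# reduction polynomial x^128+x^7+x^2+x+1: pcmpgtd broadcasts the
	# sign bit of each dword, pshufd \$0x13 routes the carries out of
	# bits 63 and 127 to bits 64 and 0, and .Lxts_magic masks them to
	# 1 and 0x87 respectively. An equivalent scalar sketch
	# (little-endian 64-bit halves, illustrative only):
	#
	#	uint64_t lo = t[0], hi = t[1];
	#	uint64_t c  = (uint64_t)((int64_t)hi >> 63);	/* bit 127 */
	#	t[1] = (hi << 1) | (lo >> 63);
	#	t[0] = (lo << 1) ^ (c & 0x87);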
	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
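	# Lone block: bit-slicing a single block gains nothing, so fall
	# back to the scalar asm_AES_encrypt, using 0x20(%rbp) as scratch
	# for the tweaked input and output.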
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

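	# Ciphertext stealing: swap the len%16 trailing plaintext bytes
	# with the head of the last ciphertext block just written, then
	# re-encrypt that block with the next tweak. Conceptually
	# (n = len%16, illustrative only):
	#	C[m]   = first n bytes of C[m-1]
	#	C[m-1] = Enc(K1, (P[m] || tail of C[m-1]) ^ T[m]) ^ T[m]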
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_enc_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

.globl	ossl_bsaes_xts_decrypt
.type	ossl_bsaes_xts_decrypt,\@abi-omnipotent
.align	16
ossl_bsaes_xts_decrypt:
.cfi_startproc
	endbranch
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
.cfi_def_cfa_register	%rbp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

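	# If the input ends in a partial block, hold back one full block:
	# XTS ciphertext stealing decrypts the last full block with the
	# tweak that follows it, so that block is handled at
	# .Lxts_dec_done once the extra tweak is available.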
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

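	# Ciphertext stealing, decrypt side: the held-back full block is
	# decrypted with the *next* tweak T[m] first (computed below, with
	# T[m-1] kept aside for the final step); the reassembled block is
	# then decrypted with T[m-1]. Conceptually (n = len%16):
	#	P[m]   = first n bytes of Dec(K1, C[m-1] ^ T[m]) ^ T[m]
	#	P[m-1] = Dec(K1, (C[m] || stolen tail) ^ T[m-1]) ^ T[m-1]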
.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

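	# Common exit: clear the stack-resident bit-sliced key schedule
	# in 32-byte steps, everything between %rsp and the frame base in
	# %rbp, before returning.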
.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_dec_tail:
___
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
___
}
$code.=<<___;
.type	_bsaes_const,\@object
.section	.rodata align=64
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
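	# .Lxts_magic below is {0x87,0,1,0} as dwords: 0x87 reduces the
	# carry out of bit 127 (x^128 = x^7+x^2+x+1), while the 1 in the
	# third dword re-injects the bit-63 carry at bit 64 after paddq.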
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.align	64
.size	_bsaes_const,.-_bsaes_const
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# tail label
	cmp	%r10,%rbx		# context->Rip>=tail label
	jae	.Lin_tail

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer

.Lin_tail:
	mov	-48(%rax),%rbp
	mov	-40(%rax),%rbx
	mov	-32(%rax),%r12
	mov	-24(%rax),%r13
	mov	-16(%rax),%r14
	mov	-8(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
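# Each .xdata record below is an UNWIND_INFO block with flags
# UNW_FLAG_EHANDLER (the 9 in ".byte 9,0,0,0"), pointing at se_handler;
# HandlerData[] is the body/epilogue/tail label triple that se_handler
# compares against context->Rip above to decide how much state to unwind.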
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
	.rva	.Lecb_enc_tail
	.long	0
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
	.rva	.Lecb_dec_tail
	.long	0
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
	.rva	.Lcbc_dec_tail
	.long	0
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
	.rva	.Lctr_enc_tail
	.long	0
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
	.rva	.Lxts_enc_tail
	.long	0
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
	.rva	.Lxts_dec_tail
	.long	0
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";