1#! /usr/bin/env perl 2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for 20# details]. 21# 22# Performance. 23# 24# To start with see corresponding paragraph in aesni-x86_64.pl... 25# Instead of filling table similar to one found there I've chosen to 26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 27# The simplified table below represents 32-bit performance relative 28# to 64-bit one in every given point. Ratios vary for different 29# encryption modes, therefore interval values. 30# 31# 16-byte 64-byte 256-byte 1-KB 8-KB 32# 53-67% 67-84% 91-94% 95-98% 97-99.5% 33# 34# Lower ratios for smaller block sizes are perfectly understandable, 35# because function call overhead is higher in 32-bit mode. Largest 36# 8-KB block performance is virtually same: 32-bit code is less than 37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 38 39# January 2011 40# 41# See aesni-x86_64.pl for details. Unlike x86_64 version this module 42# interleaves at most 6 aes[enc|dec] instructions, because there are 43# not enough registers for 8x interleave [which should be optimal for 44# Sandy Bridge]. Actually, performance results for 6x interleave 45# factor presented in aesni-x86_64.pl (except for CTR) are for this 46# module. 47 48# April 2011 49# 50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing 51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 52 53# November 2015 54# 55# Add aesni_ocb_[en|de]crypt. 56 57###################################################################### 58# Current large-block performance in cycles per byte processed with 59# 128-bit key (less is better). 60# 61# CBC en-/decrypt CTR XTS ECB OCB 62# Westmere 3.77/1.37 1.37 1.52 1.27 63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 65# Skylake 2.68/0.65 0.65 0.66 0.64 0.66 66# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 67# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70 68# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 69 70$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 71 # generates drop-in replacement for 72 # crypto/aes/asm/aes-586.pl:-) 73$inline=1; # inline _aesni_[en|de]crypt 74 75$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 76push(@INC,"${dir}","${dir}../../perlasm"); 77require "x86asm.pl"; 78 79$output = pop and open STDOUT,">$output"; 80 81&asm_init($ARGV[0]); 82 83&external_label("OPENSSL_ia32cap_P"); 84&static_label("key_const"); 85 86if ($PREFIX eq "aesni") { $movekey=\&movups; } 87else { $movekey=\&movups; } 88 89$len="eax"; 90$rounds="ecx"; 91$key="edx"; 92$inp="esi"; 93$out="edi"; 94$rounds_="ebx"; # backup copy for $rounds 95$key_="ebp"; # backup copy for $key 96 97$rndkey0="xmm0"; 98$rndkey1="xmm1"; 99$inout0="xmm2"; 100$inout1="xmm3"; 101$inout2="xmm4"; 102$inout3="xmm5"; $in1="xmm5"; 103$inout4="xmm6"; $in0="xmm6"; 104$inout5="xmm7"; $ivec="xmm7"; 105 106# AESNI extension 107sub aeskeygenassist 108{ my($dst,$src,$imm)=@_; 109 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 110 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } 111} 112sub aescommon 113{ my($opcodelet,$dst,$src)=@_; 114 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 115 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} 116} 117sub aesimc { aescommon(0xdb,@_); } 118sub aesenc { aescommon(0xdc,@_); } 119sub aesenclast { aescommon(0xdd,@_); } 120sub aesdec { aescommon(0xde,@_); } 121sub aesdeclast { aescommon(0xdf,@_); } 122 123# Inline version of internal aesni_[en|de]crypt1 124{ my $sn; 125sub aesni_inline_generate1 126{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 127 $sn++; 128 129 &$movekey ($rndkey0,&QWP(0,$key)); 130 &$movekey ($rndkey1,&QWP(16,$key)); 131 &xorps ($ivec,$rndkey0) if (defined($ivec)); 132 &lea ($key,&DWP(32,$key)); 133 &xorps ($inout,$ivec) if (defined($ivec)); 134 &xorps ($inout,$rndkey0) if (!defined($ivec)); 135 &set_label("${p}1_loop_$sn"); 136 eval"&aes${p} ($inout,$rndkey1)"; 137 &dec ($rounds); 138 &$movekey ($rndkey1,&QWP(0,$key)); 139 &lea ($key,&DWP(16,$key)); 140 &jnz (&label("${p}1_loop_$sn")); 141 eval"&aes${p}last ($inout,$rndkey1)"; 142}} 143 144sub aesni_generate1 # fully unrolled loop 145{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); 146 147 &function_begin_B("_aesni_${p}rypt1"); 148 &movups ($rndkey0,&QWP(0,$key)); 149 &$movekey ($rndkey1,&QWP(0x10,$key)); 150 &xorps ($inout,$rndkey0); 151 &$movekey ($rndkey0,&QWP(0x20,$key)); 152 &lea ($key,&DWP(0x30,$key)); 153 &cmp ($rounds,11); 154 &jb (&label("${p}128")); 155 &lea ($key,&DWP(0x20,$key)); 156 &je (&label("${p}192")); 157 &lea ($key,&DWP(0x20,$key)); 158 eval"&aes${p} ($inout,$rndkey1)"; 159 &$movekey ($rndkey1,&QWP(-0x40,$key)); 160 eval"&aes${p} ($inout,$rndkey0)"; 161 &$movekey ($rndkey0,&QWP(-0x30,$key)); 162 &set_label("${p}192"); 163 eval"&aes${p} ($inout,$rndkey1)"; 164 &$movekey ($rndkey1,&QWP(-0x20,$key)); 165 eval"&aes${p} ($inout,$rndkey0)"; 166 &$movekey ($rndkey0,&QWP(-0x10,$key)); 167 &set_label("${p}128"); 168 eval"&aes${p} ($inout,$rndkey1)"; 169 &$movekey ($rndkey1,&QWP(0,$key)); 170 eval"&aes${p} ($inout,$rndkey0)"; 171 &$movekey ($rndkey0,&QWP(0x10,$key)); 172 eval"&aes${p} ($inout,$rndkey1)"; 173 &$movekey ($rndkey1,&QWP(0x20,$key)); 174 eval"&aes${p} ($inout,$rndkey0)"; 175 &$movekey ($rndkey0,&QWP(0x30,$key)); 176 eval"&aes${p} ($inout,$rndkey1)"; 177 &$movekey ($rndkey1,&QWP(0x40,$key)); 178 eval"&aes${p} ($inout,$rndkey0)"; 179 &$movekey ($rndkey0,&QWP(0x50,$key)); 180 eval"&aes${p} ($inout,$rndkey1)"; 181 &$movekey ($rndkey1,&QWP(0x60,$key)); 182 eval"&aes${p} ($inout,$rndkey0)"; 183 &$movekey ($rndkey0,&QWP(0x70,$key)); 184 eval"&aes${p} ($inout,$rndkey1)"; 185 eval"&aes${p}last ($inout,$rndkey0)"; 186 &ret(); 187 &function_end_B("_aesni_${p}rypt1"); 188} 189 190# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); 191&aesni_generate1("enc") if (!$inline); 192&function_begin_B("${PREFIX}_encrypt"); 193 &mov ("eax",&wparam(0)); 194 &mov ($key,&wparam(2)); 195 &movups ($inout0,&QWP(0,"eax")); 196 &mov ($rounds,&DWP(240,$key)); 197 &mov ("eax",&wparam(1)); 198 if ($inline) 199 { &aesni_inline_generate1("enc"); } 200 else 201 { &call ("_aesni_encrypt1"); } 202 &pxor ($rndkey0,$rndkey0); # clear register bank 203 &pxor ($rndkey1,$rndkey1); 204 &movups (&QWP(0,"eax"),$inout0); 205 &pxor ($inout0,$inout0); 206 &ret (); 207&function_end_B("${PREFIX}_encrypt"); 208 209# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); 210&aesni_generate1("dec") if(!$inline); 211&function_begin_B("${PREFIX}_decrypt"); 212 &mov ("eax",&wparam(0)); 213 &mov ($key,&wparam(2)); 214 &movups ($inout0,&QWP(0,"eax")); 215 &mov ($rounds,&DWP(240,$key)); 216 &mov ("eax",&wparam(1)); 217 if ($inline) 218 { &aesni_inline_generate1("dec"); } 219 else 220 { &call ("_aesni_decrypt1"); } 221 &pxor ($rndkey0,$rndkey0); # clear register bank 222 &pxor ($rndkey1,$rndkey1); 223 &movups (&QWP(0,"eax"),$inout0); 224 &pxor ($inout0,$inout0); 225 &ret (); 226&function_end_B("${PREFIX}_decrypt"); 227 228# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 229# factor. Why 3x subroutine were originally used in loops? Even though 230# aes[enc|dec] latency was originally 6, it could be scheduled only 231# every *2nd* cycle. Thus 3x interleave was the one providing optimal 232# utilization, i.e. when subroutine's throughput is virtually same as 233# of non-interleaved subroutine [for number of input blocks up to 3]. 234# This is why it originally made no sense to implement 2x subroutine. 235# But times change and it became appropriate to spend extra 192 bytes 236# on 2x subroutine on Atom Silvermont account. For processors that 237# can schedule aes[enc|dec] every cycle optimal interleave factor 238# equals to corresponding instructions latency. 8x is optimal for 239# * Bridge, but it's unfeasible to accommodate such implementation 240# in XMM registers addressable in 32-bit mode and therefore maximum 241# of 6x is used instead... 242 243sub aesni_generate2 244{ my $p=shift; 245 246 &function_begin_B("_aesni_${p}rypt2"); 247 &$movekey ($rndkey0,&QWP(0,$key)); 248 &shl ($rounds,4); 249 &$movekey ($rndkey1,&QWP(16,$key)); 250 &xorps ($inout0,$rndkey0); 251 &pxor ($inout1,$rndkey0); 252 &$movekey ($rndkey0,&QWP(32,$key)); 253 &lea ($key,&DWP(32,$key,$rounds)); 254 &neg ($rounds); 255 &add ($rounds,16); 256 257 &set_label("${p}2_loop"); 258 eval"&aes${p} ($inout0,$rndkey1)"; 259 eval"&aes${p} ($inout1,$rndkey1)"; 260 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 261 &add ($rounds,32); 262 eval"&aes${p} ($inout0,$rndkey0)"; 263 eval"&aes${p} ($inout1,$rndkey0)"; 264 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 265 &jnz (&label("${p}2_loop")); 266 eval"&aes${p} ($inout0,$rndkey1)"; 267 eval"&aes${p} ($inout1,$rndkey1)"; 268 eval"&aes${p}last ($inout0,$rndkey0)"; 269 eval"&aes${p}last ($inout1,$rndkey0)"; 270 &ret(); 271 &function_end_B("_aesni_${p}rypt2"); 272} 273 274sub aesni_generate3 275{ my $p=shift; 276 277 &function_begin_B("_aesni_${p}rypt3"); 278 &$movekey ($rndkey0,&QWP(0,$key)); 279 &shl ($rounds,4); 280 &$movekey ($rndkey1,&QWP(16,$key)); 281 &xorps ($inout0,$rndkey0); 282 &pxor ($inout1,$rndkey0); 283 &pxor ($inout2,$rndkey0); 284 &$movekey ($rndkey0,&QWP(32,$key)); 285 &lea ($key,&DWP(32,$key,$rounds)); 286 &neg ($rounds); 287 &add ($rounds,16); 288 289 &set_label("${p}3_loop"); 290 eval"&aes${p} ($inout0,$rndkey1)"; 291 eval"&aes${p} ($inout1,$rndkey1)"; 292 eval"&aes${p} ($inout2,$rndkey1)"; 293 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 294 &add ($rounds,32); 295 eval"&aes${p} ($inout0,$rndkey0)"; 296 eval"&aes${p} ($inout1,$rndkey0)"; 297 eval"&aes${p} ($inout2,$rndkey0)"; 298 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 299 &jnz (&label("${p}3_loop")); 300 eval"&aes${p} ($inout0,$rndkey1)"; 301 eval"&aes${p} ($inout1,$rndkey1)"; 302 eval"&aes${p} ($inout2,$rndkey1)"; 303 eval"&aes${p}last ($inout0,$rndkey0)"; 304 eval"&aes${p}last ($inout1,$rndkey0)"; 305 eval"&aes${p}last ($inout2,$rndkey0)"; 306 &ret(); 307 &function_end_B("_aesni_${p}rypt3"); 308} 309 310# 4x interleave is implemented to improve small block performance, 311# most notably [and naturally] 4 block by ~30%. One can argue that one 312# should have implemented 5x as well, but improvement would be <20%, 313# so it's not worth it... 314sub aesni_generate4 315{ my $p=shift; 316 317 &function_begin_B("_aesni_${p}rypt4"); 318 &$movekey ($rndkey0,&QWP(0,$key)); 319 &$movekey ($rndkey1,&QWP(16,$key)); 320 &shl ($rounds,4); 321 &xorps ($inout0,$rndkey0); 322 &pxor ($inout1,$rndkey0); 323 &pxor ($inout2,$rndkey0); 324 &pxor ($inout3,$rndkey0); 325 &$movekey ($rndkey0,&QWP(32,$key)); 326 &lea ($key,&DWP(32,$key,$rounds)); 327 &neg ($rounds); 328 &data_byte (0x0f,0x1f,0x40,0x00); 329 &add ($rounds,16); 330 331 &set_label("${p}4_loop"); 332 eval"&aes${p} ($inout0,$rndkey1)"; 333 eval"&aes${p} ($inout1,$rndkey1)"; 334 eval"&aes${p} ($inout2,$rndkey1)"; 335 eval"&aes${p} ($inout3,$rndkey1)"; 336 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 337 &add ($rounds,32); 338 eval"&aes${p} ($inout0,$rndkey0)"; 339 eval"&aes${p} ($inout1,$rndkey0)"; 340 eval"&aes${p} ($inout2,$rndkey0)"; 341 eval"&aes${p} ($inout3,$rndkey0)"; 342 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 343 &jnz (&label("${p}4_loop")); 344 345 eval"&aes${p} ($inout0,$rndkey1)"; 346 eval"&aes${p} ($inout1,$rndkey1)"; 347 eval"&aes${p} ($inout2,$rndkey1)"; 348 eval"&aes${p} ($inout3,$rndkey1)"; 349 eval"&aes${p}last ($inout0,$rndkey0)"; 350 eval"&aes${p}last ($inout1,$rndkey0)"; 351 eval"&aes${p}last ($inout2,$rndkey0)"; 352 eval"&aes${p}last ($inout3,$rndkey0)"; 353 &ret(); 354 &function_end_B("_aesni_${p}rypt4"); 355} 356 357sub aesni_generate6 358{ my $p=shift; 359 360 &function_begin_B("_aesni_${p}rypt6"); 361 &static_label("_aesni_${p}rypt6_enter"); 362 &$movekey ($rndkey0,&QWP(0,$key)); 363 &shl ($rounds,4); 364 &$movekey ($rndkey1,&QWP(16,$key)); 365 &xorps ($inout0,$rndkey0); 366 &pxor ($inout1,$rndkey0); # pxor does better here 367 &pxor ($inout2,$rndkey0); 368 eval"&aes${p} ($inout0,$rndkey1)"; 369 &pxor ($inout3,$rndkey0); 370 &pxor ($inout4,$rndkey0); 371 eval"&aes${p} ($inout1,$rndkey1)"; 372 &lea ($key,&DWP(32,$key,$rounds)); 373 &neg ($rounds); 374 eval"&aes${p} ($inout2,$rndkey1)"; 375 &pxor ($inout5,$rndkey0); 376 &$movekey ($rndkey0,&QWP(0,$key,$rounds)); 377 &add ($rounds,16); 378 &jmp (&label("_aesni_${p}rypt6_inner")); 379 380 &set_label("${p}6_loop",16); 381 eval"&aes${p} ($inout0,$rndkey1)"; 382 eval"&aes${p} ($inout1,$rndkey1)"; 383 eval"&aes${p} ($inout2,$rndkey1)"; 384 &set_label("_aesni_${p}rypt6_inner"); 385 eval"&aes${p} ($inout3,$rndkey1)"; 386 eval"&aes${p} ($inout4,$rndkey1)"; 387 eval"&aes${p} ($inout5,$rndkey1)"; 388 &set_label("_aesni_${p}rypt6_enter"); 389 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 390 &add ($rounds,32); 391 eval"&aes${p} ($inout0,$rndkey0)"; 392 eval"&aes${p} ($inout1,$rndkey0)"; 393 eval"&aes${p} ($inout2,$rndkey0)"; 394 eval"&aes${p} ($inout3,$rndkey0)"; 395 eval"&aes${p} ($inout4,$rndkey0)"; 396 eval"&aes${p} ($inout5,$rndkey0)"; 397 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 398 &jnz (&label("${p}6_loop")); 399 400 eval"&aes${p} ($inout0,$rndkey1)"; 401 eval"&aes${p} ($inout1,$rndkey1)"; 402 eval"&aes${p} ($inout2,$rndkey1)"; 403 eval"&aes${p} ($inout3,$rndkey1)"; 404 eval"&aes${p} ($inout4,$rndkey1)"; 405 eval"&aes${p} ($inout5,$rndkey1)"; 406 eval"&aes${p}last ($inout0,$rndkey0)"; 407 eval"&aes${p}last ($inout1,$rndkey0)"; 408 eval"&aes${p}last ($inout2,$rndkey0)"; 409 eval"&aes${p}last ($inout3,$rndkey0)"; 410 eval"&aes${p}last ($inout4,$rndkey0)"; 411 eval"&aes${p}last ($inout5,$rndkey0)"; 412 &ret(); 413 &function_end_B("_aesni_${p}rypt6"); 414} 415&aesni_generate2("enc") if ($PREFIX eq "aesni"); 416&aesni_generate2("dec"); 417&aesni_generate3("enc") if ($PREFIX eq "aesni"); 418&aesni_generate3("dec"); 419&aesni_generate4("enc") if ($PREFIX eq "aesni"); 420&aesni_generate4("dec"); 421&aesni_generate6("enc") if ($PREFIX eq "aesni"); 422&aesni_generate6("dec"); 423 424if ($PREFIX eq "aesni") { 425###################################################################### 426# void aesni_ecb_encrypt (const void *in, void *out, 427# size_t length, const AES_KEY *key, 428# int enc); 429&function_begin("aesni_ecb_encrypt"); 430 &mov ($inp,&wparam(0)); 431 &mov ($out,&wparam(1)); 432 &mov ($len,&wparam(2)); 433 &mov ($key,&wparam(3)); 434 &mov ($rounds_,&wparam(4)); 435 &and ($len,-16); 436 &jz (&label("ecb_ret")); 437 &mov ($rounds,&DWP(240,$key)); 438 &test ($rounds_,$rounds_); 439 &jz (&label("ecb_decrypt")); 440 441 &mov ($key_,$key); # backup $key 442 &mov ($rounds_,$rounds); # backup $rounds 443 &cmp ($len,0x60); 444 &jb (&label("ecb_enc_tail")); 445 446 &movdqu ($inout0,&QWP(0,$inp)); 447 &movdqu ($inout1,&QWP(0x10,$inp)); 448 &movdqu ($inout2,&QWP(0x20,$inp)); 449 &movdqu ($inout3,&QWP(0x30,$inp)); 450 &movdqu ($inout4,&QWP(0x40,$inp)); 451 &movdqu ($inout5,&QWP(0x50,$inp)); 452 &lea ($inp,&DWP(0x60,$inp)); 453 &sub ($len,0x60); 454 &jmp (&label("ecb_enc_loop6_enter")); 455 456&set_label("ecb_enc_loop6",16); 457 &movups (&QWP(0,$out),$inout0); 458 &movdqu ($inout0,&QWP(0,$inp)); 459 &movups (&QWP(0x10,$out),$inout1); 460 &movdqu ($inout1,&QWP(0x10,$inp)); 461 &movups (&QWP(0x20,$out),$inout2); 462 &movdqu ($inout2,&QWP(0x20,$inp)); 463 &movups (&QWP(0x30,$out),$inout3); 464 &movdqu ($inout3,&QWP(0x30,$inp)); 465 &movups (&QWP(0x40,$out),$inout4); 466 &movdqu ($inout4,&QWP(0x40,$inp)); 467 &movups (&QWP(0x50,$out),$inout5); 468 &lea ($out,&DWP(0x60,$out)); 469 &movdqu ($inout5,&QWP(0x50,$inp)); 470 &lea ($inp,&DWP(0x60,$inp)); 471&set_label("ecb_enc_loop6_enter"); 472 473 &call ("_aesni_encrypt6"); 474 475 &mov ($key,$key_); # restore $key 476 &mov ($rounds,$rounds_); # restore $rounds 477 &sub ($len,0x60); 478 &jnc (&label("ecb_enc_loop6")); 479 480 &movups (&QWP(0,$out),$inout0); 481 &movups (&QWP(0x10,$out),$inout1); 482 &movups (&QWP(0x20,$out),$inout2); 483 &movups (&QWP(0x30,$out),$inout3); 484 &movups (&QWP(0x40,$out),$inout4); 485 &movups (&QWP(0x50,$out),$inout5); 486 &lea ($out,&DWP(0x60,$out)); 487 &add ($len,0x60); 488 &jz (&label("ecb_ret")); 489 490&set_label("ecb_enc_tail"); 491 &movups ($inout0,&QWP(0,$inp)); 492 &cmp ($len,0x20); 493 &jb (&label("ecb_enc_one")); 494 &movups ($inout1,&QWP(0x10,$inp)); 495 &je (&label("ecb_enc_two")); 496 &movups ($inout2,&QWP(0x20,$inp)); 497 &cmp ($len,0x40); 498 &jb (&label("ecb_enc_three")); 499 &movups ($inout3,&QWP(0x30,$inp)); 500 &je (&label("ecb_enc_four")); 501 &movups ($inout4,&QWP(0x40,$inp)); 502 &xorps ($inout5,$inout5); 503 &call ("_aesni_encrypt6"); 504 &movups (&QWP(0,$out),$inout0); 505 &movups (&QWP(0x10,$out),$inout1); 506 &movups (&QWP(0x20,$out),$inout2); 507 &movups (&QWP(0x30,$out),$inout3); 508 &movups (&QWP(0x40,$out),$inout4); 509 jmp (&label("ecb_ret")); 510 511&set_label("ecb_enc_one",16); 512 if ($inline) 513 { &aesni_inline_generate1("enc"); } 514 else 515 { &call ("_aesni_encrypt1"); } 516 &movups (&QWP(0,$out),$inout0); 517 &jmp (&label("ecb_ret")); 518 519&set_label("ecb_enc_two",16); 520 &call ("_aesni_encrypt2"); 521 &movups (&QWP(0,$out),$inout0); 522 &movups (&QWP(0x10,$out),$inout1); 523 &jmp (&label("ecb_ret")); 524 525&set_label("ecb_enc_three",16); 526 &call ("_aesni_encrypt3"); 527 &movups (&QWP(0,$out),$inout0); 528 &movups (&QWP(0x10,$out),$inout1); 529 &movups (&QWP(0x20,$out),$inout2); 530 &jmp (&label("ecb_ret")); 531 532&set_label("ecb_enc_four",16); 533 &call ("_aesni_encrypt4"); 534 &movups (&QWP(0,$out),$inout0); 535 &movups (&QWP(0x10,$out),$inout1); 536 &movups (&QWP(0x20,$out),$inout2); 537 &movups (&QWP(0x30,$out),$inout3); 538 &jmp (&label("ecb_ret")); 539###################################################################### 540&set_label("ecb_decrypt",16); 541 &mov ($key_,$key); # backup $key 542 &mov ($rounds_,$rounds); # backup $rounds 543 &cmp ($len,0x60); 544 &jb (&label("ecb_dec_tail")); 545 546 &movdqu ($inout0,&QWP(0,$inp)); 547 &movdqu ($inout1,&QWP(0x10,$inp)); 548 &movdqu ($inout2,&QWP(0x20,$inp)); 549 &movdqu ($inout3,&QWP(0x30,$inp)); 550 &movdqu ($inout4,&QWP(0x40,$inp)); 551 &movdqu ($inout5,&QWP(0x50,$inp)); 552 &lea ($inp,&DWP(0x60,$inp)); 553 &sub ($len,0x60); 554 &jmp (&label("ecb_dec_loop6_enter")); 555 556&set_label("ecb_dec_loop6",16); 557 &movups (&QWP(0,$out),$inout0); 558 &movdqu ($inout0,&QWP(0,$inp)); 559 &movups (&QWP(0x10,$out),$inout1); 560 &movdqu ($inout1,&QWP(0x10,$inp)); 561 &movups (&QWP(0x20,$out),$inout2); 562 &movdqu ($inout2,&QWP(0x20,$inp)); 563 &movups (&QWP(0x30,$out),$inout3); 564 &movdqu ($inout3,&QWP(0x30,$inp)); 565 &movups (&QWP(0x40,$out),$inout4); 566 &movdqu ($inout4,&QWP(0x40,$inp)); 567 &movups (&QWP(0x50,$out),$inout5); 568 &lea ($out,&DWP(0x60,$out)); 569 &movdqu ($inout5,&QWP(0x50,$inp)); 570 &lea ($inp,&DWP(0x60,$inp)); 571&set_label("ecb_dec_loop6_enter"); 572 573 &call ("_aesni_decrypt6"); 574 575 &mov ($key,$key_); # restore $key 576 &mov ($rounds,$rounds_); # restore $rounds 577 &sub ($len,0x60); 578 &jnc (&label("ecb_dec_loop6")); 579 580 &movups (&QWP(0,$out),$inout0); 581 &movups (&QWP(0x10,$out),$inout1); 582 &movups (&QWP(0x20,$out),$inout2); 583 &movups (&QWP(0x30,$out),$inout3); 584 &movups (&QWP(0x40,$out),$inout4); 585 &movups (&QWP(0x50,$out),$inout5); 586 &lea ($out,&DWP(0x60,$out)); 587 &add ($len,0x60); 588 &jz (&label("ecb_ret")); 589 590&set_label("ecb_dec_tail"); 591 &movups ($inout0,&QWP(0,$inp)); 592 &cmp ($len,0x20); 593 &jb (&label("ecb_dec_one")); 594 &movups ($inout1,&QWP(0x10,$inp)); 595 &je (&label("ecb_dec_two")); 596 &movups ($inout2,&QWP(0x20,$inp)); 597 &cmp ($len,0x40); 598 &jb (&label("ecb_dec_three")); 599 &movups ($inout3,&QWP(0x30,$inp)); 600 &je (&label("ecb_dec_four")); 601 &movups ($inout4,&QWP(0x40,$inp)); 602 &xorps ($inout5,$inout5); 603 &call ("_aesni_decrypt6"); 604 &movups (&QWP(0,$out),$inout0); 605 &movups (&QWP(0x10,$out),$inout1); 606 &movups (&QWP(0x20,$out),$inout2); 607 &movups (&QWP(0x30,$out),$inout3); 608 &movups (&QWP(0x40,$out),$inout4); 609 &jmp (&label("ecb_ret")); 610 611&set_label("ecb_dec_one",16); 612 if ($inline) 613 { &aesni_inline_generate1("dec"); } 614 else 615 { &call ("_aesni_decrypt1"); } 616 &movups (&QWP(0,$out),$inout0); 617 &jmp (&label("ecb_ret")); 618 619&set_label("ecb_dec_two",16); 620 &call ("_aesni_decrypt2"); 621 &movups (&QWP(0,$out),$inout0); 622 &movups (&QWP(0x10,$out),$inout1); 623 &jmp (&label("ecb_ret")); 624 625&set_label("ecb_dec_three",16); 626 &call ("_aesni_decrypt3"); 627 &movups (&QWP(0,$out),$inout0); 628 &movups (&QWP(0x10,$out),$inout1); 629 &movups (&QWP(0x20,$out),$inout2); 630 &jmp (&label("ecb_ret")); 631 632&set_label("ecb_dec_four",16); 633 &call ("_aesni_decrypt4"); 634 &movups (&QWP(0,$out),$inout0); 635 &movups (&QWP(0x10,$out),$inout1); 636 &movups (&QWP(0x20,$out),$inout2); 637 &movups (&QWP(0x30,$out),$inout3); 638 639&set_label("ecb_ret"); 640 &pxor ("xmm0","xmm0"); # clear register bank 641 &pxor ("xmm1","xmm1"); 642 &pxor ("xmm2","xmm2"); 643 &pxor ("xmm3","xmm3"); 644 &pxor ("xmm4","xmm4"); 645 &pxor ("xmm5","xmm5"); 646 &pxor ("xmm6","xmm6"); 647 &pxor ("xmm7","xmm7"); 648&function_end("aesni_ecb_encrypt"); 649 650###################################################################### 651# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 652# size_t blocks, const AES_KEY *key, 653# const char *ivec,char *cmac); 654# 655# Handles only complete blocks, operates on 64-bit counter and 656# does not update *ivec! Nor does it finalize CMAC value 657# (see engine/eng_aesni.c for details) 658# 659{ my $cmac=$inout1; 660&function_begin("aesni_ccm64_encrypt_blocks"); 661 &mov ($inp,&wparam(0)); 662 &mov ($out,&wparam(1)); 663 &mov ($len,&wparam(2)); 664 &mov ($key,&wparam(3)); 665 &mov ($rounds_,&wparam(4)); 666 &mov ($rounds,&wparam(5)); 667 &mov ($key_,"esp"); 668 &sub ("esp",60); 669 &and ("esp",-16); # align stack 670 &mov (&DWP(48,"esp"),$key_); 671 672 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 673 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 674 &mov ($rounds,&DWP(240,$key)); 675 676 # compose byte-swap control mask for pshufb on stack 677 &mov (&DWP(0,"esp"),0x0c0d0e0f); 678 &mov (&DWP(4,"esp"),0x08090a0b); 679 &mov (&DWP(8,"esp"),0x04050607); 680 &mov (&DWP(12,"esp"),0x00010203); 681 682 # compose counter increment vector on stack 683 &mov ($rounds_,1); 684 &xor ($key_,$key_); 685 &mov (&DWP(16,"esp"),$rounds_); 686 &mov (&DWP(20,"esp"),$key_); 687 &mov (&DWP(24,"esp"),$key_); 688 &mov (&DWP(28,"esp"),$key_); 689 690 &shl ($rounds,4); 691 &mov ($rounds_,16); 692 &lea ($key_,&DWP(0,$key)); 693 &movdqa ($inout3,&QWP(0,"esp")); 694 &movdqa ($inout0,$ivec); 695 &lea ($key,&DWP(32,$key,$rounds)); 696 &sub ($rounds_,$rounds); 697 &pshufb ($ivec,$inout3); 698 699&set_label("ccm64_enc_outer"); 700 &$movekey ($rndkey0,&QWP(0,$key_)); 701 &mov ($rounds,$rounds_); 702 &movups ($in0,&QWP(0,$inp)); 703 704 &xorps ($inout0,$rndkey0); 705 &$movekey ($rndkey1,&QWP(16,$key_)); 706 &xorps ($rndkey0,$in0); 707 &xorps ($cmac,$rndkey0); # cmac^=inp 708 &$movekey ($rndkey0,&QWP(32,$key_)); 709 710&set_label("ccm64_enc2_loop"); 711 &aesenc ($inout0,$rndkey1); 712 &aesenc ($cmac,$rndkey1); 713 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 714 &add ($rounds,32); 715 &aesenc ($inout0,$rndkey0); 716 &aesenc ($cmac,$rndkey0); 717 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 718 &jnz (&label("ccm64_enc2_loop")); 719 &aesenc ($inout0,$rndkey1); 720 &aesenc ($cmac,$rndkey1); 721 &paddq ($ivec,&QWP(16,"esp")); 722 &dec ($len); 723 &aesenclast ($inout0,$rndkey0); 724 &aesenclast ($cmac,$rndkey0); 725 726 &lea ($inp,&DWP(16,$inp)); 727 &xorps ($in0,$inout0); # inp^=E(ivec) 728 &movdqa ($inout0,$ivec); 729 &movups (&QWP(0,$out),$in0); # save output 730 &pshufb ($inout0,$inout3); 731 &lea ($out,&DWP(16,$out)); 732 &jnz (&label("ccm64_enc_outer")); 733 734 &mov ("esp",&DWP(48,"esp")); 735 &mov ($out,&wparam(5)); 736 &movups (&QWP(0,$out),$cmac); 737 738 &pxor ("xmm0","xmm0"); # clear register bank 739 &pxor ("xmm1","xmm1"); 740 &pxor ("xmm2","xmm2"); 741 &pxor ("xmm3","xmm3"); 742 &pxor ("xmm4","xmm4"); 743 &pxor ("xmm5","xmm5"); 744 &pxor ("xmm6","xmm6"); 745 &pxor ("xmm7","xmm7"); 746&function_end("aesni_ccm64_encrypt_blocks"); 747 748&function_begin("aesni_ccm64_decrypt_blocks"); 749 &mov ($inp,&wparam(0)); 750 &mov ($out,&wparam(1)); 751 &mov ($len,&wparam(2)); 752 &mov ($key,&wparam(3)); 753 &mov ($rounds_,&wparam(4)); 754 &mov ($rounds,&wparam(5)); 755 &mov ($key_,"esp"); 756 &sub ("esp",60); 757 &and ("esp",-16); # align stack 758 &mov (&DWP(48,"esp"),$key_); 759 760 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 761 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 762 &mov ($rounds,&DWP(240,$key)); 763 764 # compose byte-swap control mask for pshufb on stack 765 &mov (&DWP(0,"esp"),0x0c0d0e0f); 766 &mov (&DWP(4,"esp"),0x08090a0b); 767 &mov (&DWP(8,"esp"),0x04050607); 768 &mov (&DWP(12,"esp"),0x00010203); 769 770 # compose counter increment vector on stack 771 &mov ($rounds_,1); 772 &xor ($key_,$key_); 773 &mov (&DWP(16,"esp"),$rounds_); 774 &mov (&DWP(20,"esp"),$key_); 775 &mov (&DWP(24,"esp"),$key_); 776 &mov (&DWP(28,"esp"),$key_); 777 778 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 779 &movdqa ($inout0,$ivec); 780 781 &mov ($key_,$key); 782 &mov ($rounds_,$rounds); 783 784 &pshufb ($ivec,$inout3); 785 if ($inline) 786 { &aesni_inline_generate1("enc"); } 787 else 788 { &call ("_aesni_encrypt1"); } 789 &shl ($rounds_,4); 790 &mov ($rounds,16); 791 &movups ($in0,&QWP(0,$inp)); # load inp 792 &paddq ($ivec,&QWP(16,"esp")); 793 &lea ($inp,&QWP(16,$inp)); 794 &sub ($rounds,$rounds_); 795 &lea ($key,&DWP(32,$key_,$rounds_)); 796 &mov ($rounds_,$rounds); 797 &jmp (&label("ccm64_dec_outer")); 798 799&set_label("ccm64_dec_outer",16); 800 &xorps ($in0,$inout0); # inp ^= E(ivec) 801 &movdqa ($inout0,$ivec); 802 &movups (&QWP(0,$out),$in0); # save output 803 &lea ($out,&DWP(16,$out)); 804 &pshufb ($inout0,$inout3); 805 806 &sub ($len,1); 807 &jz (&label("ccm64_dec_break")); 808 809 &$movekey ($rndkey0,&QWP(0,$key_)); 810 &mov ($rounds,$rounds_); 811 &$movekey ($rndkey1,&QWP(16,$key_)); 812 &xorps ($in0,$rndkey0); 813 &xorps ($inout0,$rndkey0); 814 &xorps ($cmac,$in0); # cmac^=out 815 &$movekey ($rndkey0,&QWP(32,$key_)); 816 817&set_label("ccm64_dec2_loop"); 818 &aesenc ($inout0,$rndkey1); 819 &aesenc ($cmac,$rndkey1); 820 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 821 &add ($rounds,32); 822 &aesenc ($inout0,$rndkey0); 823 &aesenc ($cmac,$rndkey0); 824 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 825 &jnz (&label("ccm64_dec2_loop")); 826 &movups ($in0,&QWP(0,$inp)); # load inp 827 &paddq ($ivec,&QWP(16,"esp")); 828 &aesenc ($inout0,$rndkey1); 829 &aesenc ($cmac,$rndkey1); 830 &aesenclast ($inout0,$rndkey0); 831 &aesenclast ($cmac,$rndkey0); 832 &lea ($inp,&QWP(16,$inp)); 833 &jmp (&label("ccm64_dec_outer")); 834 835&set_label("ccm64_dec_break",16); 836 &mov ($rounds,&DWP(240,$key_)); 837 &mov ($key,$key_); 838 if ($inline) 839 { &aesni_inline_generate1("enc",$cmac,$in0); } 840 else 841 { &call ("_aesni_encrypt1",$cmac); } 842 843 &mov ("esp",&DWP(48,"esp")); 844 &mov ($out,&wparam(5)); 845 &movups (&QWP(0,$out),$cmac); 846 847 &pxor ("xmm0","xmm0"); # clear register bank 848 &pxor ("xmm1","xmm1"); 849 &pxor ("xmm2","xmm2"); 850 &pxor ("xmm3","xmm3"); 851 &pxor ("xmm4","xmm4"); 852 &pxor ("xmm5","xmm5"); 853 &pxor ("xmm6","xmm6"); 854 &pxor ("xmm7","xmm7"); 855&function_end("aesni_ccm64_decrypt_blocks"); 856} 857 858###################################################################### 859# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 860# size_t blocks, const AES_KEY *key, 861# const char *ivec); 862# 863# Handles only complete blocks, operates on 32-bit counter and 864# does not update *ivec! (see crypto/modes/ctr128.c for details) 865# 866# stack layout: 867# 0 pshufb mask 868# 16 vector addend: 0,6,6,6 869# 32 counter-less ivec 870# 48 1st triplet of counter vector 871# 64 2nd triplet of counter vector 872# 80 saved %esp 873 874&function_begin("aesni_ctr32_encrypt_blocks"); 875 &mov ($inp,&wparam(0)); 876 &mov ($out,&wparam(1)); 877 &mov ($len,&wparam(2)); 878 &mov ($key,&wparam(3)); 879 &mov ($rounds_,&wparam(4)); 880 &mov ($key_,"esp"); 881 &sub ("esp",88); 882 &and ("esp",-16); # align stack 883 &mov (&DWP(80,"esp"),$key_); 884 885 &cmp ($len,1); 886 &je (&label("ctr32_one_shortcut")); 887 888 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 889 890 # compose byte-swap control mask for pshufb on stack 891 &mov (&DWP(0,"esp"),0x0c0d0e0f); 892 &mov (&DWP(4,"esp"),0x08090a0b); 893 &mov (&DWP(8,"esp"),0x04050607); 894 &mov (&DWP(12,"esp"),0x00010203); 895 896 # compose counter increment vector on stack 897 &mov ($rounds,6); 898 &xor ($key_,$key_); 899 &mov (&DWP(16,"esp"),$rounds); 900 &mov (&DWP(20,"esp"),$rounds); 901 &mov (&DWP(24,"esp"),$rounds); 902 &mov (&DWP(28,"esp"),$key_); 903 904 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 905 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 906 907 &mov ($rounds,&DWP(240,$key)); # key->rounds 908 909 # compose 2 vectors of 3x32-bit counters 910 &bswap ($rounds_); 911 &pxor ($rndkey0,$rndkey0); 912 &pxor ($rndkey1,$rndkey1); 913 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 914 &pinsrd ($rndkey0,$rounds_,0); 915 &lea ($key_,&DWP(3,$rounds_)); 916 &pinsrd ($rndkey1,$key_,0); 917 &inc ($rounds_); 918 &pinsrd ($rndkey0,$rounds_,1); 919 &inc ($key_); 920 &pinsrd ($rndkey1,$key_,1); 921 &inc ($rounds_); 922 &pinsrd ($rndkey0,$rounds_,2); 923 &inc ($key_); 924 &pinsrd ($rndkey1,$key_,2); 925 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 926 &pshufb ($rndkey0,$inout0); # byte swap 927 &movdqu ($inout4,&QWP(0,$key)); # key[0] 928 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 929 &pshufb ($rndkey1,$inout0); # byte swap 930 931 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword 932 &pshufd ($inout1,$rndkey0,2<<6); 933 &cmp ($len,6); 934 &jb (&label("ctr32_tail")); 935 &pxor ($inout5,$inout4); # counter-less ivec^key[0] 936 &shl ($rounds,4); 937 &mov ($rounds_,16); 938 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] 939 &mov ($key_,$key); # backup $key 940 &sub ($rounds_,$rounds); # backup twisted $rounds 941 &lea ($key,&DWP(32,$key,$rounds)); 942 &sub ($len,6); 943 &jmp (&label("ctr32_loop6")); 944 945&set_label("ctr32_loop6",16); 946 # inlining _aesni_encrypt6's prologue gives ~6% improvement... 947 &pshufd ($inout2,$rndkey0,1<<6); 948 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec 949 &pshufd ($inout3,$rndkey1,3<<6); 950 &pxor ($inout0,$rndkey0); # merge counter-less ivec 951 &pshufd ($inout4,$rndkey1,2<<6); 952 &pxor ($inout1,$rndkey0); 953 &pshufd ($inout5,$rndkey1,1<<6); 954 &$movekey ($rndkey1,&QWP(16,$key_)); 955 &pxor ($inout2,$rndkey0); 956 &pxor ($inout3,$rndkey0); 957 &aesenc ($inout0,$rndkey1); 958 &pxor ($inout4,$rndkey0); 959 &pxor ($inout5,$rndkey0); 960 &aesenc ($inout1,$rndkey1); 961 &$movekey ($rndkey0,&QWP(32,$key_)); 962 &mov ($rounds,$rounds_); 963 &aesenc ($inout2,$rndkey1); 964 &aesenc ($inout3,$rndkey1); 965 &aesenc ($inout4,$rndkey1); 966 &aesenc ($inout5,$rndkey1); 967 968 &call (&label("_aesni_encrypt6_enter")); 969 970 &movups ($rndkey1,&QWP(0,$inp)); 971 &movups ($rndkey0,&QWP(0x10,$inp)); 972 &xorps ($inout0,$rndkey1); 973 &movups ($rndkey1,&QWP(0x20,$inp)); 974 &xorps ($inout1,$rndkey0); 975 &movups (&QWP(0,$out),$inout0); 976 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 977 &xorps ($inout2,$rndkey1); 978 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet 979 &movups (&QWP(0x10,$out),$inout1); 980 &movups (&QWP(0x20,$out),$inout2); 981 982 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment 983 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment 984 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 985 986 &movups ($inout1,&QWP(0x30,$inp)); 987 &movups ($inout2,&QWP(0x40,$inp)); 988 &xorps ($inout3,$inout1); 989 &movups ($inout1,&QWP(0x50,$inp)); 990 &lea ($inp,&DWP(0x60,$inp)); 991 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 992 &pshufb ($rndkey0,$inout0); # byte swap 993 &xorps ($inout4,$inout2); 994 &movups (&QWP(0x30,$out),$inout3); 995 &xorps ($inout5,$inout1); 996 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 997 &pshufb ($rndkey1,$inout0); # byte swap 998 &movups (&QWP(0x40,$out),$inout4); 999 &pshufd ($inout0,$rndkey0,3<<6); 1000 &movups (&QWP(0x50,$out),$inout5); 1001 &lea ($out,&DWP(0x60,$out)); 1002 1003 &pshufd ($inout1,$rndkey0,2<<6); 1004 &sub ($len,6); 1005 &jnc (&label("ctr32_loop6")); 1006 1007 &add ($len,6); 1008 &jz (&label("ctr32_ret")); 1009 &movdqu ($inout5,&QWP(0,$key_)); 1010 &mov ($key,$key_); 1011 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec 1012 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1013 1014&set_label("ctr32_tail"); 1015 &por ($inout0,$inout5); 1016 &cmp ($len,2); 1017 &jb (&label("ctr32_one")); 1018 1019 &pshufd ($inout2,$rndkey0,1<<6); 1020 &por ($inout1,$inout5); 1021 &je (&label("ctr32_two")); 1022 1023 &pshufd ($inout3,$rndkey1,3<<6); 1024 &por ($inout2,$inout5); 1025 &cmp ($len,4); 1026 &jb (&label("ctr32_three")); 1027 1028 &pshufd ($inout4,$rndkey1,2<<6); 1029 &por ($inout3,$inout5); 1030 &je (&label("ctr32_four")); 1031 1032 &por ($inout4,$inout5); 1033 &call ("_aesni_encrypt6"); 1034 &movups ($rndkey1,&QWP(0,$inp)); 1035 &movups ($rndkey0,&QWP(0x10,$inp)); 1036 &xorps ($inout0,$rndkey1); 1037 &movups ($rndkey1,&QWP(0x20,$inp)); 1038 &xorps ($inout1,$rndkey0); 1039 &movups ($rndkey0,&QWP(0x30,$inp)); 1040 &xorps ($inout2,$rndkey1); 1041 &movups ($rndkey1,&QWP(0x40,$inp)); 1042 &xorps ($inout3,$rndkey0); 1043 &movups (&QWP(0,$out),$inout0); 1044 &xorps ($inout4,$rndkey1); 1045 &movups (&QWP(0x10,$out),$inout1); 1046 &movups (&QWP(0x20,$out),$inout2); 1047 &movups (&QWP(0x30,$out),$inout3); 1048 &movups (&QWP(0x40,$out),$inout4); 1049 &jmp (&label("ctr32_ret")); 1050 1051&set_label("ctr32_one_shortcut",16); 1052 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 1053 &mov ($rounds,&DWP(240,$key)); 1054 1055&set_label("ctr32_one"); 1056 if ($inline) 1057 { &aesni_inline_generate1("enc"); } 1058 else 1059 { &call ("_aesni_encrypt1"); } 1060 &movups ($in0,&QWP(0,$inp)); 1061 &xorps ($in0,$inout0); 1062 &movups (&QWP(0,$out),$in0); 1063 &jmp (&label("ctr32_ret")); 1064 1065&set_label("ctr32_two",16); 1066 &call ("_aesni_encrypt2"); 1067 &movups ($inout3,&QWP(0,$inp)); 1068 &movups ($inout4,&QWP(0x10,$inp)); 1069 &xorps ($inout0,$inout3); 1070 &xorps ($inout1,$inout4); 1071 &movups (&QWP(0,$out),$inout0); 1072 &movups (&QWP(0x10,$out),$inout1); 1073 &jmp (&label("ctr32_ret")); 1074 1075&set_label("ctr32_three",16); 1076 &call ("_aesni_encrypt3"); 1077 &movups ($inout3,&QWP(0,$inp)); 1078 &movups ($inout4,&QWP(0x10,$inp)); 1079 &xorps ($inout0,$inout3); 1080 &movups ($inout5,&QWP(0x20,$inp)); 1081 &xorps ($inout1,$inout4); 1082 &movups (&QWP(0,$out),$inout0); 1083 &xorps ($inout2,$inout5); 1084 &movups (&QWP(0x10,$out),$inout1); 1085 &movups (&QWP(0x20,$out),$inout2); 1086 &jmp (&label("ctr32_ret")); 1087 1088&set_label("ctr32_four",16); 1089 &call ("_aesni_encrypt4"); 1090 &movups ($inout4,&QWP(0,$inp)); 1091 &movups ($inout5,&QWP(0x10,$inp)); 1092 &movups ($rndkey1,&QWP(0x20,$inp)); 1093 &xorps ($inout0,$inout4); 1094 &movups ($rndkey0,&QWP(0x30,$inp)); 1095 &xorps ($inout1,$inout5); 1096 &movups (&QWP(0,$out),$inout0); 1097 &xorps ($inout2,$rndkey1); 1098 &movups (&QWP(0x10,$out),$inout1); 1099 &xorps ($inout3,$rndkey0); 1100 &movups (&QWP(0x20,$out),$inout2); 1101 &movups (&QWP(0x30,$out),$inout3); 1102 1103&set_label("ctr32_ret"); 1104 &pxor ("xmm0","xmm0"); # clear register bank 1105 &pxor ("xmm1","xmm1"); 1106 &pxor ("xmm2","xmm2"); 1107 &pxor ("xmm3","xmm3"); 1108 &pxor ("xmm4","xmm4"); 1109 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack 1110 &pxor ("xmm5","xmm5"); 1111 &movdqa (&QWP(48,"esp"),"xmm0"); 1112 &pxor ("xmm6","xmm6"); 1113 &movdqa (&QWP(64,"esp"),"xmm0"); 1114 &pxor ("xmm7","xmm7"); 1115 &mov ("esp",&DWP(80,"esp")); 1116&function_end("aesni_ctr32_encrypt_blocks"); 1117 1118###################################################################### 1119# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1120# const AES_KEY *key1, const AES_KEY *key2 1121# const unsigned char iv[16]); 1122# 1123{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1124 1125&function_begin("aesni_xts_encrypt"); 1126 &mov ($key,&wparam(4)); # key2 1127 &mov ($inp,&wparam(5)); # clear-text tweak 1128 1129 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1130 &movups ($inout0,&QWP(0,$inp)); 1131 if ($inline) 1132 { &aesni_inline_generate1("enc"); } 1133 else 1134 { &call ("_aesni_encrypt1"); } 1135 1136 &mov ($inp,&wparam(0)); 1137 &mov ($out,&wparam(1)); 1138 &mov ($len,&wparam(2)); 1139 &mov ($key,&wparam(3)); # key1 1140 1141 &mov ($key_,"esp"); 1142 &sub ("esp",16*7+8); 1143 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1144 &and ("esp",-16); # align stack 1145 1146 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1147 &mov (&DWP(16*6+4,"esp"),0); 1148 &mov (&DWP(16*6+8,"esp"),1); 1149 &mov (&DWP(16*6+12,"esp"),0); 1150 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1151 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1152 1153 &movdqa ($tweak,$inout0); 1154 &pxor ($twtmp,$twtmp); 1155 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1156 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1157 1158 &and ($len,-16); 1159 &mov ($key_,$key); # backup $key 1160 &mov ($rounds_,$rounds); # backup $rounds 1161 &sub ($len,16*6); 1162 &jc (&label("xts_enc_short")); 1163 1164 &shl ($rounds,4); 1165 &mov ($rounds_,16); 1166 &sub ($rounds_,$rounds); 1167 &lea ($key,&DWP(32,$key,$rounds)); 1168 &jmp (&label("xts_enc_loop6")); 1169 1170&set_label("xts_enc_loop6",16); 1171 for ($i=0;$i<4;$i++) { 1172 &pshufd ($twres,$twtmp,0x13); 1173 &pxor ($twtmp,$twtmp); 1174 &movdqa (&QWP(16*$i,"esp"),$tweak); 1175 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1176 &pand ($twres,$twmask); # isolate carry and residue 1177 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1178 &pxor ($tweak,$twres); 1179 } 1180 &pshufd ($inout5,$twtmp,0x13); 1181 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1182 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1183 &$movekey ($rndkey0,&QWP(0,$key_)); 1184 &pand ($inout5,$twmask); # isolate carry and residue 1185 &movups ($inout0,&QWP(0,$inp)); # load input 1186 &pxor ($inout5,$tweak); 1187 1188 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1189 &mov ($rounds,$rounds_); # restore $rounds 1190 &movdqu ($inout1,&QWP(16*1,$inp)); 1191 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1192 &movdqu ($inout2,&QWP(16*2,$inp)); 1193 &pxor ($inout1,$rndkey0); 1194 &movdqu ($inout3,&QWP(16*3,$inp)); 1195 &pxor ($inout2,$rndkey0); 1196 &movdqu ($inout4,&QWP(16*4,$inp)); 1197 &pxor ($inout3,$rndkey0); 1198 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1199 &pxor ($inout4,$rndkey0); 1200 &lea ($inp,&DWP(16*6,$inp)); 1201 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1202 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1203 &pxor ($inout5,$rndkey1); 1204 1205 &$movekey ($rndkey1,&QWP(16,$key_)); 1206 &pxor ($inout1,&QWP(16*1,"esp")); 1207 &pxor ($inout2,&QWP(16*2,"esp")); 1208 &aesenc ($inout0,$rndkey1); 1209 &pxor ($inout3,&QWP(16*3,"esp")); 1210 &pxor ($inout4,&QWP(16*4,"esp")); 1211 &aesenc ($inout1,$rndkey1); 1212 &pxor ($inout5,$rndkey0); 1213 &$movekey ($rndkey0,&QWP(32,$key_)); 1214 &aesenc ($inout2,$rndkey1); 1215 &aesenc ($inout3,$rndkey1); 1216 &aesenc ($inout4,$rndkey1); 1217 &aesenc ($inout5,$rndkey1); 1218 &call (&label("_aesni_encrypt6_enter")); 1219 1220 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1221 &pxor ($twtmp,$twtmp); 1222 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1223 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1224 &xorps ($inout1,&QWP(16*1,"esp")); 1225 &movups (&QWP(16*0,$out),$inout0); # write output 1226 &xorps ($inout2,&QWP(16*2,"esp")); 1227 &movups (&QWP(16*1,$out),$inout1); 1228 &xorps ($inout3,&QWP(16*3,"esp")); 1229 &movups (&QWP(16*2,$out),$inout2); 1230 &xorps ($inout4,&QWP(16*4,"esp")); 1231 &movups (&QWP(16*3,$out),$inout3); 1232 &xorps ($inout5,$tweak); 1233 &movups (&QWP(16*4,$out),$inout4); 1234 &pshufd ($twres,$twtmp,0x13); 1235 &movups (&QWP(16*5,$out),$inout5); 1236 &lea ($out,&DWP(16*6,$out)); 1237 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1238 1239 &pxor ($twtmp,$twtmp); 1240 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1241 &pand ($twres,$twmask); # isolate carry and residue 1242 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1243 &pxor ($tweak,$twres); 1244 1245 &sub ($len,16*6); 1246 &jnc (&label("xts_enc_loop6")); 1247 1248 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1249 &mov ($key,$key_); # restore $key 1250 &mov ($rounds_,$rounds); 1251 1252&set_label("xts_enc_short"); 1253 &add ($len,16*6); 1254 &jz (&label("xts_enc_done6x")); 1255 1256 &movdqa ($inout3,$tweak); # put aside previous tweak 1257 &cmp ($len,0x20); 1258 &jb (&label("xts_enc_one")); 1259 1260 &pshufd ($twres,$twtmp,0x13); 1261 &pxor ($twtmp,$twtmp); 1262 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1263 &pand ($twres,$twmask); # isolate carry and residue 1264 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1265 &pxor ($tweak,$twres); 1266 &je (&label("xts_enc_two")); 1267 1268 &pshufd ($twres,$twtmp,0x13); 1269 &pxor ($twtmp,$twtmp); 1270 &movdqa ($inout4,$tweak); # put aside previous tweak 1271 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1272 &pand ($twres,$twmask); # isolate carry and residue 1273 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1274 &pxor ($tweak,$twres); 1275 &cmp ($len,0x40); 1276 &jb (&label("xts_enc_three")); 1277 1278 &pshufd ($twres,$twtmp,0x13); 1279 &pxor ($twtmp,$twtmp); 1280 &movdqa ($inout5,$tweak); # put aside previous tweak 1281 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1282 &pand ($twres,$twmask); # isolate carry and residue 1283 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1284 &pxor ($tweak,$twres); 1285 &movdqa (&QWP(16*0,"esp"),$inout3); 1286 &movdqa (&QWP(16*1,"esp"),$inout4); 1287 &je (&label("xts_enc_four")); 1288 1289 &movdqa (&QWP(16*2,"esp"),$inout5); 1290 &pshufd ($inout5,$twtmp,0x13); 1291 &movdqa (&QWP(16*3,"esp"),$tweak); 1292 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1293 &pand ($inout5,$twmask); # isolate carry and residue 1294 &pxor ($inout5,$tweak); 1295 1296 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1297 &movdqu ($inout1,&QWP(16*1,$inp)); 1298 &movdqu ($inout2,&QWP(16*2,$inp)); 1299 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1300 &movdqu ($inout3,&QWP(16*3,$inp)); 1301 &pxor ($inout1,&QWP(16*1,"esp")); 1302 &movdqu ($inout4,&QWP(16*4,$inp)); 1303 &pxor ($inout2,&QWP(16*2,"esp")); 1304 &lea ($inp,&DWP(16*5,$inp)); 1305 &pxor ($inout3,&QWP(16*3,"esp")); 1306 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1307 &pxor ($inout4,$inout5); 1308 1309 &call ("_aesni_encrypt6"); 1310 1311 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1312 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1313 &xorps ($inout1,&QWP(16*1,"esp")); 1314 &xorps ($inout2,&QWP(16*2,"esp")); 1315 &movups (&QWP(16*0,$out),$inout0); # write output 1316 &xorps ($inout3,&QWP(16*3,"esp")); 1317 &movups (&QWP(16*1,$out),$inout1); 1318 &xorps ($inout4,$tweak); 1319 &movups (&QWP(16*2,$out),$inout2); 1320 &movups (&QWP(16*3,$out),$inout3); 1321 &movups (&QWP(16*4,$out),$inout4); 1322 &lea ($out,&DWP(16*5,$out)); 1323 &jmp (&label("xts_enc_done")); 1324 1325&set_label("xts_enc_one",16); 1326 &movups ($inout0,&QWP(16*0,$inp)); # load input 1327 &lea ($inp,&DWP(16*1,$inp)); 1328 &xorps ($inout0,$inout3); # input^=tweak 1329 if ($inline) 1330 { &aesni_inline_generate1("enc"); } 1331 else 1332 { &call ("_aesni_encrypt1"); } 1333 &xorps ($inout0,$inout3); # output^=tweak 1334 &movups (&QWP(16*0,$out),$inout0); # write output 1335 &lea ($out,&DWP(16*1,$out)); 1336 1337 &movdqa ($tweak,$inout3); # last tweak 1338 &jmp (&label("xts_enc_done")); 1339 1340&set_label("xts_enc_two",16); 1341 &movaps ($inout4,$tweak); # put aside last tweak 1342 1343 &movups ($inout0,&QWP(16*0,$inp)); # load input 1344 &movups ($inout1,&QWP(16*1,$inp)); 1345 &lea ($inp,&DWP(16*2,$inp)); 1346 &xorps ($inout0,$inout3); # input^=tweak 1347 &xorps ($inout1,$inout4); 1348 1349 &call ("_aesni_encrypt2"); 1350 1351 &xorps ($inout0,$inout3); # output^=tweak 1352 &xorps ($inout1,$inout4); 1353 &movups (&QWP(16*0,$out),$inout0); # write output 1354 &movups (&QWP(16*1,$out),$inout1); 1355 &lea ($out,&DWP(16*2,$out)); 1356 1357 &movdqa ($tweak,$inout4); # last tweak 1358 &jmp (&label("xts_enc_done")); 1359 1360&set_label("xts_enc_three",16); 1361 &movaps ($inout5,$tweak); # put aside last tweak 1362 &movups ($inout0,&QWP(16*0,$inp)); # load input 1363 &movups ($inout1,&QWP(16*1,$inp)); 1364 &movups ($inout2,&QWP(16*2,$inp)); 1365 &lea ($inp,&DWP(16*3,$inp)); 1366 &xorps ($inout0,$inout3); # input^=tweak 1367 &xorps ($inout1,$inout4); 1368 &xorps ($inout2,$inout5); 1369 1370 &call ("_aesni_encrypt3"); 1371 1372 &xorps ($inout0,$inout3); # output^=tweak 1373 &xorps ($inout1,$inout4); 1374 &xorps ($inout2,$inout5); 1375 &movups (&QWP(16*0,$out),$inout0); # write output 1376 &movups (&QWP(16*1,$out),$inout1); 1377 &movups (&QWP(16*2,$out),$inout2); 1378 &lea ($out,&DWP(16*3,$out)); 1379 1380 &movdqa ($tweak,$inout5); # last tweak 1381 &jmp (&label("xts_enc_done")); 1382 1383&set_label("xts_enc_four",16); 1384 &movaps ($inout4,$tweak); # put aside last tweak 1385 1386 &movups ($inout0,&QWP(16*0,$inp)); # load input 1387 &movups ($inout1,&QWP(16*1,$inp)); 1388 &movups ($inout2,&QWP(16*2,$inp)); 1389 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1390 &movups ($inout3,&QWP(16*3,$inp)); 1391 &lea ($inp,&DWP(16*4,$inp)); 1392 &xorps ($inout1,&QWP(16*1,"esp")); 1393 &xorps ($inout2,$inout5); 1394 &xorps ($inout3,$inout4); 1395 1396 &call ("_aesni_encrypt4"); 1397 1398 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1399 &xorps ($inout1,&QWP(16*1,"esp")); 1400 &xorps ($inout2,$inout5); 1401 &movups (&QWP(16*0,$out),$inout0); # write output 1402 &xorps ($inout3,$inout4); 1403 &movups (&QWP(16*1,$out),$inout1); 1404 &movups (&QWP(16*2,$out),$inout2); 1405 &movups (&QWP(16*3,$out),$inout3); 1406 &lea ($out,&DWP(16*4,$out)); 1407 1408 &movdqa ($tweak,$inout4); # last tweak 1409 &jmp (&label("xts_enc_done")); 1410 1411&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1412 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1413 &and ($len,15); 1414 &jz (&label("xts_enc_ret")); 1415 &movdqa ($inout3,$tweak); 1416 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1417 &jmp (&label("xts_enc_steal")); 1418 1419&set_label("xts_enc_done",16); 1420 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1421 &pxor ($twtmp,$twtmp); 1422 &and ($len,15); 1423 &jz (&label("xts_enc_ret")); 1424 1425 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1426 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1427 &pshufd ($inout3,$twtmp,0x13); 1428 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1429 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1430 &pxor ($inout3,$tweak); 1431 1432&set_label("xts_enc_steal"); 1433 &movz ($rounds,&BP(0,$inp)); 1434 &movz ($key,&BP(-16,$out)); 1435 &lea ($inp,&DWP(1,$inp)); 1436 &mov (&BP(-16,$out),&LB($rounds)); 1437 &mov (&BP(0,$out),&LB($key)); 1438 &lea ($out,&DWP(1,$out)); 1439 &sub ($len,1); 1440 &jnz (&label("xts_enc_steal")); 1441 1442 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1443 &mov ($key,$key_); # restore $key 1444 &mov ($rounds,$rounds_); # restore $rounds 1445 1446 &movups ($inout0,&QWP(-16,$out)); # load input 1447 &xorps ($inout0,$inout3); # input^=tweak 1448 if ($inline) 1449 { &aesni_inline_generate1("enc"); } 1450 else 1451 { &call ("_aesni_encrypt1"); } 1452 &xorps ($inout0,$inout3); # output^=tweak 1453 &movups (&QWP(-16,$out),$inout0); # write output 1454 1455&set_label("xts_enc_ret"); 1456 &pxor ("xmm0","xmm0"); # clear register bank 1457 &pxor ("xmm1","xmm1"); 1458 &pxor ("xmm2","xmm2"); 1459 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1460 &pxor ("xmm3","xmm3"); 1461 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1462 &pxor ("xmm4","xmm4"); 1463 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1464 &pxor ("xmm5","xmm5"); 1465 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1466 &pxor ("xmm6","xmm6"); 1467 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1468 &pxor ("xmm7","xmm7"); 1469 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1470 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1471&function_end("aesni_xts_encrypt"); 1472 1473&function_begin("aesni_xts_decrypt"); 1474 &mov ($key,&wparam(4)); # key2 1475 &mov ($inp,&wparam(5)); # clear-text tweak 1476 1477 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1478 &movups ($inout0,&QWP(0,$inp)); 1479 if ($inline) 1480 { &aesni_inline_generate1("enc"); } 1481 else 1482 { &call ("_aesni_encrypt1"); } 1483 1484 &mov ($inp,&wparam(0)); 1485 &mov ($out,&wparam(1)); 1486 &mov ($len,&wparam(2)); 1487 &mov ($key,&wparam(3)); # key1 1488 1489 &mov ($key_,"esp"); 1490 &sub ("esp",16*7+8); 1491 &and ("esp",-16); # align stack 1492 1493 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1494 &test ($len,15); 1495 &setnz (&LB($rounds_)); 1496 &shl ($rounds_,4); 1497 &sub ($len,$rounds_); 1498 1499 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1500 &mov (&DWP(16*6+4,"esp"),0); 1501 &mov (&DWP(16*6+8,"esp"),1); 1502 &mov (&DWP(16*6+12,"esp"),0); 1503 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1504 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1505 1506 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1507 &mov ($key_,$key); # backup $key 1508 &mov ($rounds_,$rounds); # backup $rounds 1509 1510 &movdqa ($tweak,$inout0); 1511 &pxor ($twtmp,$twtmp); 1512 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1513 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1514 1515 &and ($len,-16); 1516 &sub ($len,16*6); 1517 &jc (&label("xts_dec_short")); 1518 1519 &shl ($rounds,4); 1520 &mov ($rounds_,16); 1521 &sub ($rounds_,$rounds); 1522 &lea ($key,&DWP(32,$key,$rounds)); 1523 &jmp (&label("xts_dec_loop6")); 1524 1525&set_label("xts_dec_loop6",16); 1526 for ($i=0;$i<4;$i++) { 1527 &pshufd ($twres,$twtmp,0x13); 1528 &pxor ($twtmp,$twtmp); 1529 &movdqa (&QWP(16*$i,"esp"),$tweak); 1530 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1531 &pand ($twres,$twmask); # isolate carry and residue 1532 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1533 &pxor ($tweak,$twres); 1534 } 1535 &pshufd ($inout5,$twtmp,0x13); 1536 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1537 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1538 &$movekey ($rndkey0,&QWP(0,$key_)); 1539 &pand ($inout5,$twmask); # isolate carry and residue 1540 &movups ($inout0,&QWP(0,$inp)); # load input 1541 &pxor ($inout5,$tweak); 1542 1543 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1544 &mov ($rounds,$rounds_); 1545 &movdqu ($inout1,&QWP(16*1,$inp)); 1546 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1547 &movdqu ($inout2,&QWP(16*2,$inp)); 1548 &pxor ($inout1,$rndkey0); 1549 &movdqu ($inout3,&QWP(16*3,$inp)); 1550 &pxor ($inout2,$rndkey0); 1551 &movdqu ($inout4,&QWP(16*4,$inp)); 1552 &pxor ($inout3,$rndkey0); 1553 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1554 &pxor ($inout4,$rndkey0); 1555 &lea ($inp,&DWP(16*6,$inp)); 1556 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1557 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1558 &pxor ($inout5,$rndkey1); 1559 1560 &$movekey ($rndkey1,&QWP(16,$key_)); 1561 &pxor ($inout1,&QWP(16*1,"esp")); 1562 &pxor ($inout2,&QWP(16*2,"esp")); 1563 &aesdec ($inout0,$rndkey1); 1564 &pxor ($inout3,&QWP(16*3,"esp")); 1565 &pxor ($inout4,&QWP(16*4,"esp")); 1566 &aesdec ($inout1,$rndkey1); 1567 &pxor ($inout5,$rndkey0); 1568 &$movekey ($rndkey0,&QWP(32,$key_)); 1569 &aesdec ($inout2,$rndkey1); 1570 &aesdec ($inout3,$rndkey1); 1571 &aesdec ($inout4,$rndkey1); 1572 &aesdec ($inout5,$rndkey1); 1573 &call (&label("_aesni_decrypt6_enter")); 1574 1575 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1576 &pxor ($twtmp,$twtmp); 1577 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1578 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1579 &xorps ($inout1,&QWP(16*1,"esp")); 1580 &movups (&QWP(16*0,$out),$inout0); # write output 1581 &xorps ($inout2,&QWP(16*2,"esp")); 1582 &movups (&QWP(16*1,$out),$inout1); 1583 &xorps ($inout3,&QWP(16*3,"esp")); 1584 &movups (&QWP(16*2,$out),$inout2); 1585 &xorps ($inout4,&QWP(16*4,"esp")); 1586 &movups (&QWP(16*3,$out),$inout3); 1587 &xorps ($inout5,$tweak); 1588 &movups (&QWP(16*4,$out),$inout4); 1589 &pshufd ($twres,$twtmp,0x13); 1590 &movups (&QWP(16*5,$out),$inout5); 1591 &lea ($out,&DWP(16*6,$out)); 1592 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1593 1594 &pxor ($twtmp,$twtmp); 1595 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1596 &pand ($twres,$twmask); # isolate carry and residue 1597 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1598 &pxor ($tweak,$twres); 1599 1600 &sub ($len,16*6); 1601 &jnc (&label("xts_dec_loop6")); 1602 1603 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1604 &mov ($key,$key_); # restore $key 1605 &mov ($rounds_,$rounds); 1606 1607&set_label("xts_dec_short"); 1608 &add ($len,16*6); 1609 &jz (&label("xts_dec_done6x")); 1610 1611 &movdqa ($inout3,$tweak); # put aside previous tweak 1612 &cmp ($len,0x20); 1613 &jb (&label("xts_dec_one")); 1614 1615 &pshufd ($twres,$twtmp,0x13); 1616 &pxor ($twtmp,$twtmp); 1617 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1618 &pand ($twres,$twmask); # isolate carry and residue 1619 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1620 &pxor ($tweak,$twres); 1621 &je (&label("xts_dec_two")); 1622 1623 &pshufd ($twres,$twtmp,0x13); 1624 &pxor ($twtmp,$twtmp); 1625 &movdqa ($inout4,$tweak); # put aside previous tweak 1626 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1627 &pand ($twres,$twmask); # isolate carry and residue 1628 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1629 &pxor ($tweak,$twres); 1630 &cmp ($len,0x40); 1631 &jb (&label("xts_dec_three")); 1632 1633 &pshufd ($twres,$twtmp,0x13); 1634 &pxor ($twtmp,$twtmp); 1635 &movdqa ($inout5,$tweak); # put aside previous tweak 1636 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1637 &pand ($twres,$twmask); # isolate carry and residue 1638 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1639 &pxor ($tweak,$twres); 1640 &movdqa (&QWP(16*0,"esp"),$inout3); 1641 &movdqa (&QWP(16*1,"esp"),$inout4); 1642 &je (&label("xts_dec_four")); 1643 1644 &movdqa (&QWP(16*2,"esp"),$inout5); 1645 &pshufd ($inout5,$twtmp,0x13); 1646 &movdqa (&QWP(16*3,"esp"),$tweak); 1647 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1648 &pand ($inout5,$twmask); # isolate carry and residue 1649 &pxor ($inout5,$tweak); 1650 1651 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1652 &movdqu ($inout1,&QWP(16*1,$inp)); 1653 &movdqu ($inout2,&QWP(16*2,$inp)); 1654 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1655 &movdqu ($inout3,&QWP(16*3,$inp)); 1656 &pxor ($inout1,&QWP(16*1,"esp")); 1657 &movdqu ($inout4,&QWP(16*4,$inp)); 1658 &pxor ($inout2,&QWP(16*2,"esp")); 1659 &lea ($inp,&DWP(16*5,$inp)); 1660 &pxor ($inout3,&QWP(16*3,"esp")); 1661 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1662 &pxor ($inout4,$inout5); 1663 1664 &call ("_aesni_decrypt6"); 1665 1666 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1667 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1668 &xorps ($inout1,&QWP(16*1,"esp")); 1669 &xorps ($inout2,&QWP(16*2,"esp")); 1670 &movups (&QWP(16*0,$out),$inout0); # write output 1671 &xorps ($inout3,&QWP(16*3,"esp")); 1672 &movups (&QWP(16*1,$out),$inout1); 1673 &xorps ($inout4,$tweak); 1674 &movups (&QWP(16*2,$out),$inout2); 1675 &movups (&QWP(16*3,$out),$inout3); 1676 &movups (&QWP(16*4,$out),$inout4); 1677 &lea ($out,&DWP(16*5,$out)); 1678 &jmp (&label("xts_dec_done")); 1679 1680&set_label("xts_dec_one",16); 1681 &movups ($inout0,&QWP(16*0,$inp)); # load input 1682 &lea ($inp,&DWP(16*1,$inp)); 1683 &xorps ($inout0,$inout3); # input^=tweak 1684 if ($inline) 1685 { &aesni_inline_generate1("dec"); } 1686 else 1687 { &call ("_aesni_decrypt1"); } 1688 &xorps ($inout0,$inout3); # output^=tweak 1689 &movups (&QWP(16*0,$out),$inout0); # write output 1690 &lea ($out,&DWP(16*1,$out)); 1691 1692 &movdqa ($tweak,$inout3); # last tweak 1693 &jmp (&label("xts_dec_done")); 1694 1695&set_label("xts_dec_two",16); 1696 &movaps ($inout4,$tweak); # put aside last tweak 1697 1698 &movups ($inout0,&QWP(16*0,$inp)); # load input 1699 &movups ($inout1,&QWP(16*1,$inp)); 1700 &lea ($inp,&DWP(16*2,$inp)); 1701 &xorps ($inout0,$inout3); # input^=tweak 1702 &xorps ($inout1,$inout4); 1703 1704 &call ("_aesni_decrypt2"); 1705 1706 &xorps ($inout0,$inout3); # output^=tweak 1707 &xorps ($inout1,$inout4); 1708 &movups (&QWP(16*0,$out),$inout0); # write output 1709 &movups (&QWP(16*1,$out),$inout1); 1710 &lea ($out,&DWP(16*2,$out)); 1711 1712 &movdqa ($tweak,$inout4); # last tweak 1713 &jmp (&label("xts_dec_done")); 1714 1715&set_label("xts_dec_three",16); 1716 &movaps ($inout5,$tweak); # put aside last tweak 1717 &movups ($inout0,&QWP(16*0,$inp)); # load input 1718 &movups ($inout1,&QWP(16*1,$inp)); 1719 &movups ($inout2,&QWP(16*2,$inp)); 1720 &lea ($inp,&DWP(16*3,$inp)); 1721 &xorps ($inout0,$inout3); # input^=tweak 1722 &xorps ($inout1,$inout4); 1723 &xorps ($inout2,$inout5); 1724 1725 &call ("_aesni_decrypt3"); 1726 1727 &xorps ($inout0,$inout3); # output^=tweak 1728 &xorps ($inout1,$inout4); 1729 &xorps ($inout2,$inout5); 1730 &movups (&QWP(16*0,$out),$inout0); # write output 1731 &movups (&QWP(16*1,$out),$inout1); 1732 &movups (&QWP(16*2,$out),$inout2); 1733 &lea ($out,&DWP(16*3,$out)); 1734 1735 &movdqa ($tweak,$inout5); # last tweak 1736 &jmp (&label("xts_dec_done")); 1737 1738&set_label("xts_dec_four",16); 1739 &movaps ($inout4,$tweak); # put aside last tweak 1740 1741 &movups ($inout0,&QWP(16*0,$inp)); # load input 1742 &movups ($inout1,&QWP(16*1,$inp)); 1743 &movups ($inout2,&QWP(16*2,$inp)); 1744 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1745 &movups ($inout3,&QWP(16*3,$inp)); 1746 &lea ($inp,&DWP(16*4,$inp)); 1747 &xorps ($inout1,&QWP(16*1,"esp")); 1748 &xorps ($inout2,$inout5); 1749 &xorps ($inout3,$inout4); 1750 1751 &call ("_aesni_decrypt4"); 1752 1753 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1754 &xorps ($inout1,&QWP(16*1,"esp")); 1755 &xorps ($inout2,$inout5); 1756 &movups (&QWP(16*0,$out),$inout0); # write output 1757 &xorps ($inout3,$inout4); 1758 &movups (&QWP(16*1,$out),$inout1); 1759 &movups (&QWP(16*2,$out),$inout2); 1760 &movups (&QWP(16*3,$out),$inout3); 1761 &lea ($out,&DWP(16*4,$out)); 1762 1763 &movdqa ($tweak,$inout4); # last tweak 1764 &jmp (&label("xts_dec_done")); 1765 1766&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1767 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1768 &and ($len,15); 1769 &jz (&label("xts_dec_ret")); 1770 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1771 &jmp (&label("xts_dec_only_one_more")); 1772 1773&set_label("xts_dec_done",16); 1774 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1775 &pxor ($twtmp,$twtmp); 1776 &and ($len,15); 1777 &jz (&label("xts_dec_ret")); 1778 1779 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1780 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1781 &pshufd ($twres,$twtmp,0x13); 1782 &pxor ($twtmp,$twtmp); 1783 &movdqa ($twmask,&QWP(16*6,"esp")); 1784 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1785 &pand ($twres,$twmask); # isolate carry and residue 1786 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1787 &pxor ($tweak,$twres); 1788 1789&set_label("xts_dec_only_one_more"); 1790 &pshufd ($inout3,$twtmp,0x13); 1791 &movdqa ($inout4,$tweak); # put aside previous tweak 1792 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1793 &pand ($inout3,$twmask); # isolate carry and residue 1794 &pxor ($inout3,$tweak); 1795 1796 &mov ($key,$key_); # restore $key 1797 &mov ($rounds,$rounds_); # restore $rounds 1798 1799 &movups ($inout0,&QWP(0,$inp)); # load input 1800 &xorps ($inout0,$inout3); # input^=tweak 1801 if ($inline) 1802 { &aesni_inline_generate1("dec"); } 1803 else 1804 { &call ("_aesni_decrypt1"); } 1805 &xorps ($inout0,$inout3); # output^=tweak 1806 &movups (&QWP(0,$out),$inout0); # write output 1807 1808&set_label("xts_dec_steal"); 1809 &movz ($rounds,&BP(16,$inp)); 1810 &movz ($key,&BP(0,$out)); 1811 &lea ($inp,&DWP(1,$inp)); 1812 &mov (&BP(0,$out),&LB($rounds)); 1813 &mov (&BP(16,$out),&LB($key)); 1814 &lea ($out,&DWP(1,$out)); 1815 &sub ($len,1); 1816 &jnz (&label("xts_dec_steal")); 1817 1818 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1819 &mov ($key,$key_); # restore $key 1820 &mov ($rounds,$rounds_); # restore $rounds 1821 1822 &movups ($inout0,&QWP(0,$out)); # load input 1823 &xorps ($inout0,$inout4); # input^=tweak 1824 if ($inline) 1825 { &aesni_inline_generate1("dec"); } 1826 else 1827 { &call ("_aesni_decrypt1"); } 1828 &xorps ($inout0,$inout4); # output^=tweak 1829 &movups (&QWP(0,$out),$inout0); # write output 1830 1831&set_label("xts_dec_ret"); 1832 &pxor ("xmm0","xmm0"); # clear register bank 1833 &pxor ("xmm1","xmm1"); 1834 &pxor ("xmm2","xmm2"); 1835 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1836 &pxor ("xmm3","xmm3"); 1837 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1838 &pxor ("xmm4","xmm4"); 1839 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1840 &pxor ("xmm5","xmm5"); 1841 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1842 &pxor ("xmm6","xmm6"); 1843 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1844 &pxor ("xmm7","xmm7"); 1845 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1846 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1847&function_end("aesni_xts_decrypt"); 1848} 1849 1850###################################################################### 1851# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 1852# const AES_KEY *key, unsigned int start_block_num, 1853# unsigned char offset_i[16], const unsigned char L_[][16], 1854# unsigned char checksum[16]); 1855# 1856{ 1857# offsets within stack frame 1858my $checksum = 16*6; 1859my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4)); 1860 1861# reassigned registers 1862my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out); 1863# $l_, $blocks, $inp, $key are permanently allocated in registers; 1864# remaining non-volatile ones are offloaded to stack, which even 1865# stay invariant after written to stack. 1866 1867&function_begin("aesni_ocb_encrypt"); 1868 &mov ($rounds,&wparam(5)); # &offset_i 1869 &mov ($rounds_,&wparam(7)); # &checksum 1870 1871 &mov ($inp,&wparam(0)); 1872 &mov ($out,&wparam(1)); 1873 &mov ($len,&wparam(2)); 1874 &mov ($key,&wparam(3)); 1875 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i 1876 &mov ($block,&wparam(4)); # start_block_num 1877 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum 1878 &mov ($l_,&wparam(6)); # L_ 1879 1880 &mov ($rounds,"esp"); 1881 &sub ("esp",$esp_off+4); # alloca 1882 &and ("esp",-16); # align stack 1883 1884 &sub ($out,$inp); 1885 &shl ($len,4); 1886 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 1887 &mov (&DWP($out_off,"esp"),$out); 1888 &mov (&DWP($end_off,"esp"),$len); 1889 &mov (&DWP($esp_off,"esp"),$rounds); 1890 1891 &mov ($rounds,&DWP(240,$key)); 1892 1893 &test ($block,1); 1894 &jnz (&label("odd")); 1895 1896 &bsf ($i3,$block); 1897 &add ($block,1); 1898 &shl ($i3,4); 1899 &movdqu ($inout5,&QWP(0,$l_,$i3)); 1900 &mov ($i3,$key); # put aside key 1901 1902 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1903 &lea ($inp,&DWP(16,$inp)); 1904 1905 &pxor ($inout5,$rndkey0); # ^ last offset_i 1906 &pxor ($rndkey1,$inout0); # checksum 1907 &pxor ($inout0,$inout5); # ^ offset_i 1908 1909 &movdqa ($inout4,$rndkey1); 1910 if ($inline) 1911 { &aesni_inline_generate1("enc"); } 1912 else 1913 { &call ("_aesni_encrypt1"); } 1914 1915 &xorps ($inout0,$inout5); # ^ offset_i 1916 &movdqa ($rndkey0,$inout5); # pass last offset_i 1917 &movdqa ($rndkey1,$inout4); # pass the checksum 1918 1919 &movups (&QWP(-16,$out,$inp),$inout0); # store output 1920 1921 &mov ($rounds,&DWP(240,$i3)); 1922 &mov ($key,$i3); # restore key 1923 &mov ($len,&DWP($end_off,"esp")); 1924 1925&set_label("odd"); 1926 &shl ($rounds,4); 1927 &mov ($out,16); 1928 &sub ($out,$rounds); # twisted rounds 1929 &mov (&DWP($key_off,"esp"),$key); 1930 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule 1931 &mov (&DWP($rounds_off,"esp"),$out); 1932 1933 &cmp ($inp,$len); 1934 &ja (&label("short")); 1935 &jmp (&label("grandloop")); 1936 1937&set_label("grandloop",32); 1938 &lea ($i1,&DWP(1,$block)); 1939 &lea ($i3,&DWP(3,$block)); 1940 &lea ($i5,&DWP(5,$block)); 1941 &add ($block,6); 1942 &bsf ($i1,$i1); 1943 &bsf ($i3,$i3); 1944 &bsf ($i5,$i5); 1945 &shl ($i1,4); 1946 &shl ($i3,4); 1947 &shl ($i5,4); 1948 &movdqu ($inout0,&QWP(0,$l_)); 1949 &movdqu ($inout1,&QWP(0,$l_,$i1)); 1950 &mov ($rounds,&DWP($rounds_off,"esp")); 1951 &movdqa ($inout2,$inout0); 1952 &movdqu ($inout3,&QWP(0,$l_,$i3)); 1953 &movdqa ($inout4,$inout0); 1954 &movdqu ($inout5,&QWP(0,$l_,$i5)); 1955 1956 &pxor ($inout0,$rndkey0); # ^ last offset_i 1957 &pxor ($inout1,$inout0); 1958 &movdqa (&QWP(16*0,"esp"),$inout0); 1959 &pxor ($inout2,$inout1); 1960 &movdqa (&QWP(16*1,"esp"),$inout1); 1961 &pxor ($inout3,$inout2); 1962 &movdqa (&QWP(16*2,"esp"),$inout2); 1963 &pxor ($inout4,$inout3); 1964 &movdqa (&QWP(16*3,"esp"),$inout3); 1965 &pxor ($inout5,$inout4); 1966 &movdqa (&QWP(16*4,"esp"),$inout4); 1967 &movdqa (&QWP(16*5,"esp"),$inout5); 1968 1969 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 1970 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1971 &movdqu ($inout1,&QWP(16*1,$inp)); 1972 &movdqu ($inout2,&QWP(16*2,$inp)); 1973 &movdqu ($inout3,&QWP(16*3,$inp)); 1974 &movdqu ($inout4,&QWP(16*4,$inp)); 1975 &movdqu ($inout5,&QWP(16*5,$inp)); 1976 &lea ($inp,&DWP(16*6,$inp)); 1977 1978 &pxor ($rndkey1,$inout0); # checksum 1979 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 1980 &pxor ($rndkey1,$inout1); 1981 &pxor ($inout1,$rndkey0); 1982 &pxor ($rndkey1,$inout2); 1983 &pxor ($inout2,$rndkey0); 1984 &pxor ($rndkey1,$inout3); 1985 &pxor ($inout3,$rndkey0); 1986 &pxor ($rndkey1,$inout4); 1987 &pxor ($inout4,$rndkey0); 1988 &pxor ($rndkey1,$inout5); 1989 &pxor ($inout5,$rndkey0); 1990 &movdqa (&QWP($checksum,"esp"),$rndkey1); 1991 1992 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 1993 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 1994 &pxor ($inout1,&QWP(16*1,"esp")); 1995 &pxor ($inout2,&QWP(16*2,"esp")); 1996 &pxor ($inout3,&QWP(16*3,"esp")); 1997 &pxor ($inout4,&QWP(16*4,"esp")); 1998 &pxor ($inout5,&QWP(16*5,"esp")); 1999 2000 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2001 &aesenc ($inout0,$rndkey1); 2002 &aesenc ($inout1,$rndkey1); 2003 &aesenc ($inout2,$rndkey1); 2004 &aesenc ($inout3,$rndkey1); 2005 &aesenc ($inout4,$rndkey1); 2006 &aesenc ($inout5,$rndkey1); 2007 2008 &mov ($out,&DWP($out_off,"esp")); 2009 &mov ($len,&DWP($end_off,"esp")); 2010 &call ("_aesni_encrypt6_enter"); 2011 2012 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i 2013 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2014 &pxor ($inout1,&QWP(16*1,"esp")); 2015 &pxor ($inout2,&QWP(16*2,"esp")); 2016 &pxor ($inout3,&QWP(16*3,"esp")); 2017 &pxor ($inout4,&QWP(16*4,"esp")); 2018 &pxor ($inout5,$rndkey0); 2019 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2020 2021 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output 2022 &movdqu (&QWP(-16*5,$out,$inp),$inout1); 2023 &movdqu (&QWP(-16*4,$out,$inp),$inout2); 2024 &movdqu (&QWP(-16*3,$out,$inp),$inout3); 2025 &movdqu (&QWP(-16*2,$out,$inp),$inout4); 2026 &movdqu (&QWP(-16*1,$out,$inp),$inout5); 2027 &cmp ($inp,$len); # done yet? 2028 &jbe (&label("grandloop")); 2029 2030&set_label("short"); 2031 &add ($len,16*6); 2032 &sub ($len,$inp); 2033 &jz (&label("done")); 2034 2035 &cmp ($len,16*2); 2036 &jb (&label("one")); 2037 &je (&label("two")); 2038 2039 &cmp ($len,16*4); 2040 &jb (&label("three")); 2041 &je (&label("four")); 2042 2043 &lea ($i1,&DWP(1,$block)); 2044 &lea ($i3,&DWP(3,$block)); 2045 &bsf ($i1,$i1); 2046 &bsf ($i3,$i3); 2047 &shl ($i1,4); 2048 &shl ($i3,4); 2049 &movdqu ($inout0,&QWP(0,$l_)); 2050 &movdqu ($inout1,&QWP(0,$l_,$i1)); 2051 &mov ($rounds,&DWP($rounds_off,"esp")); 2052 &movdqa ($inout2,$inout0); 2053 &movdqu ($inout3,&QWP(0,$l_,$i3)); 2054 &movdqa ($inout4,$inout0); 2055 2056 &pxor ($inout0,$rndkey0); # ^ last offset_i 2057 &pxor ($inout1,$inout0); 2058 &movdqa (&QWP(16*0,"esp"),$inout0); 2059 &pxor ($inout2,$inout1); 2060 &movdqa (&QWP(16*1,"esp"),$inout1); 2061 &pxor ($inout3,$inout2); 2062 &movdqa (&QWP(16*2,"esp"),$inout2); 2063 &pxor ($inout4,$inout3); 2064 &movdqa (&QWP(16*3,"esp"),$inout3); 2065 &pxor ($inout5,$inout4); 2066 &movdqa (&QWP(16*4,"esp"),$inout4); 2067 2068 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2069 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2070 &movdqu ($inout1,&QWP(16*1,$inp)); 2071 &movdqu ($inout2,&QWP(16*2,$inp)); 2072 &movdqu ($inout3,&QWP(16*3,$inp)); 2073 &movdqu ($inout4,&QWP(16*4,$inp)); 2074 &pxor ($inout5,$inout5); 2075 2076 &pxor ($rndkey1,$inout0); # checksum 2077 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2078 &pxor ($rndkey1,$inout1); 2079 &pxor ($inout1,$rndkey0); 2080 &pxor ($rndkey1,$inout2); 2081 &pxor ($inout2,$rndkey0); 2082 &pxor ($rndkey1,$inout3); 2083 &pxor ($inout3,$rndkey0); 2084 &pxor ($rndkey1,$inout4); 2085 &pxor ($inout4,$rndkey0); 2086 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2087 2088 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2089 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2090 &pxor ($inout1,&QWP(16*1,"esp")); 2091 &pxor ($inout2,&QWP(16*2,"esp")); 2092 &pxor ($inout3,&QWP(16*3,"esp")); 2093 &pxor ($inout4,&QWP(16*4,"esp")); 2094 2095 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2096 &aesenc ($inout0,$rndkey1); 2097 &aesenc ($inout1,$rndkey1); 2098 &aesenc ($inout2,$rndkey1); 2099 &aesenc ($inout3,$rndkey1); 2100 &aesenc ($inout4,$rndkey1); 2101 &aesenc ($inout5,$rndkey1); 2102 2103 &mov ($out,&DWP($out_off,"esp")); 2104 &call ("_aesni_encrypt6_enter"); 2105 2106 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i 2107 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2108 &pxor ($inout1,&QWP(16*1,"esp")); 2109 &pxor ($inout2,&QWP(16*2,"esp")); 2110 &pxor ($inout3,&QWP(16*3,"esp")); 2111 &pxor ($inout4,$rndkey0); 2112 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2113 2114 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output 2115 &movdqu (&QWP(16*1,$out,$inp),$inout1); 2116 &movdqu (&QWP(16*2,$out,$inp),$inout2); 2117 &movdqu (&QWP(16*3,$out,$inp),$inout3); 2118 &movdqu (&QWP(16*4,$out,$inp),$inout4); 2119 2120 &jmp (&label("done")); 2121 2122&set_label("one",16); 2123 &movdqu ($inout5,&QWP(0,$l_)); 2124 &mov ($key,&DWP($key_off,"esp")); # restore key 2125 2126 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2127 &mov ($rounds,&DWP(240,$key)); 2128 2129 &pxor ($inout5,$rndkey0); # ^ last offset_i 2130 &pxor ($rndkey1,$inout0); # checksum 2131 &pxor ($inout0,$inout5); # ^ offset_i 2132 2133 &movdqa ($inout4,$rndkey1); 2134 &mov ($out,&DWP($out_off,"esp")); 2135 if ($inline) 2136 { &aesni_inline_generate1("enc"); } 2137 else 2138 { &call ("_aesni_encrypt1"); } 2139 2140 &xorps ($inout0,$inout5); # ^ offset_i 2141 &movdqa ($rndkey0,$inout5); # pass last offset_i 2142 &movdqa ($rndkey1,$inout4); # pass the checksum 2143 &movups (&QWP(0,$out,$inp),$inout0); 2144 2145 &jmp (&label("done")); 2146 2147&set_label("two",16); 2148 &lea ($i1,&DWP(1,$block)); 2149 &mov ($key,&DWP($key_off,"esp")); # restore key 2150 &bsf ($i1,$i1); 2151 &shl ($i1,4); 2152 &movdqu ($inout4,&QWP(0,$l_)); 2153 &movdqu ($inout5,&QWP(0,$l_,$i1)); 2154 2155 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2156 &movdqu ($inout1,&QWP(16*1,$inp)); 2157 &mov ($rounds,&DWP(240,$key)); 2158 2159 &pxor ($inout4,$rndkey0); # ^ last offset_i 2160 &pxor ($inout5,$inout4); 2161 2162 &pxor ($rndkey1,$inout0); # checksum 2163 &pxor ($inout0,$inout4); # ^ offset_i 2164 &pxor ($rndkey1,$inout1); 2165 &pxor ($inout1,$inout5); 2166 2167 &movdqa ($inout3,$rndkey1) 2168 &mov ($out,&DWP($out_off,"esp")); 2169 &call ("_aesni_encrypt2"); 2170 2171 &xorps ($inout0,$inout4); # ^ offset_i 2172 &xorps ($inout1,$inout5); 2173 &movdqa ($rndkey0,$inout5); # pass last offset_i 2174 &movdqa ($rndkey1,$inout3); # pass the checksum 2175 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2176 &movups (&QWP(16*1,$out,$inp),$inout1); 2177 2178 &jmp (&label("done")); 2179 2180&set_label("three",16); 2181 &lea ($i1,&DWP(1,$block)); 2182 &mov ($key,&DWP($key_off,"esp")); # restore key 2183 &bsf ($i1,$i1); 2184 &shl ($i1,4); 2185 &movdqu ($inout3,&QWP(0,$l_)); 2186 &movdqu ($inout4,&QWP(0,$l_,$i1)); 2187 &movdqa ($inout5,$inout3); 2188 2189 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2190 &movdqu ($inout1,&QWP(16*1,$inp)); 2191 &movdqu ($inout2,&QWP(16*2,$inp)); 2192 &mov ($rounds,&DWP(240,$key)); 2193 2194 &pxor ($inout3,$rndkey0); # ^ last offset_i 2195 &pxor ($inout4,$inout3); 2196 &pxor ($inout5,$inout4); 2197 2198 &pxor ($rndkey1,$inout0); # checksum 2199 &pxor ($inout0,$inout3); # ^ offset_i 2200 &pxor ($rndkey1,$inout1); 2201 &pxor ($inout1,$inout4); 2202 &pxor ($rndkey1,$inout2); 2203 &pxor ($inout2,$inout5); 2204 2205 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2206 &mov ($out,&DWP($out_off,"esp")); 2207 &call ("_aesni_encrypt3"); 2208 2209 &xorps ($inout0,$inout3); # ^ offset_i 2210 &xorps ($inout1,$inout4); 2211 &xorps ($inout2,$inout5); 2212 &movdqa ($rndkey0,$inout5); # pass last offset_i 2213 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2214 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2215 &movups (&QWP(16*1,$out,$inp),$inout1); 2216 &movups (&QWP(16*2,$out,$inp),$inout2); 2217 2218 &jmp (&label("done")); 2219 2220&set_label("four",16); 2221 &lea ($i1,&DWP(1,$block)); 2222 &lea ($i3,&DWP(3,$block)); 2223 &bsf ($i1,$i1); 2224 &bsf ($i3,$i3); 2225 &mov ($key,&DWP($key_off,"esp")); # restore key 2226 &shl ($i1,4); 2227 &shl ($i3,4); 2228 &movdqu ($inout2,&QWP(0,$l_)); 2229 &movdqu ($inout3,&QWP(0,$l_,$i1)); 2230 &movdqa ($inout4,$inout2); 2231 &movdqu ($inout5,&QWP(0,$l_,$i3)); 2232 2233 &pxor ($inout2,$rndkey0); # ^ last offset_i 2234 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2235 &pxor ($inout3,$inout2); 2236 &movdqu ($inout1,&QWP(16*1,$inp)); 2237 &pxor ($inout4,$inout3); 2238 &movdqa (&QWP(16*0,"esp"),$inout2); 2239 &pxor ($inout5,$inout4); 2240 &movdqa (&QWP(16*1,"esp"),$inout3); 2241 &movdqu ($inout2,&QWP(16*2,$inp)); 2242 &movdqu ($inout3,&QWP(16*3,$inp)); 2243 &mov ($rounds,&DWP(240,$key)); 2244 2245 &pxor ($rndkey1,$inout0); # checksum 2246 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2247 &pxor ($rndkey1,$inout1); 2248 &pxor ($inout1,&QWP(16*1,"esp")); 2249 &pxor ($rndkey1,$inout2); 2250 &pxor ($inout2,$inout4); 2251 &pxor ($rndkey1,$inout3); 2252 &pxor ($inout3,$inout5); 2253 2254 &movdqa (&QWP($checksum,"esp"),$rndkey1) 2255 &mov ($out,&DWP($out_off,"esp")); 2256 &call ("_aesni_encrypt4"); 2257 2258 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2259 &xorps ($inout1,&QWP(16*1,"esp")); 2260 &xorps ($inout2,$inout4); 2261 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2262 &xorps ($inout3,$inout5); 2263 &movups (&QWP(16*1,$out,$inp),$inout1); 2264 &movdqa ($rndkey0,$inout5); # pass last offset_i 2265 &movups (&QWP(16*2,$out,$inp),$inout2); 2266 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2267 &movups (&QWP(16*3,$out,$inp),$inout3); 2268 2269&set_label("done"); 2270 &mov ($key,&DWP($esp_off,"esp")); 2271 &pxor ($inout0,$inout0); # clear register bank 2272 &pxor ($inout1,$inout1); 2273 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack 2274 &pxor ($inout2,$inout2); 2275 &movdqa (&QWP(16*1,"esp"),$inout0); 2276 &pxor ($inout3,$inout3); 2277 &movdqa (&QWP(16*2,"esp"),$inout0); 2278 &pxor ($inout4,$inout4); 2279 &movdqa (&QWP(16*3,"esp"),$inout0); 2280 &pxor ($inout5,$inout5); 2281 &movdqa (&QWP(16*4,"esp"),$inout0); 2282 &movdqa (&QWP(16*5,"esp"),$inout0); 2283 &movdqa (&QWP(16*6,"esp"),$inout0); 2284 2285 &lea ("esp",&DWP(0,$key)); 2286 &mov ($rounds,&wparam(5)); # &offset_i 2287 &mov ($rounds_,&wparam(7)); # &checksum 2288 &movdqu (&QWP(0,$rounds),$rndkey0); 2289 &pxor ($rndkey0,$rndkey0); 2290 &movdqu (&QWP(0,$rounds_),$rndkey1); 2291 &pxor ($rndkey1,$rndkey1); 2292&function_end("aesni_ocb_encrypt"); 2293 2294&function_begin("aesni_ocb_decrypt"); 2295 &mov ($rounds,&wparam(5)); # &offset_i 2296 &mov ($rounds_,&wparam(7)); # &checksum 2297 2298 &mov ($inp,&wparam(0)); 2299 &mov ($out,&wparam(1)); 2300 &mov ($len,&wparam(2)); 2301 &mov ($key,&wparam(3)); 2302 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i 2303 &mov ($block,&wparam(4)); # start_block_num 2304 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum 2305 &mov ($l_,&wparam(6)); # L_ 2306 2307 &mov ($rounds,"esp"); 2308 &sub ("esp",$esp_off+4); # alloca 2309 &and ("esp",-16); # align stack 2310 2311 &sub ($out,$inp); 2312 &shl ($len,4); 2313 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 2314 &mov (&DWP($out_off,"esp"),$out); 2315 &mov (&DWP($end_off,"esp"),$len); 2316 &mov (&DWP($esp_off,"esp"),$rounds); 2317 2318 &mov ($rounds,&DWP(240,$key)); 2319 2320 &test ($block,1); 2321 &jnz (&label("odd")); 2322 2323 &bsf ($i3,$block); 2324 &add ($block,1); 2325 &shl ($i3,4); 2326 &movdqu ($inout5,&QWP(0,$l_,$i3)); 2327 &mov ($i3,$key); # put aside key 2328 2329 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2330 &lea ($inp,&DWP(16,$inp)); 2331 2332 &pxor ($inout5,$rndkey0); # ^ last offset_i 2333 &pxor ($inout0,$inout5); # ^ offset_i 2334 2335 &movdqa ($inout4,$rndkey1); 2336 if ($inline) 2337 { &aesni_inline_generate1("dec"); } 2338 else 2339 { &call ("_aesni_decrypt1"); } 2340 2341 &xorps ($inout0,$inout5); # ^ offset_i 2342 &movaps ($rndkey1,$inout4); # pass the checksum 2343 &movdqa ($rndkey0,$inout5); # pass last offset_i 2344 &xorps ($rndkey1,$inout0); # checksum 2345 &movups (&QWP(-16,$out,$inp),$inout0); # store output 2346 2347 &mov ($rounds,&DWP(240,$i3)); 2348 &mov ($key,$i3); # restore key 2349 &mov ($len,&DWP($end_off,"esp")); 2350 2351&set_label("odd"); 2352 &shl ($rounds,4); 2353 &mov ($out,16); 2354 &sub ($out,$rounds); # twisted rounds 2355 &mov (&DWP($key_off,"esp"),$key); 2356 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule 2357 &mov (&DWP($rounds_off,"esp"),$out); 2358 2359 &cmp ($inp,$len); 2360 &ja (&label("short")); 2361 &jmp (&label("grandloop")); 2362 2363&set_label("grandloop",32); 2364 &lea ($i1,&DWP(1,$block)); 2365 &lea ($i3,&DWP(3,$block)); 2366 &lea ($i5,&DWP(5,$block)); 2367 &add ($block,6); 2368 &bsf ($i1,$i1); 2369 &bsf ($i3,$i3); 2370 &bsf ($i5,$i5); 2371 &shl ($i1,4); 2372 &shl ($i3,4); 2373 &shl ($i5,4); 2374 &movdqu ($inout0,&QWP(0,$l_)); 2375 &movdqu ($inout1,&QWP(0,$l_,$i1)); 2376 &mov ($rounds,&DWP($rounds_off,"esp")); 2377 &movdqa ($inout2,$inout0); 2378 &movdqu ($inout3,&QWP(0,$l_,$i3)); 2379 &movdqa ($inout4,$inout0); 2380 &movdqu ($inout5,&QWP(0,$l_,$i5)); 2381 2382 &pxor ($inout0,$rndkey0); # ^ last offset_i 2383 &pxor ($inout1,$inout0); 2384 &movdqa (&QWP(16*0,"esp"),$inout0); 2385 &pxor ($inout2,$inout1); 2386 &movdqa (&QWP(16*1,"esp"),$inout1); 2387 &pxor ($inout3,$inout2); 2388 &movdqa (&QWP(16*2,"esp"),$inout2); 2389 &pxor ($inout4,$inout3); 2390 &movdqa (&QWP(16*3,"esp"),$inout3); 2391 &pxor ($inout5,$inout4); 2392 &movdqa (&QWP(16*4,"esp"),$inout4); 2393 &movdqa (&QWP(16*5,"esp"),$inout5); 2394 2395 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2396 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2397 &movdqu ($inout1,&QWP(16*1,$inp)); 2398 &movdqu ($inout2,&QWP(16*2,$inp)); 2399 &movdqu ($inout3,&QWP(16*3,$inp)); 2400 &movdqu ($inout4,&QWP(16*4,$inp)); 2401 &movdqu ($inout5,&QWP(16*5,$inp)); 2402 &lea ($inp,&DWP(16*6,$inp)); 2403 2404 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2405 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2406 &pxor ($inout1,$rndkey0); 2407 &pxor ($inout2,$rndkey0); 2408 &pxor ($inout3,$rndkey0); 2409 &pxor ($inout4,$rndkey0); 2410 &pxor ($inout5,$rndkey0); 2411 2412 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2413 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2414 &pxor ($inout1,&QWP(16*1,"esp")); 2415 &pxor ($inout2,&QWP(16*2,"esp")); 2416 &pxor ($inout3,&QWP(16*3,"esp")); 2417 &pxor ($inout4,&QWP(16*4,"esp")); 2418 &pxor ($inout5,&QWP(16*5,"esp")); 2419 2420 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2421 &aesdec ($inout0,$rndkey1); 2422 &aesdec ($inout1,$rndkey1); 2423 &aesdec ($inout2,$rndkey1); 2424 &aesdec ($inout3,$rndkey1); 2425 &aesdec ($inout4,$rndkey1); 2426 &aesdec ($inout5,$rndkey1); 2427 2428 &mov ($out,&DWP($out_off,"esp")); 2429 &mov ($len,&DWP($end_off,"esp")); 2430 &call ("_aesni_decrypt6_enter"); 2431 2432 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i 2433 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2434 &movdqa ($rndkey1,&QWP($checksum,"esp")); 2435 &pxor ($inout1,&QWP(16*1,"esp")); 2436 &pxor ($inout2,&QWP(16*2,"esp")); 2437 &pxor ($inout3,&QWP(16*3,"esp")); 2438 &pxor ($inout4,&QWP(16*4,"esp")); 2439 &pxor ($inout5,$rndkey0); 2440 2441 &pxor ($rndkey1,$inout0); # checksum 2442 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output 2443 &pxor ($rndkey1,$inout1); 2444 &movdqu (&QWP(-16*5,$out,$inp),$inout1); 2445 &pxor ($rndkey1,$inout2); 2446 &movdqu (&QWP(-16*4,$out,$inp),$inout2); 2447 &pxor ($rndkey1,$inout3); 2448 &movdqu (&QWP(-16*3,$out,$inp),$inout3); 2449 &pxor ($rndkey1,$inout4); 2450 &movdqu (&QWP(-16*2,$out,$inp),$inout4); 2451 &pxor ($rndkey1,$inout5); 2452 &movdqu (&QWP(-16*1,$out,$inp),$inout5); 2453 &cmp ($inp,$len); # done yet? 2454 &jbe (&label("grandloop")); 2455 2456&set_label("short"); 2457 &add ($len,16*6); 2458 &sub ($len,$inp); 2459 &jz (&label("done")); 2460 2461 &cmp ($len,16*2); 2462 &jb (&label("one")); 2463 &je (&label("two")); 2464 2465 &cmp ($len,16*4); 2466 &jb (&label("three")); 2467 &je (&label("four")); 2468 2469 &lea ($i1,&DWP(1,$block)); 2470 &lea ($i3,&DWP(3,$block)); 2471 &bsf ($i1,$i1); 2472 &bsf ($i3,$i3); 2473 &shl ($i1,4); 2474 &shl ($i3,4); 2475 &movdqu ($inout0,&QWP(0,$l_)); 2476 &movdqu ($inout1,&QWP(0,$l_,$i1)); 2477 &mov ($rounds,&DWP($rounds_off,"esp")); 2478 &movdqa ($inout2,$inout0); 2479 &movdqu ($inout3,&QWP(0,$l_,$i3)); 2480 &movdqa ($inout4,$inout0); 2481 2482 &pxor ($inout0,$rndkey0); # ^ last offset_i 2483 &pxor ($inout1,$inout0); 2484 &movdqa (&QWP(16*0,"esp"),$inout0); 2485 &pxor ($inout2,$inout1); 2486 &movdqa (&QWP(16*1,"esp"),$inout1); 2487 &pxor ($inout3,$inout2); 2488 &movdqa (&QWP(16*2,"esp"),$inout2); 2489 &pxor ($inout4,$inout3); 2490 &movdqa (&QWP(16*3,"esp"),$inout3); 2491 &pxor ($inout5,$inout4); 2492 &movdqa (&QWP(16*4,"esp"),$inout4); 2493 2494 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2495 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2496 &movdqu ($inout1,&QWP(16*1,$inp)); 2497 &movdqu ($inout2,&QWP(16*2,$inp)); 2498 &movdqu ($inout3,&QWP(16*3,$inp)); 2499 &movdqu ($inout4,&QWP(16*4,$inp)); 2500 &pxor ($inout5,$inout5); 2501 2502 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2503 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2504 &pxor ($inout1,$rndkey0); 2505 &pxor ($inout2,$rndkey0); 2506 &pxor ($inout3,$rndkey0); 2507 &pxor ($inout4,$rndkey0); 2508 2509 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2510 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2511 &pxor ($inout1,&QWP(16*1,"esp")); 2512 &pxor ($inout2,&QWP(16*2,"esp")); 2513 &pxor ($inout3,&QWP(16*3,"esp")); 2514 &pxor ($inout4,&QWP(16*4,"esp")); 2515 2516 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2517 &aesdec ($inout0,$rndkey1); 2518 &aesdec ($inout1,$rndkey1); 2519 &aesdec ($inout2,$rndkey1); 2520 &aesdec ($inout3,$rndkey1); 2521 &aesdec ($inout4,$rndkey1); 2522 &aesdec ($inout5,$rndkey1); 2523 2524 &mov ($out,&DWP($out_off,"esp")); 2525 &call ("_aesni_decrypt6_enter"); 2526 2527 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i 2528 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2529 &movdqa ($rndkey1,&QWP($checksum,"esp")); 2530 &pxor ($inout1,&QWP(16*1,"esp")); 2531 &pxor ($inout2,&QWP(16*2,"esp")); 2532 &pxor ($inout3,&QWP(16*3,"esp")); 2533 &pxor ($inout4,$rndkey0); 2534 2535 &pxor ($rndkey1,$inout0); # checksum 2536 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output 2537 &pxor ($rndkey1,$inout1); 2538 &movdqu (&QWP(16*1,$out,$inp),$inout1); 2539 &pxor ($rndkey1,$inout2); 2540 &movdqu (&QWP(16*2,$out,$inp),$inout2); 2541 &pxor ($rndkey1,$inout3); 2542 &movdqu (&QWP(16*3,$out,$inp),$inout3); 2543 &pxor ($rndkey1,$inout4); 2544 &movdqu (&QWP(16*4,$out,$inp),$inout4); 2545 2546 &jmp (&label("done")); 2547 2548&set_label("one",16); 2549 &movdqu ($inout5,&QWP(0,$l_)); 2550 &mov ($key,&DWP($key_off,"esp")); # restore key 2551 2552 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2553 &mov ($rounds,&DWP(240,$key)); 2554 2555 &pxor ($inout5,$rndkey0); # ^ last offset_i 2556 &pxor ($inout0,$inout5); # ^ offset_i 2557 2558 &movdqa ($inout4,$rndkey1); 2559 &mov ($out,&DWP($out_off,"esp")); 2560 if ($inline) 2561 { &aesni_inline_generate1("dec"); } 2562 else 2563 { &call ("_aesni_decrypt1"); } 2564 2565 &xorps ($inout0,$inout5); # ^ offset_i 2566 &movaps ($rndkey1,$inout4); # pass the checksum 2567 &movdqa ($rndkey0,$inout5); # pass last offset_i 2568 &xorps ($rndkey1,$inout0); # checksum 2569 &movups (&QWP(0,$out,$inp),$inout0); 2570 2571 &jmp (&label("done")); 2572 2573&set_label("two",16); 2574 &lea ($i1,&DWP(1,$block)); 2575 &mov ($key,&DWP($key_off,"esp")); # restore key 2576 &bsf ($i1,$i1); 2577 &shl ($i1,4); 2578 &movdqu ($inout4,&QWP(0,$l_)); 2579 &movdqu ($inout5,&QWP(0,$l_,$i1)); 2580 2581 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2582 &movdqu ($inout1,&QWP(16*1,$inp)); 2583 &mov ($rounds,&DWP(240,$key)); 2584 2585 &movdqa ($inout3,$rndkey1); 2586 &pxor ($inout4,$rndkey0); # ^ last offset_i 2587 &pxor ($inout5,$inout4); 2588 2589 &pxor ($inout0,$inout4); # ^ offset_i 2590 &pxor ($inout1,$inout5); 2591 2592 &mov ($out,&DWP($out_off,"esp")); 2593 &call ("_aesni_decrypt2"); 2594 2595 &xorps ($inout0,$inout4); # ^ offset_i 2596 &xorps ($inout1,$inout5); 2597 &movdqa ($rndkey0,$inout5); # pass last offset_i 2598 &xorps ($inout3,$inout0); # checksum 2599 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2600 &xorps ($inout3,$inout1); 2601 &movups (&QWP(16*1,$out,$inp),$inout1); 2602 &movaps ($rndkey1,$inout3); # pass the checksum 2603 2604 &jmp (&label("done")); 2605 2606&set_label("three",16); 2607 &lea ($i1,&DWP(1,$block)); 2608 &mov ($key,&DWP($key_off,"esp")); # restore key 2609 &bsf ($i1,$i1); 2610 &shl ($i1,4); 2611 &movdqu ($inout3,&QWP(0,$l_)); 2612 &movdqu ($inout4,&QWP(0,$l_,$i1)); 2613 &movdqa ($inout5,$inout3); 2614 2615 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2616 &movdqu ($inout1,&QWP(16*1,$inp)); 2617 &movdqu ($inout2,&QWP(16*2,$inp)); 2618 &mov ($rounds,&DWP(240,$key)); 2619 2620 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2621 &pxor ($inout3,$rndkey0); # ^ last offset_i 2622 &pxor ($inout4,$inout3); 2623 &pxor ($inout5,$inout4); 2624 2625 &pxor ($inout0,$inout3); # ^ offset_i 2626 &pxor ($inout1,$inout4); 2627 &pxor ($inout2,$inout5); 2628 2629 &mov ($out,&DWP($out_off,"esp")); 2630 &call ("_aesni_decrypt3"); 2631 2632 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2633 &xorps ($inout0,$inout3); # ^ offset_i 2634 &xorps ($inout1,$inout4); 2635 &xorps ($inout2,$inout5); 2636 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2637 &pxor ($rndkey1,$inout0); # checksum 2638 &movdqa ($rndkey0,$inout5); # pass last offset_i 2639 &movups (&QWP(16*1,$out,$inp),$inout1); 2640 &pxor ($rndkey1,$inout1); 2641 &movups (&QWP(16*2,$out,$inp),$inout2); 2642 &pxor ($rndkey1,$inout2); 2643 2644 &jmp (&label("done")); 2645 2646&set_label("four",16); 2647 &lea ($i1,&DWP(1,$block)); 2648 &lea ($i3,&DWP(3,$block)); 2649 &bsf ($i1,$i1); 2650 &bsf ($i3,$i3); 2651 &mov ($key,&DWP($key_off,"esp")); # restore key 2652 &shl ($i1,4); 2653 &shl ($i3,4); 2654 &movdqu ($inout2,&QWP(0,$l_)); 2655 &movdqu ($inout3,&QWP(0,$l_,$i1)); 2656 &movdqa ($inout4,$inout2); 2657 &movdqu ($inout5,&QWP(0,$l_,$i3)); 2658 2659 &pxor ($inout2,$rndkey0); # ^ last offset_i 2660 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2661 &pxor ($inout3,$inout2); 2662 &movdqu ($inout1,&QWP(16*1,$inp)); 2663 &pxor ($inout4,$inout3); 2664 &movdqa (&QWP(16*0,"esp"),$inout2); 2665 &pxor ($inout5,$inout4); 2666 &movdqa (&QWP(16*1,"esp"),$inout3); 2667 &movdqu ($inout2,&QWP(16*2,$inp)); 2668 &movdqu ($inout3,&QWP(16*3,$inp)); 2669 &mov ($rounds,&DWP(240,$key)); 2670 2671 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2672 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2673 &pxor ($inout1,&QWP(16*1,"esp")); 2674 &pxor ($inout2,$inout4); 2675 &pxor ($inout3,$inout5); 2676 2677 &mov ($out,&DWP($out_off,"esp")); 2678 &call ("_aesni_decrypt4"); 2679 2680 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2681 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2682 &xorps ($inout1,&QWP(16*1,"esp")); 2683 &xorps ($inout2,$inout4); 2684 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2685 &pxor ($rndkey1,$inout0); # checksum 2686 &xorps ($inout3,$inout5); 2687 &movups (&QWP(16*1,$out,$inp),$inout1); 2688 &pxor ($rndkey1,$inout1); 2689 &movdqa ($rndkey0,$inout5); # pass last offset_i 2690 &movups (&QWP(16*2,$out,$inp),$inout2); 2691 &pxor ($rndkey1,$inout2); 2692 &movups (&QWP(16*3,$out,$inp),$inout3); 2693 &pxor ($rndkey1,$inout3); 2694 2695&set_label("done"); 2696 &mov ($key,&DWP($esp_off,"esp")); 2697 &pxor ($inout0,$inout0); # clear register bank 2698 &pxor ($inout1,$inout1); 2699 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack 2700 &pxor ($inout2,$inout2); 2701 &movdqa (&QWP(16*1,"esp"),$inout0); 2702 &pxor ($inout3,$inout3); 2703 &movdqa (&QWP(16*2,"esp"),$inout0); 2704 &pxor ($inout4,$inout4); 2705 &movdqa (&QWP(16*3,"esp"),$inout0); 2706 &pxor ($inout5,$inout5); 2707 &movdqa (&QWP(16*4,"esp"),$inout0); 2708 &movdqa (&QWP(16*5,"esp"),$inout0); 2709 &movdqa (&QWP(16*6,"esp"),$inout0); 2710 2711 &lea ("esp",&DWP(0,$key)); 2712 &mov ($rounds,&wparam(5)); # &offset_i 2713 &mov ($rounds_,&wparam(7)); # &checksum 2714 &movdqu (&QWP(0,$rounds),$rndkey0); 2715 &pxor ($rndkey0,$rndkey0); 2716 &movdqu (&QWP(0,$rounds_),$rndkey1); 2717 &pxor ($rndkey1,$rndkey1); 2718&function_end("aesni_ocb_decrypt"); 2719} 2720} 2721 2722###################################################################### 2723# void $PREFIX_cbc_encrypt (const void *inp, void *out, 2724# size_t length, const AES_KEY *key, 2725# unsigned char *ivp,const int enc); 2726&function_begin("${PREFIX}_cbc_encrypt"); 2727 &mov ($inp,&wparam(0)); 2728 &mov ($rounds_,"esp"); 2729 &mov ($out,&wparam(1)); 2730 &sub ($rounds_,24); 2731 &mov ($len,&wparam(2)); 2732 &and ($rounds_,-16); 2733 &mov ($key,&wparam(3)); 2734 &mov ($key_,&wparam(4)); 2735 &test ($len,$len); 2736 &jz (&label("cbc_abort")); 2737 2738 &cmp (&wparam(5),0); 2739 &xchg ($rounds_,"esp"); # alloca 2740 &movups ($ivec,&QWP(0,$key_)); # load IV 2741 &mov ($rounds,&DWP(240,$key)); 2742 &mov ($key_,$key); # backup $key 2743 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 2744 &mov ($rounds_,$rounds); # backup $rounds 2745 &je (&label("cbc_decrypt")); 2746 2747 &movaps ($inout0,$ivec); 2748 &cmp ($len,16); 2749 &jb (&label("cbc_enc_tail")); 2750 &sub ($len,16); 2751 &jmp (&label("cbc_enc_loop")); 2752 2753&set_label("cbc_enc_loop",16); 2754 &movups ($ivec,&QWP(0,$inp)); # input actually 2755 &lea ($inp,&DWP(16,$inp)); 2756 if ($inline) 2757 { &aesni_inline_generate1("enc",$inout0,$ivec); } 2758 else 2759 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 2760 &mov ($rounds,$rounds_); # restore $rounds 2761 &mov ($key,$key_); # restore $key 2762 &movups (&QWP(0,$out),$inout0); # store output 2763 &lea ($out,&DWP(16,$out)); 2764 &sub ($len,16); 2765 &jnc (&label("cbc_enc_loop")); 2766 &add ($len,16); 2767 &jnz (&label("cbc_enc_tail")); 2768 &movaps ($ivec,$inout0); 2769 &pxor ($inout0,$inout0); 2770 &jmp (&label("cbc_ret")); 2771 2772&set_label("cbc_enc_tail"); 2773 &mov ("ecx",$len); # zaps $rounds 2774 &data_word(0xA4F3F689); # rep movsb 2775 &mov ("ecx",16); # zero tail 2776 &sub ("ecx",$len); 2777 &xor ("eax","eax"); # zaps $len 2778 &data_word(0xAAF3F689); # rep stosb 2779 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 2780 &mov ($rounds,$rounds_); # restore $rounds 2781 &mov ($inp,$out); # $inp and $out are the same 2782 &mov ($key,$key_); # restore $key 2783 &jmp (&label("cbc_enc_loop")); 2784###################################################################### 2785&set_label("cbc_decrypt",16); 2786 &cmp ($len,0x50); 2787 &jbe (&label("cbc_dec_tail")); 2788 &movaps (&QWP(0,"esp"),$ivec); # save IV 2789 &sub ($len,0x50); 2790 &jmp (&label("cbc_dec_loop6_enter")); 2791 2792&set_label("cbc_dec_loop6",16); 2793 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 2794 &movups (&QWP(0,$out),$inout5); 2795 &lea ($out,&DWP(0x10,$out)); 2796&set_label("cbc_dec_loop6_enter"); 2797 &movdqu ($inout0,&QWP(0,$inp)); 2798 &movdqu ($inout1,&QWP(0x10,$inp)); 2799 &movdqu ($inout2,&QWP(0x20,$inp)); 2800 &movdqu ($inout3,&QWP(0x30,$inp)); 2801 &movdqu ($inout4,&QWP(0x40,$inp)); 2802 &movdqu ($inout5,&QWP(0x50,$inp)); 2803 2804 &call ("_aesni_decrypt6"); 2805 2806 &movups ($rndkey1,&QWP(0,$inp)); 2807 &movups ($rndkey0,&QWP(0x10,$inp)); 2808 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 2809 &xorps ($inout1,$rndkey1); 2810 &movups ($rndkey1,&QWP(0x20,$inp)); 2811 &xorps ($inout2,$rndkey0); 2812 &movups ($rndkey0,&QWP(0x30,$inp)); 2813 &xorps ($inout3,$rndkey1); 2814 &movups ($rndkey1,&QWP(0x40,$inp)); 2815 &xorps ($inout4,$rndkey0); 2816 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 2817 &xorps ($inout5,$rndkey1); 2818 &movups (&QWP(0,$out),$inout0); 2819 &movups (&QWP(0x10,$out),$inout1); 2820 &lea ($inp,&DWP(0x60,$inp)); 2821 &movups (&QWP(0x20,$out),$inout2); 2822 &mov ($rounds,$rounds_); # restore $rounds 2823 &movups (&QWP(0x30,$out),$inout3); 2824 &mov ($key,$key_); # restore $key 2825 &movups (&QWP(0x40,$out),$inout4); 2826 &lea ($out,&DWP(0x50,$out)); 2827 &sub ($len,0x60); 2828 &ja (&label("cbc_dec_loop6")); 2829 2830 &movaps ($inout0,$inout5); 2831 &movaps ($ivec,$rndkey0); 2832 &add ($len,0x50); 2833 &jle (&label("cbc_dec_clear_tail_collected")); 2834 &movups (&QWP(0,$out),$inout0); 2835 &lea ($out,&DWP(0x10,$out)); 2836&set_label("cbc_dec_tail"); 2837 &movups ($inout0,&QWP(0,$inp)); 2838 &movaps ($in0,$inout0); 2839 &cmp ($len,0x10); 2840 &jbe (&label("cbc_dec_one")); 2841 2842 &movups ($inout1,&QWP(0x10,$inp)); 2843 &movaps ($in1,$inout1); 2844 &cmp ($len,0x20); 2845 &jbe (&label("cbc_dec_two")); 2846 2847 &movups ($inout2,&QWP(0x20,$inp)); 2848 &cmp ($len,0x30); 2849 &jbe (&label("cbc_dec_three")); 2850 2851 &movups ($inout3,&QWP(0x30,$inp)); 2852 &cmp ($len,0x40); 2853 &jbe (&label("cbc_dec_four")); 2854 2855 &movups ($inout4,&QWP(0x40,$inp)); 2856 &movaps (&QWP(0,"esp"),$ivec); # save IV 2857 &movups ($inout0,&QWP(0,$inp)); 2858 &xorps ($inout5,$inout5); 2859 &call ("_aesni_decrypt6"); 2860 &movups ($rndkey1,&QWP(0,$inp)); 2861 &movups ($rndkey0,&QWP(0x10,$inp)); 2862 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 2863 &xorps ($inout1,$rndkey1); 2864 &movups ($rndkey1,&QWP(0x20,$inp)); 2865 &xorps ($inout2,$rndkey0); 2866 &movups ($rndkey0,&QWP(0x30,$inp)); 2867 &xorps ($inout3,$rndkey1); 2868 &movups ($ivec,&QWP(0x40,$inp)); # IV 2869 &xorps ($inout4,$rndkey0); 2870 &movups (&QWP(0,$out),$inout0); 2871 &movups (&QWP(0x10,$out),$inout1); 2872 &pxor ($inout1,$inout1); 2873 &movups (&QWP(0x20,$out),$inout2); 2874 &pxor ($inout2,$inout2); 2875 &movups (&QWP(0x30,$out),$inout3); 2876 &pxor ($inout3,$inout3); 2877 &lea ($out,&DWP(0x40,$out)); 2878 &movaps ($inout0,$inout4); 2879 &pxor ($inout4,$inout4); 2880 &sub ($len,0x50); 2881 &jmp (&label("cbc_dec_tail_collected")); 2882 2883&set_label("cbc_dec_one",16); 2884 if ($inline) 2885 { &aesni_inline_generate1("dec"); } 2886 else 2887 { &call ("_aesni_decrypt1"); } 2888 &xorps ($inout0,$ivec); 2889 &movaps ($ivec,$in0); 2890 &sub ($len,0x10); 2891 &jmp (&label("cbc_dec_tail_collected")); 2892 2893&set_label("cbc_dec_two",16); 2894 &call ("_aesni_decrypt2"); 2895 &xorps ($inout0,$ivec); 2896 &xorps ($inout1,$in0); 2897 &movups (&QWP(0,$out),$inout0); 2898 &movaps ($inout0,$inout1); 2899 &pxor ($inout1,$inout1); 2900 &lea ($out,&DWP(0x10,$out)); 2901 &movaps ($ivec,$in1); 2902 &sub ($len,0x20); 2903 &jmp (&label("cbc_dec_tail_collected")); 2904 2905&set_label("cbc_dec_three",16); 2906 &call ("_aesni_decrypt3"); 2907 &xorps ($inout0,$ivec); 2908 &xorps ($inout1,$in0); 2909 &xorps ($inout2,$in1); 2910 &movups (&QWP(0,$out),$inout0); 2911 &movaps ($inout0,$inout2); 2912 &pxor ($inout2,$inout2); 2913 &movups (&QWP(0x10,$out),$inout1); 2914 &pxor ($inout1,$inout1); 2915 &lea ($out,&DWP(0x20,$out)); 2916 &movups ($ivec,&QWP(0x20,$inp)); 2917 &sub ($len,0x30); 2918 &jmp (&label("cbc_dec_tail_collected")); 2919 2920&set_label("cbc_dec_four",16); 2921 &call ("_aesni_decrypt4"); 2922 &movups ($rndkey1,&QWP(0x10,$inp)); 2923 &movups ($rndkey0,&QWP(0x20,$inp)); 2924 &xorps ($inout0,$ivec); 2925 &movups ($ivec,&QWP(0x30,$inp)); 2926 &xorps ($inout1,$in0); 2927 &movups (&QWP(0,$out),$inout0); 2928 &xorps ($inout2,$rndkey1); 2929 &movups (&QWP(0x10,$out),$inout1); 2930 &pxor ($inout1,$inout1); 2931 &xorps ($inout3,$rndkey0); 2932 &movups (&QWP(0x20,$out),$inout2); 2933 &pxor ($inout2,$inout2); 2934 &lea ($out,&DWP(0x30,$out)); 2935 &movaps ($inout0,$inout3); 2936 &pxor ($inout3,$inout3); 2937 &sub ($len,0x40); 2938 &jmp (&label("cbc_dec_tail_collected")); 2939 2940&set_label("cbc_dec_clear_tail_collected",16); 2941 &pxor ($inout1,$inout1); 2942 &pxor ($inout2,$inout2); 2943 &pxor ($inout3,$inout3); 2944 &pxor ($inout4,$inout4); 2945&set_label("cbc_dec_tail_collected"); 2946 &and ($len,15); 2947 &jnz (&label("cbc_dec_tail_partial")); 2948 &movups (&QWP(0,$out),$inout0); 2949 &pxor ($rndkey0,$rndkey0); 2950 &jmp (&label("cbc_ret")); 2951 2952&set_label("cbc_dec_tail_partial",16); 2953 &movaps (&QWP(0,"esp"),$inout0); 2954 &pxor ($rndkey0,$rndkey0); 2955 &mov ("ecx",16); 2956 &mov ($inp,"esp"); 2957 &sub ("ecx",$len); 2958 &data_word(0xA4F3F689); # rep movsb 2959 &movdqa (&QWP(0,"esp"),$inout0); 2960 2961&set_label("cbc_ret"); 2962 &mov ("esp",&DWP(16,"esp")); # pull original %esp 2963 &mov ($key_,&wparam(4)); 2964 &pxor ($inout0,$inout0); 2965 &pxor ($rndkey1,$rndkey1); 2966 &movups (&QWP(0,$key_),$ivec); # output IV 2967 &pxor ($ivec,$ivec); 2968&set_label("cbc_abort"); 2969&function_end("${PREFIX}_cbc_encrypt"); 2970 2971###################################################################### 2972# Mechanical port from aesni-x86_64.pl. 2973# 2974# _aesni_set_encrypt_key is private interface, 2975# input: 2976# "eax" const unsigned char *userKey 2977# $rounds int bits 2978# $key AES_KEY *key 2979# output: 2980# "eax" return code 2981# $round rounds 2982 2983&function_begin_B("_aesni_set_encrypt_key"); 2984 &push ("ebp"); 2985 &push ("ebx"); 2986 &test ("eax","eax"); 2987 &jz (&label("bad_pointer")); 2988 &test ($key,$key); 2989 &jz (&label("bad_pointer")); 2990 2991 &call (&label("pic")); 2992&set_label("pic"); 2993 &blindpop("ebx"); 2994 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); 2995 2996 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); 2997 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey 2998 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 2999 &mov ("ebp",&DWP(4,"ebp")); 3000 &lea ($key,&DWP(16,$key)); 3001 &and ("ebp",1<<28|1<<11); # AVX and XOP bits 3002 &cmp ($rounds,256); 3003 &je (&label("14rounds")); 3004 &cmp ($rounds,192); 3005 &je (&label("12rounds")); 3006 &cmp ($rounds,128); 3007 &jne (&label("bad_keybits")); 3008 3009&set_label("10rounds",16); 3010 &cmp ("ebp",1<<28); 3011 &je (&label("10rounds_alt")); 3012 3013 &mov ($rounds,9); 3014 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 3015 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 3016 &call (&label("key_128_cold")); 3017 &aeskeygenassist("xmm1","xmm0",0x2); # round 2 3018 &call (&label("key_128")); 3019 &aeskeygenassist("xmm1","xmm0",0x04); # round 3 3020 &call (&label("key_128")); 3021 &aeskeygenassist("xmm1","xmm0",0x08); # round 4 3022 &call (&label("key_128")); 3023 &aeskeygenassist("xmm1","xmm0",0x10); # round 5 3024 &call (&label("key_128")); 3025 &aeskeygenassist("xmm1","xmm0",0x20); # round 6 3026 &call (&label("key_128")); 3027 &aeskeygenassist("xmm1","xmm0",0x40); # round 7 3028 &call (&label("key_128")); 3029 &aeskeygenassist("xmm1","xmm0",0x80); # round 8 3030 &call (&label("key_128")); 3031 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 3032 &call (&label("key_128")); 3033 &aeskeygenassist("xmm1","xmm0",0x36); # round 10 3034 &call (&label("key_128")); 3035 &$movekey (&QWP(0,$key),"xmm0"); 3036 &mov (&DWP(80,$key),$rounds); 3037 3038 &jmp (&label("good_key")); 3039 3040&set_label("key_128",16); 3041 &$movekey (&QWP(0,$key),"xmm0"); 3042 &lea ($key,&DWP(16,$key)); 3043&set_label("key_128_cold"); 3044 &shufps ("xmm4","xmm0",0b00010000); 3045 &xorps ("xmm0","xmm4"); 3046 &shufps ("xmm4","xmm0",0b10001100); 3047 &xorps ("xmm0","xmm4"); 3048 &shufps ("xmm1","xmm1",0b11111111); # critical path 3049 &xorps ("xmm0","xmm1"); 3050 &ret(); 3051 3052&set_label("10rounds_alt",16); 3053 &movdqa ("xmm5",&QWP(0x00,"ebx")); 3054 &mov ($rounds,8); 3055 &movdqa ("xmm4",&QWP(0x20,"ebx")); 3056 &movdqa ("xmm2","xmm0"); 3057 &movdqu (&QWP(-16,$key),"xmm0"); 3058 3059&set_label("loop_key128"); 3060 &pshufb ("xmm0","xmm5"); 3061 &aesenclast ("xmm0","xmm4"); 3062 &pslld ("xmm4",1); 3063 &lea ($key,&DWP(16,$key)); 3064 3065 &movdqa ("xmm3","xmm2"); 3066 &pslldq ("xmm2",4); 3067 &pxor ("xmm3","xmm2"); 3068 &pslldq ("xmm2",4); 3069 &pxor ("xmm3","xmm2"); 3070 &pslldq ("xmm2",4); 3071 &pxor ("xmm2","xmm3"); 3072 3073 &pxor ("xmm0","xmm2"); 3074 &movdqu (&QWP(-16,$key),"xmm0"); 3075 &movdqa ("xmm2","xmm0"); 3076 3077 &dec ($rounds); 3078 &jnz (&label("loop_key128")); 3079 3080 &movdqa ("xmm4",&QWP(0x30,"ebx")); 3081 3082 &pshufb ("xmm0","xmm5"); 3083 &aesenclast ("xmm0","xmm4"); 3084 &pslld ("xmm4",1); 3085 3086 &movdqa ("xmm3","xmm2"); 3087 &pslldq ("xmm2",4); 3088 &pxor ("xmm3","xmm2"); 3089 &pslldq ("xmm2",4); 3090 &pxor ("xmm3","xmm2"); 3091 &pslldq ("xmm2",4); 3092 &pxor ("xmm2","xmm3"); 3093 3094 &pxor ("xmm0","xmm2"); 3095 &movdqu (&QWP(0,$key),"xmm0"); 3096 3097 &movdqa ("xmm2","xmm0"); 3098 &pshufb ("xmm0","xmm5"); 3099 &aesenclast ("xmm0","xmm4"); 3100 3101 &movdqa ("xmm3","xmm2"); 3102 &pslldq ("xmm2",4); 3103 &pxor ("xmm3","xmm2"); 3104 &pslldq ("xmm2",4); 3105 &pxor ("xmm3","xmm2"); 3106 &pslldq ("xmm2",4); 3107 &pxor ("xmm2","xmm3"); 3108 3109 &pxor ("xmm0","xmm2"); 3110 &movdqu (&QWP(16,$key),"xmm0"); 3111 3112 &mov ($rounds,9); 3113 &mov (&DWP(96,$key),$rounds); 3114 3115 &jmp (&label("good_key")); 3116 3117&set_label("12rounds",16); 3118 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey 3119 &cmp ("ebp",1<<28); 3120 &je (&label("12rounds_alt")); 3121 3122 &mov ($rounds,11); 3123 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 3124 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 3125 &call (&label("key_192a_cold")); 3126 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 3127 &call (&label("key_192b")); 3128 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 3129 &call (&label("key_192a")); 3130 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 3131 &call (&label("key_192b")); 3132 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 3133 &call (&label("key_192a")); 3134 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 3135 &call (&label("key_192b")); 3136 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 3137 &call (&label("key_192a")); 3138 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 3139 &call (&label("key_192b")); 3140 &$movekey (&QWP(0,$key),"xmm0"); 3141 &mov (&DWP(48,$key),$rounds); 3142 3143 &jmp (&label("good_key")); 3144 3145&set_label("key_192a",16); 3146 &$movekey (&QWP(0,$key),"xmm0"); 3147 &lea ($key,&DWP(16,$key)); 3148&set_label("key_192a_cold",16); 3149 &movaps ("xmm5","xmm2"); 3150&set_label("key_192b_warm"); 3151 &shufps ("xmm4","xmm0",0b00010000); 3152 &movdqa ("xmm3","xmm2"); 3153 &xorps ("xmm0","xmm4"); 3154 &shufps ("xmm4","xmm0",0b10001100); 3155 &pslldq ("xmm3",4); 3156 &xorps ("xmm0","xmm4"); 3157 &pshufd ("xmm1","xmm1",0b01010101); # critical path 3158 &pxor ("xmm2","xmm3"); 3159 &pxor ("xmm0","xmm1"); 3160 &pshufd ("xmm3","xmm0",0b11111111); 3161 &pxor ("xmm2","xmm3"); 3162 &ret(); 3163 3164&set_label("key_192b",16); 3165 &movaps ("xmm3","xmm0"); 3166 &shufps ("xmm5","xmm0",0b01000100); 3167 &$movekey (&QWP(0,$key),"xmm5"); 3168 &shufps ("xmm3","xmm2",0b01001110); 3169 &$movekey (&QWP(16,$key),"xmm3"); 3170 &lea ($key,&DWP(32,$key)); 3171 &jmp (&label("key_192b_warm")); 3172 3173&set_label("12rounds_alt",16); 3174 &movdqa ("xmm5",&QWP(0x10,"ebx")); 3175 &movdqa ("xmm4",&QWP(0x20,"ebx")); 3176 &mov ($rounds,8); 3177 &movdqu (&QWP(-16,$key),"xmm0"); 3178 3179&set_label("loop_key192"); 3180 &movq (&QWP(0,$key),"xmm2"); 3181 &movdqa ("xmm1","xmm2"); 3182 &pshufb ("xmm2","xmm5"); 3183 &aesenclast ("xmm2","xmm4"); 3184 &pslld ("xmm4",1); 3185 &lea ($key,&DWP(24,$key)); 3186 3187 &movdqa ("xmm3","xmm0"); 3188 &pslldq ("xmm0",4); 3189 &pxor ("xmm3","xmm0"); 3190 &pslldq ("xmm0",4); 3191 &pxor ("xmm3","xmm0"); 3192 &pslldq ("xmm0",4); 3193 &pxor ("xmm0","xmm3"); 3194 3195 &pshufd ("xmm3","xmm0",0xff); 3196 &pxor ("xmm3","xmm1"); 3197 &pslldq ("xmm1",4); 3198 &pxor ("xmm3","xmm1"); 3199 3200 &pxor ("xmm0","xmm2"); 3201 &pxor ("xmm2","xmm3"); 3202 &movdqu (&QWP(-16,$key),"xmm0"); 3203 3204 &dec ($rounds); 3205 &jnz (&label("loop_key192")); 3206 3207 &mov ($rounds,11); 3208 &mov (&DWP(32,$key),$rounds); 3209 3210 &jmp (&label("good_key")); 3211 3212&set_label("14rounds",16); 3213 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey 3214 &lea ($key,&DWP(16,$key)); 3215 &cmp ("ebp",1<<28); 3216 &je (&label("14rounds_alt")); 3217 3218 &mov ($rounds,13); 3219 &$movekey (&QWP(-32,$key),"xmm0"); # round 0 3220 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 3221 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 3222 &call (&label("key_256a_cold")); 3223 &aeskeygenassist("xmm1","xmm0",0x01); # round 3 3224 &call (&label("key_256b")); 3225 &aeskeygenassist("xmm1","xmm2",0x02); # round 4 3226 &call (&label("key_256a")); 3227 &aeskeygenassist("xmm1","xmm0",0x02); # round 5 3228 &call (&label("key_256b")); 3229 &aeskeygenassist("xmm1","xmm2",0x04); # round 6 3230 &call (&label("key_256a")); 3231 &aeskeygenassist("xmm1","xmm0",0x04); # round 7 3232 &call (&label("key_256b")); 3233 &aeskeygenassist("xmm1","xmm2",0x08); # round 8 3234 &call (&label("key_256a")); 3235 &aeskeygenassist("xmm1","xmm0",0x08); # round 9 3236 &call (&label("key_256b")); 3237 &aeskeygenassist("xmm1","xmm2",0x10); # round 10 3238 &call (&label("key_256a")); 3239 &aeskeygenassist("xmm1","xmm0",0x10); # round 11 3240 &call (&label("key_256b")); 3241 &aeskeygenassist("xmm1","xmm2",0x20); # round 12 3242 &call (&label("key_256a")); 3243 &aeskeygenassist("xmm1","xmm0",0x20); # round 13 3244 &call (&label("key_256b")); 3245 &aeskeygenassist("xmm1","xmm2",0x40); # round 14 3246 &call (&label("key_256a")); 3247 &$movekey (&QWP(0,$key),"xmm0"); 3248 &mov (&DWP(16,$key),$rounds); 3249 &xor ("eax","eax"); 3250 3251 &jmp (&label("good_key")); 3252 3253&set_label("key_256a",16); 3254 &$movekey (&QWP(0,$key),"xmm2"); 3255 &lea ($key,&DWP(16,$key)); 3256&set_label("key_256a_cold"); 3257 &shufps ("xmm4","xmm0",0b00010000); 3258 &xorps ("xmm0","xmm4"); 3259 &shufps ("xmm4","xmm0",0b10001100); 3260 &xorps ("xmm0","xmm4"); 3261 &shufps ("xmm1","xmm1",0b11111111); # critical path 3262 &xorps ("xmm0","xmm1"); 3263 &ret(); 3264 3265&set_label("key_256b",16); 3266 &$movekey (&QWP(0,$key),"xmm0"); 3267 &lea ($key,&DWP(16,$key)); 3268 3269 &shufps ("xmm4","xmm2",0b00010000); 3270 &xorps ("xmm2","xmm4"); 3271 &shufps ("xmm4","xmm2",0b10001100); 3272 &xorps ("xmm2","xmm4"); 3273 &shufps ("xmm1","xmm1",0b10101010); # critical path 3274 &xorps ("xmm2","xmm1"); 3275 &ret(); 3276 3277&set_label("14rounds_alt",16); 3278 &movdqa ("xmm5",&QWP(0x00,"ebx")); 3279 &movdqa ("xmm4",&QWP(0x20,"ebx")); 3280 &mov ($rounds,7); 3281 &movdqu (&QWP(-32,$key),"xmm0"); 3282 &movdqa ("xmm1","xmm2"); 3283 &movdqu (&QWP(-16,$key),"xmm2"); 3284 3285&set_label("loop_key256"); 3286 &pshufb ("xmm2","xmm5"); 3287 &aesenclast ("xmm2","xmm4"); 3288 3289 &movdqa ("xmm3","xmm0"); 3290 &pslldq ("xmm0",4); 3291 &pxor ("xmm3","xmm0"); 3292 &pslldq ("xmm0",4); 3293 &pxor ("xmm3","xmm0"); 3294 &pslldq ("xmm0",4); 3295 &pxor ("xmm0","xmm3"); 3296 &pslld ("xmm4",1); 3297 3298 &pxor ("xmm0","xmm2"); 3299 &movdqu (&QWP(0,$key),"xmm0"); 3300 3301 &dec ($rounds); 3302 &jz (&label("done_key256")); 3303 3304 &pshufd ("xmm2","xmm0",0xff); 3305 &pxor ("xmm3","xmm3"); 3306 &aesenclast ("xmm2","xmm3"); 3307 3308 &movdqa ("xmm3","xmm1"); 3309 &pslldq ("xmm1",4); 3310 &pxor ("xmm3","xmm1"); 3311 &pslldq ("xmm1",4); 3312 &pxor ("xmm3","xmm1"); 3313 &pslldq ("xmm1",4); 3314 &pxor ("xmm1","xmm3"); 3315 3316 &pxor ("xmm2","xmm1"); 3317 &movdqu (&QWP(16,$key),"xmm2"); 3318 &lea ($key,&DWP(32,$key)); 3319 &movdqa ("xmm1","xmm2"); 3320 &jmp (&label("loop_key256")); 3321 3322&set_label("done_key256"); 3323 &mov ($rounds,13); 3324 &mov (&DWP(16,$key),$rounds); 3325 3326&set_label("good_key"); 3327 &pxor ("xmm0","xmm0"); 3328 &pxor ("xmm1","xmm1"); 3329 &pxor ("xmm2","xmm2"); 3330 &pxor ("xmm3","xmm3"); 3331 &pxor ("xmm4","xmm4"); 3332 &pxor ("xmm5","xmm5"); 3333 &xor ("eax","eax"); 3334 &pop ("ebx"); 3335 &pop ("ebp"); 3336 &ret (); 3337 3338&set_label("bad_pointer",4); 3339 &mov ("eax",-1); 3340 &pop ("ebx"); 3341 &pop ("ebp"); 3342 &ret (); 3343&set_label("bad_keybits",4); 3344 &pxor ("xmm0","xmm0"); 3345 &mov ("eax",-2); 3346 &pop ("ebx"); 3347 &pop ("ebp"); 3348 &ret (); 3349&function_end_B("_aesni_set_encrypt_key"); 3350 3351# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, 3352# AES_KEY *key) 3353&function_begin_B("${PREFIX}_set_encrypt_key"); 3354 &mov ("eax",&wparam(0)); 3355 &mov ($rounds,&wparam(1)); 3356 &mov ($key,&wparam(2)); 3357 &call ("_aesni_set_encrypt_key"); 3358 &ret (); 3359&function_end_B("${PREFIX}_set_encrypt_key"); 3360 3361# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, 3362# AES_KEY *key) 3363&function_begin_B("${PREFIX}_set_decrypt_key"); 3364 &mov ("eax",&wparam(0)); 3365 &mov ($rounds,&wparam(1)); 3366 &mov ($key,&wparam(2)); 3367 &call ("_aesni_set_encrypt_key"); 3368 &mov ($key,&wparam(2)); 3369 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key 3370 &test ("eax","eax"); 3371 &jnz (&label("dec_key_ret")); 3372 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule 3373 3374 &$movekey ("xmm0",&QWP(0,$key)); # just swap 3375 &$movekey ("xmm1",&QWP(0,"eax")); 3376 &$movekey (&QWP(0,"eax"),"xmm0"); 3377 &$movekey (&QWP(0,$key),"xmm1"); 3378 &lea ($key,&DWP(16,$key)); 3379 &lea ("eax",&DWP(-16,"eax")); 3380 3381&set_label("dec_key_inverse"); 3382 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse 3383 &$movekey ("xmm1",&QWP(0,"eax")); 3384 &aesimc ("xmm0","xmm0"); 3385 &aesimc ("xmm1","xmm1"); 3386 &lea ($key,&DWP(16,$key)); 3387 &lea ("eax",&DWP(-16,"eax")); 3388 &$movekey (&QWP(16,"eax"),"xmm0"); 3389 &$movekey (&QWP(-16,$key),"xmm1"); 3390 &cmp ("eax",$key); 3391 &ja (&label("dec_key_inverse")); 3392 3393 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle 3394 &aesimc ("xmm0","xmm0"); 3395 &$movekey (&QWP(0,$key),"xmm0"); 3396 3397 &pxor ("xmm0","xmm0"); 3398 &pxor ("xmm1","xmm1"); 3399 &xor ("eax","eax"); # return success 3400&set_label("dec_key_ret"); 3401 &ret (); 3402&function_end_B("${PREFIX}_set_decrypt_key"); 3403 3404&set_label("key_const",64); 3405&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); 3406&data_word(0x04070605,0x04070605,0x04070605,0x04070605); 3407&data_word(1,1,1,1); 3408&data_word(0x1b,0x1b,0x1b,0x1b); 3409&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); 3410 3411&asm_finish(); 3412 3413close STDOUT or die "error closing STDOUT: $!"; 3414