#! /usr/bin/env perl
# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number
# of utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates based
# on AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# April 2019
#
# Key to the performance of parallelizable modes is round instruction
# interleaving. But which factor to use? There is an optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing the interleave factor doesn't pay off. On the cons
# side we have code size increase and resource waste on platforms for
# which the interleave factor is too high. In other words you want it
# to be just right. So far an interleave factor of 3x has served all
# platforms well, but for ThunderX2 the optimal interleave factor was
# measured to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";

$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
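# An illustrative invocation (the exact flavour strings and output
# names come from the build system, so treat these as assumptions):
#
#	perl aesv8-armx.pl linux64 aesv8-armx.S		# 64-bit flavour
#	perl aesv8-armx.pl linux32 aesv8-armx.S		# 32-bit flavour
#
# Everything printed to STDOUT below is piped through arm-xlate.pl,
# which transliterates the mixed mnemonics to the requested flavour.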
"DCB" : ".byte"); 76 77$code=<<___; 78#include "arm_arch.h" 79 80#if __ARM_MAX_ARCH__>=7 81___ 82$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); 83$code.=<<___ if ($flavour !~ /64/); 84.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) 85.fpu neon 86#ifdef __thumb2__ 87.syntax unified 88.thumb 89# define INST(a,b,c,d) $_byte c,d|0xc,a,b 90#else 91.code 32 92# define INST(a,b,c,d) $_byte a,b,c,d 93#endif 94 95.text 96___ 97 98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, 99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to 100# maintain both 32- and 64-bit codes within single module and 101# transliterate common code to either flavour with regex vodoo. 102# 103{{{ 104my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); 105my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= 106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); 107 108 109$code.=<<___; 110.align 5 111.Lrcon: 112.long 0x01,0x01,0x01,0x01 113.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 114.long 0x1b,0x1b,0x1b,0x1b 115 116.globl ${prefix}_set_encrypt_key 117.type ${prefix}_set_encrypt_key,%function 118.align 5 119${prefix}_set_encrypt_key: 120.Lenc_key: 121___ 122$code.=<<___ if ($flavour =~ /64/); 123 AARCH64_VALID_CALL_TARGET 124 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 125 stp x29,x30,[sp,#-16]! 126 add x29,sp,#0 127___ 128$code.=<<___; 129 mov $ptr,#-1 130 cmp $inp,#0 131 b.eq .Lenc_key_abort 132 cmp $out,#0 133 b.eq .Lenc_key_abort 134 mov $ptr,#-2 135 cmp $bits,#128 136 b.lt .Lenc_key_abort 137 cmp $bits,#256 138 b.gt .Lenc_key_abort 139 tst $bits,#0x3f 140 b.ne .Lenc_key_abort 141 142 adr $ptr,.Lrcon 143 cmp $bits,#192 144 145 veor $zero,$zero,$zero 146 vld1.8 {$in0},[$inp],#16 147 mov $bits,#8 // reuse $bits 148 vld1.32 {$rcon,$mask},[$ptr],#32 149 150 b.lt .Loop128 151 b.eq .L192 152 b .L256 153 154.align 4 155.Loop128: 156 vtbl.8 $key,{$in0},$mask 157 vext.8 $tmp,$zero,$in0,#12 158 vst1.32 {$in0},[$out],#16 159 aese $key,$zero 160 subs $bits,$bits,#1 161 162 veor $in0,$in0,$tmp 163 vext.8 $tmp,$zero,$tmp,#12 164 veor $in0,$in0,$tmp 165 vext.8 $tmp,$zero,$tmp,#12 166 veor $key,$key,$rcon 167 veor $in0,$in0,$tmp 168 vshl.u8 $rcon,$rcon,#1 169 veor $in0,$in0,$key 170 b.ne .Loop128 171 172 vld1.32 {$rcon},[$ptr] 173 174 vtbl.8 $key,{$in0},$mask 175 vext.8 $tmp,$zero,$in0,#12 176 vst1.32 {$in0},[$out],#16 177 aese $key,$zero 178 179 veor $in0,$in0,$tmp 180 vext.8 $tmp,$zero,$tmp,#12 181 veor $in0,$in0,$tmp 182 vext.8 $tmp,$zero,$tmp,#12 183 veor $key,$key,$rcon 184 veor $in0,$in0,$tmp 185 vshl.u8 $rcon,$rcon,#1 186 veor $in0,$in0,$key 187 188 vtbl.8 $key,{$in0},$mask 189 vext.8 $tmp,$zero,$in0,#12 190 vst1.32 {$in0},[$out],#16 191 aese $key,$zero 192 193 veor $in0,$in0,$tmp 194 vext.8 $tmp,$zero,$tmp,#12 195 veor $in0,$in0,$tmp 196 vext.8 $tmp,$zero,$tmp,#12 197 veor $key,$key,$rcon 198 veor $in0,$in0,$tmp 199 veor $in0,$in0,$key 200 vst1.32 {$in0},[$out] 201 add $out,$out,#0x50 202 203 mov $rounds,#10 204 b .Ldone 205 206.align 4 207.L192: 208 vld1.8 {$in1},[$inp],#8 209 vmov.i8 $key,#8 // borrow $key 210 vst1.32 {$in0},[$out],#16 211 vsub.i8 $mask,$mask,$key // adjust the mask 212 213.Loop192: 214 vtbl.8 $key,{$in1},$mask 215 vext.8 $tmp,$zero,$in0,#12 216#ifdef __ARMEB__ 217 vst1.32 {$in1},[$out],#16 218 sub $out,$out,#8 219#else 220 vst1.32 {$in1},[$out],#8 221#endif 222 aese $key,$zero 223 subs $bits,$bits,#1 224 225 veor $in0,$in0,$tmp 226 vext.8 $tmp,$zero,$tmp,#12 227 veor 
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
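# For reference, the key-schedule routines above follow the usual
# low-level AES ABI (a sketch, assuming the conventional AES_KEY
# layout with the round count at byte offset 240):
#
#	int ${prefix}_set_encrypt_key(const unsigned char *user,
#				      int bits, AES_KEY *key);
#	int ${prefix}_set_decrypt_key(const unsigned char *user,
#				      int bits, AES_KEY *key);
#
# Both return 0 on success, -1 for NULL arguments and -2 for an
# unsupported key length.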
("e","mc") : ("d","imc"); 355my ($inp,$out,$key)=map("x$_",(0..2)); 356my $rounds="w3"; 357my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); 358 359$code.=<<___; 360.globl ${prefix}_${dir}crypt 361.type ${prefix}_${dir}crypt,%function 362.align 5 363${prefix}_${dir}crypt: 364___ 365$code.=<<___ if ($flavour =~ /64/); 366 AARCH64_VALID_CALL_TARGET 367___ 368$code.=<<___; 369 ldr $rounds,[$key,#240] 370 vld1.32 {$rndkey0},[$key],#16 371 vld1.8 {$inout},[$inp] 372 sub $rounds,$rounds,#2 373 vld1.32 {$rndkey1},[$key],#16 374 375.Loop_${dir}c: 376 aes$e $inout,$rndkey0 377 aes$mc $inout,$inout 378 vld1.32 {$rndkey0},[$key],#16 379 subs $rounds,$rounds,#2 380 aes$e $inout,$rndkey1 381 aes$mc $inout,$inout 382 vld1.32 {$rndkey1},[$key],#16 383 b.gt .Loop_${dir}c 384 385 aes$e $inout,$rndkey0 386 aes$mc $inout,$inout 387 vld1.32 {$rndkey0},[$key] 388 aes$e $inout,$rndkey1 389 veor $inout,$inout,$rndkey0 390 391 vst1.8 {$inout},[$out] 392 ret 393.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt 394___ 395} 396&gen_block("en"); 397&gen_block("de"); 398}}} 399 400# Performance in cycles per byte. 401# Processed with AES-ECB different key size. 402# It shows the value before and after optimization as below: 403# (before/after): 404# 405# AES-128-ECB AES-192-ECB AES-256-ECB 406# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 407# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 408 409# Optimization is implemented by loop unrolling and interleaving. 410# Commonly, we choose the unrolling factor as 5, if the input 411# data size smaller than 5 blocks, but not smaller than 3 blocks, 412# choose 3 as the unrolling factor. 413# If the input data size dsize >= 5*16 bytes, then take 5 blocks 414# as one iteration, every loop the left size lsize -= 5*16. 415# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, 416# every loop lsize -=3*16. 417# If lsize < 3*16 bytes, treat them as the tail, interleave the 418# two blocks AES instructions. 419# There is one special case, if the original input data size dsize 420# = 16 bytes, we will treat it separately to improve the 421# performance: one independent code block without LR, FP load and 422# store, just looks like what the original ECB implementation does. 423 424{{{ 425my ($inp,$out,$len,$key)=map("x$_",(0..3)); 426my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); 427my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 428 429my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 430 431### q7 last round key 432### q10-q15 q7 Last 7 round keys 433### q8-q9 preloaded round keys except last 7 keys for big size 434### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte 435 436{ 437my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 438 439my ($dat3,$in3,$tmp3); # used only in 64-bit mode 440my ($dat4,$in4,$tmp4); 441if ($flavour =~ /64/) { 442 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 443} 444 445$code.=<<___; 446.globl ${prefix}_ecb_encrypt 447.type ${prefix}_ecb_encrypt,%function 448.align 5 449${prefix}_ecb_encrypt: 450___ 451$code.=<<___ if ($flavour =~ /64/); 452 AARCH64_VALID_CALL_TARGET 453 subs $len,$len,#16 454 // Original input data size bigger than 16, jump to big size processing. 455 b.ne .Lecb_big_size 456 vld1.8 {$dat0},[$inp] 457 cmp $enc,#0 // en- or decrypting? 458 ldr $rounds,[$key,#240] 459 vld1.32 {q5-q6},[$key],#32 // load key schedule... 460 461 b.eq .Lecb_small_dec 462 aese $dat0,q5 463 aesmc $dat0,$dat0 464 vld1.32 {q8-q9},[$key],#32 // load key schedule... 
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// If the original input data size is bigger than 16, jump to big-size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16		// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
	subs	$len,$len,#16
___
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

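	// With $rounds = 10/12/14, q10-q15 and $rndlast now hold the last
	// seven round keys rk[rounds-6..rounds], while q8-q9 stream the
	// leading keys; the two subtractions leave $rounds = 2/4/6, which
	// the "subs $cnt,$cnt,#2" loops below count down two rounds at
	// a time.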
	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

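	// The csel above zeroes x6 on all but the final pass, so the
	// "add $inp,$inp,x6" rewinds the input pointer only when fewer
	// than five blocks remain and the vld1.8 loads below would
	// otherwise read past the end of the input.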
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

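	// One or two blocks remain. $dat2 holds the last block and $dat1
	// the one before it (for a single block both alias it, since the
	// cclr'ed $step made the two initial loads read the same block);
	// $len == -0x20 selects the single-block path below.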
.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___	if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

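# A sketch of the ABI the CBC routine below is called with (argument
# order as in the classic OpenSSL cbc_encrypt entry points):
#
#	void ${prefix}_cbc_encrypt(const unsigned char *in,
#				   unsigned char *out, size_t length,
#				   const AES_KEY *key,
#				   unsigned char *ivec, const int enc);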
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lcbc_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	veor	$tmp1,$in0,$rndlast
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	veor	$tmp2,$in1,$rndlast
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	veor	$tmp3,$in2,$rndlast
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	veor	$tmp4,$in3,$rndlast
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vorr	$ivec,$in4,$in4
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

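# The routine below interleaves 12 blocks and folds the final
# AddRoundKey and the keystream XOR into single EOR3 instructions.
# EOR3 comes from the Armv8.2 SHA3 extension, so callers are expected
# to dispatch here only when that feature is available.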
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));

# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));

#q_X => qX, for ldp & stp
my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));

my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));

$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_ctr32_encrypt_blocks_unroll12_eor3
.type	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
.align	5
${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-80]!
	stp	d8,d9,[sp, #16]
	stp	d10,d11,[sp, #32]
	stp	d12,d13,[sp, #48]
	stp	d14,d15,[sp, #64]
	add	x29,sp,#0

	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __AARCH64EB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{$rndping-$rndpang},[$key]	// load key schedule...
	sub	$rounds,$rounds,#4
	cmp	$len,#2
	add	$key_,$key,$roundsx,lsl#4	// pointer to last round key
	sub	$rounds,$rounds,#2
	add	$key_, $key_, #64
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
#ifndef __AARCH64EB__
	rev	$ctr, $ctr
#endif

	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail_unroll
	cmp	$len,#6
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b.lo	.Loop3x_ctr32_unroll
	cmp	$len,#9
	vorr	$dat3,$dat0,$dat0
	add	$tctr3, $ctr, #1
	vorr	$dat4,$dat0,$dat0
	add	$tctr4, $ctr, #2
	rev	$tctr3, $tctr3
	vorr	$dat5,$dat0,$dat0
	add	$ctr, $ctr, #3
	rev	$tctr4, $tctr4
	vmov.32	${dat3}[3],$tctr3
	rev	$tctr5, $ctr
	vmov.32	${dat4}[3],$tctr4
	vmov.32	${dat5}[3],$tctr5
	sub	$len,$len,#3
	b.lo	.Loop6x_ctr32_unroll

	// push regs to stack when 12 data chunks are interleaved
	stp	x19,x20,[sp,#-16]!
	stp	x21,x22,[sp,#-16]!
	stp	x23,x24,[sp,#-16]!
	stp	$dat8d,$dat9d,[sp,#-32]!
	stp	$dat10d,$dat11d,[sp,#-32]!
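	// w19-w23 hold the extra counters of the 12-way path and AAPCS64
	// makes x19-x24 callee-saved, hence the GPR spills; d8-d11 (the
	// callee-saved low halves of v8-v11 used for $dat8-$dat11) are
	// parked here as well.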

	add	$tctr6,$ctr,#1
	add	$tctr7,$ctr,#2
	add	$tctr8,$ctr,#3
	add	$tctr9,$ctr,#4
	add	$tctr10,$ctr,#5
	add	$ctr,$ctr,#6
	vorr	$dat6,$dat0,$dat0
	rev	$tctr6,$tctr6
	vorr	$dat7,$dat0,$dat0
	rev	$tctr7,$tctr7
	vorr	$dat8,$dat0,$dat0
	rev	$tctr8,$tctr8
	vorr	$dat9,$dat0,$dat0
	rev	$tctr9,$tctr9
	vorr	$dat10,$dat0,$dat0
	rev	$tctr10,$tctr10
	vorr	$dat11,$dat0,$dat0
	rev	$tctr11,$ctr

	sub	$len,$len,#6		// bias
	vmov.32	${dat6}[3],$tctr6
	vmov.32	${dat7}[3],$tctr7
	vmov.32	${dat8}[3],$tctr8
	vmov.32	${dat9}[3],$tctr9
	vmov.32	${dat10}[3],$tctr10
	vmov.32	${dat11}[3],$tctr11
	b	.Loop12x_ctr32_unroll

.align	4
.Loop12x_ctr32_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	aese	$dat6,$rndpang
	aesmc	$dat6,$dat6
	aese	$dat7,$rndpang
	aesmc	$dat7,$dat7
	aese	$dat8,$rndpang
	aesmc	$dat8,$dat8
	aese	$dat9,$rndpang
	aesmc	$dat9,$dat9
	aese	$dat10,$rndpang
	aesmc	$dat10,$dat10
	aese	$dat11,$rndpang
	aesmc	$dat11,$dat11
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Loop12x_ctr32_unroll

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16

	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	aese	$dat6,$rndpang
	aesmc	$dat6,$dat6
	aese	$dat7,$rndpang
	aesmc	$dat7,$dat7
	aese	$dat8,$rndpang
	aesmc	$dat8,$dat8
	aese	$dat9,$rndpang
	aesmc	$dat9,$dat9
	aese	$dat10,$rndpang
	aesmc	$dat10,$dat10
	aese	$dat11,$rndpang
	aesmc	$dat11,$dat11
	vld1.32	{$rndpang},[$key_],#16

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	$tctr3,$ctr,#4
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	add	$tctr4,$ctr,#5
	add	$tctr5,$ctr,#6
	rev	$tctr0,$tctr0
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	add	$tctr6,$ctr,#7
	add	$tctr7,$ctr,#8
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	add	$tctr8,$ctr,#9
	add	$tctr9,$ctr,#10
	rev	$tctr3,$tctr3
	rev	$tctr4,$tctr4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	add	$tctr10,$ctr,#11
	add	$tctr11,$ctr,#12
	rev	$tctr5,$tctr5
	rev	$tctr6,$tctr6
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	rev	$tctr7,$tctr7
	rev	$tctr8,$tctr8
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	rev	$tctr9,$tctr9
	rev	$tctr10,$tctr10
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	rev	$tctr11,$tctr11
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16

	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	vld1.8	{$in0,$in1,$in2,$in3},[$inp],#64
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	aese	$dat6,$rndpang
	aesmc	$dat6,$dat6
	aese	$dat7,$rndpang
	aesmc	$dat7,$dat7
	vld1.8	{$in4,$in5,$in6,$in7},[$inp],#64
	aese	$dat8,$rndpang
	aesmc	$dat8,$dat8
	aese	$dat9,$rndpang
	aesmc	$dat9,$dat9
	aese	$dat10,$rndpang
	aesmc	$dat10,$dat10
	aese	$dat11,$rndpang
	aesmc	$dat11,$dat11
	vld1.8	{$in8,$in9,$in10,$in11},[$inp],#64
	vld1.32	{$rndpang},[$key_],#16

	mov	$key_, $key
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16	// re-pre-load rndkey[0]

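	// Last round: eor3 (from the SHA3 extension) XORs the input,
	// the last round key and the AES state in one instruction,
	// replacing the veor pairs used by the plain CTR code below.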
	aese	$dat0,$rndpang
	eor3	$in0,$in0,$rndlast,$dat0
	vorr	$dat0,$ivec,$ivec
	aese	$dat1,$rndpang
	eor3	$in1,$in1,$rndlast,$dat1
	vorr	$dat1,$ivec,$ivec
	aese	$dat2,$rndpang
	eor3	$in2,$in2,$rndlast,$dat2
	vorr	$dat2,$ivec,$ivec
	aese	$dat3,$rndpang
	eor3	$in3,$in3,$rndlast,$dat3
	vorr	$dat3,$ivec,$ivec
	aese	$dat4,$rndpang
	eor3	$in4,$in4,$rndlast,$dat4
	vorr	$dat4,$ivec,$ivec
	aese	$dat5,$rndpang
	eor3	$in5,$in5,$rndlast,$dat5
	vorr	$dat5,$ivec,$ivec
	aese	$dat6,$rndpang
	eor3	$in6,$in6,$rndlast,$dat6
	vorr	$dat6,$ivec,$ivec
	aese	$dat7,$rndpang
	eor3	$in7,$in7,$rndlast,$dat7
	vorr	$dat7,$ivec,$ivec
	aese	$dat8,$rndpang
	eor3	$in8,$in8,$rndlast,$dat8
	vorr	$dat8,$ivec,$ivec
	aese	$dat9,$rndpang
	eor3	$in9,$in9,$rndlast,$dat9
	vorr	$dat9,$ivec,$ivec
	aese	$dat10,$rndpang
	eor3	$in10,$in10,$rndlast,$dat10
	vorr	$dat10,$ivec,$ivec
	aese	$dat11,$rndpang
	eor3	$in11,$in11,$rndlast,$dat11
	vorr	$dat11,$ivec,$ivec
	vld1.32	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]

	vmov.32	${dat0}[3],$tctr0
	vmov.32	${dat1}[3],$tctr1
	vmov.32	${dat2}[3],$tctr2
	vmov.32	${dat3}[3],$tctr3
	vst1.8	{$in0,$in1,$in2,$in3},[$out],#64
	vmov.32	${dat4}[3],$tctr4
	vmov.32	${dat5}[3],$tctr5
	vmov.32	${dat6}[3],$tctr6
	vmov.32	${dat7}[3],$tctr7
	vst1.8	{$in4,$in5,$in6,$in7},[$out],#64
	vmov.32	${dat8}[3],$tctr8
	vmov.32	${dat9}[3],$tctr9
	vmov.32	${dat10}[3],$tctr10
	vmov.32	${dat11}[3],$tctr11
	vst1.8	{$in8,$in9,$in10,$in11},[$out],#64

	mov	$cnt,$rounds

	add	$ctr,$ctr,#12
	subs	$len,$len,#12
	b.hs	.Loop12x_ctr32_unroll

	// pop regs from stack when 12 data chunks are interleaved
	ldp	$dat10d,$dat11d,[sp],#32
	ldp	$dat8d,$dat9d,[sp],#32
	ldp	x23,x24,[sp],#16
	ldp	x21,x22,[sp],#16
	ldp	x19,x20,[sp],#16

	add	$len,$len,#12
	cbz	$len,.Lctr32_done_unroll
	sub	$ctr,$ctr,#12

	cmp	$len,#2
	b.ls	.Lctr32_tail_unroll

	cmp	$len,#6
	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
	b.lo	.Loop3x_ctr32_unroll

	sub	$len,$len,#3
	add	$ctr,$ctr,#3
	b.lo	.Loop6x_ctr32_unroll

.align	4
.Loop6x_ctr32_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Loop6x_ctr32_unroll

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	vld1.32	{$rndping},[$key_],#16

	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	vld1.32	{$rndpang},[$key_],#16

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	$tctr3,$ctr,#4
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	add	$tctr4,$ctr,#5
	add	$tctr5,$ctr,#6
	rev	$tctr0,$tctr0
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	rev	$tctr3,$tctr3
	rev	$tctr4,$tctr4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	rev	$tctr5,$tctr5
	vld1.32	{$rndping},[$key_],#16

	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	vld1.8	{$in0,$in1,$in2,$in3},[$inp],#64
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	vld1.8	{$in4,$in5},[$inp],#32
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	vld1.32	{$rndpang},[$key_],#16

	mov	$key_, $key
$key_, $key 2273 aese $dat0,$rndping 2274 aesmc $dat0,$dat0 2275 aese $dat1,$rndping 2276 aesmc $dat1,$dat1 2277 aese $dat2,$rndping 2278 aesmc $dat2,$dat2 2279 aese $dat3,$rndping 2280 aesmc $dat3,$dat3 2281 aese $dat4,$rndping 2282 aesmc $dat4,$dat4 2283 aese $dat5,$rndping 2284 aesmc $dat5,$dat5 2285 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] 2286 2287 aese $dat0,$rndpang 2288 eor3 $in0,$in0,$rndlast,$dat0 2289 aese $dat1,$rndpang 2290 eor3 $in1,$in1,$rndlast,$dat1 2291 aese $dat2,$rndpang 2292 eor3 $in2,$in2,$rndlast,$dat2 2293 aese $dat3,$rndpang 2294 eor3 $in3,$in3,$rndlast,$dat3 2295 aese $dat4,$rndpang 2296 eor3 $in4,$in4,$rndlast,$dat4 2297 aese $dat5,$rndpang 2298 eor3 $in5,$in5,$rndlast,$dat5 2299 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] 2300 2301 vorr $dat0,$ivec,$ivec 2302 vorr $dat1,$ivec,$ivec 2303 vorr $dat2,$ivec,$ivec 2304 vorr $dat3,$ivec,$ivec 2305 vorr $dat4,$ivec,$ivec 2306 vorr $dat5,$ivec,$ivec 2307 2308 vmov.32 ${dat0}[3],$tctr0 2309 vmov.32 ${dat1}[3],$tctr1 2310 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64 2311 vmov.32 ${dat2}[3],$tctr2 2312 vmov.32 ${dat3}[3],$tctr3 2313 vst1.8 {$in4,$in5},[$out],#32 2314 vmov.32 ${dat4}[3],$tctr4 2315 vmov.32 ${dat5}[3],$tctr5 2316 2317 cbz $len,.Lctr32_done_unroll 2318 mov $cnt,$rounds 2319 2320 cmp $len,#2 2321 b.ls .Lctr32_tail_unroll 2322 2323 sub $len,$len,#3 // bias 2324 add $ctr,$ctr,#3 2325 b .Loop3x_ctr32_unroll 2326 2327.align 4 2328.Loop3x_ctr32_unroll: 2329 aese $dat0,$rndping 2330 aesmc $dat0,$dat0 2331 aese $dat1,$rndping 2332 aesmc $dat1,$dat1 2333 aese $dat2,$rndping 2334 aesmc $dat2,$dat2 2335 vld1.32 {$rndping},[$key_],#16 2336 subs $cnt,$cnt,#2 2337 aese $dat0,$rndpang 2338 aesmc $dat0,$dat0 2339 aese $dat1,$rndpang 2340 aesmc $dat1,$dat1 2341 aese $dat2,$rndpang 2342 aesmc $dat2,$dat2 2343 vld1.32 {$rndpang},[$key_],#16 2344 b.gt .Loop3x_ctr32_unroll 2345 2346 aese $dat0,$rndping 2347 aesmc $tmp0,$dat0 2348 aese $dat1,$rndping 2349 aesmc $tmp1,$dat1 2350 vld1.8 {$in0,$in1,$in2},[$inp],#48 2351 vorr $dat0,$ivec,$ivec 2352 aese $dat2,$rndping 2353 aesmc $dat2,$dat2 2354 vld1.32 {$rndping},[$key_],#16 2355 vorr $dat1,$ivec,$ivec 2356 aese $tmp0,$rndpang 2357 aesmc $tmp0,$tmp0 2358 aese $tmp1,$rndpang 2359 aesmc $tmp1,$tmp1 2360 aese $dat2,$rndpang 2361 aesmc $tmp2,$dat2 2362 vld1.32 {$rndpang},[$key_],#16 2363 vorr $dat2,$ivec,$ivec 2364 add $tctr0,$ctr,#1 2365 aese $tmp0,$rndping 2366 aesmc $tmp0,$tmp0 2367 aese $tmp1,$rndping 2368 aesmc $tmp1,$tmp1 2369 add $tctr1,$ctr,#2 2370 aese $tmp2,$rndping 2371 aesmc $tmp2,$tmp2 2372 vld1.32 {$rndping},[$key_],#16 2373 add $ctr,$ctr,#3 2374 aese $tmp0,$rndpang 2375 aesmc $tmp0,$tmp0 2376 aese $tmp1,$rndpang 2377 aesmc $tmp1,$tmp1 2378 2379 rev $tctr0,$tctr0 2380 aese $tmp2,$rndpang 2381 aesmc $tmp2,$tmp2 2382 vld1.32 {$rndpang},[$key_],#16 2383 vmov.32 ${dat0}[3], $tctr0 2384 mov $key_,$key 2385 rev $tctr1,$tctr1 2386 aese $tmp0,$rndping 2387 aesmc $tmp0,$tmp0 2388 2389 aese $tmp1,$rndping 2390 aesmc $tmp1,$tmp1 2391 vmov.32 ${dat1}[3], $tctr1 2392 rev $tctr2,$ctr 2393 aese $tmp2,$rndping 2394 aesmc $tmp2,$tmp2 2395 vmov.32 ${dat2}[3], $tctr2 2396 2397 aese $tmp0,$rndpang 2398 aese $tmp1,$rndpang 2399 aese $tmp2,$rndpang 2400 2401 eor3 $in0,$in0,$rndlast,$tmp0 2402 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] 2403 eor3 $in1,$in1,$rndlast,$tmp1 2404 mov $cnt,$rounds 2405 eor3 $in2,$in2,$rndlast,$tmp2 2406 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] 2407 vst1.8 {$in0,$in1,$in2},[$out],#48 2408 2409 cbz 
$len,.Lctr32_done_unroll 2410 2411.Lctr32_tail_unroll: 2412 cmp $len,#1 2413 b.eq .Lctr32_tail_1_unroll 2414 2415.Lctr32_tail_2_unroll: 2416 aese $dat0,$rndping 2417 aesmc $dat0,$dat0 2418 aese $dat1,$rndping 2419 aesmc $dat1,$dat1 2420 vld1.32 {$rndping},[$key_],#16 2421 subs $cnt,$cnt,#2 2422 aese $dat0,$rndpang 2423 aesmc $dat0,$dat0 2424 aese $dat1,$rndpang 2425 aesmc $dat1,$dat1 2426 vld1.32 {$rndpang},[$key_],#16 2427 b.gt .Lctr32_tail_2_unroll 2428 2429 aese $dat0,$rndping 2430 aesmc $dat0,$dat0 2431 aese $dat1,$rndping 2432 aesmc $dat1,$dat1 2433 vld1.32 {$rndping},[$key_],#16 2434 aese $dat0,$rndpang 2435 aesmc $dat0,$dat0 2436 aese $dat1,$rndpang 2437 aesmc $dat1,$dat1 2438 vld1.32 {$rndpang},[$key_],#16 2439 vld1.8 {$in0,$in1},[$inp],#32 2440 aese $dat0,$rndping 2441 aesmc $dat0,$dat0 2442 aese $dat1,$rndping 2443 aesmc $dat1,$dat1 2444 vld1.32 {$rndping},[$key_],#16 2445 aese $dat0,$rndpang 2446 aesmc $dat0,$dat0 2447 aese $dat1,$rndpang 2448 aesmc $dat1,$dat1 2449 vld1.32 {$rndpang},[$key_],#16 2450 aese $dat0,$rndping 2451 aesmc $dat0,$dat0 2452 aese $dat1,$rndping 2453 aesmc $dat1,$dat1 2454 aese $dat0,$rndpang 2455 aese $dat1,$rndpang 2456 2457 eor3 $in0,$in0,$rndlast,$dat0 2458 eor3 $in1,$in1,$rndlast,$dat1 2459 vst1.8 {$in0,$in1},[$out],#32 2460 b .Lctr32_done_unroll 2461 2462.Lctr32_tail_1_unroll: 2463 aese $dat0,$rndping 2464 aesmc $dat0,$dat0 2465 vld1.32 {$rndping},[$key_],#16 2466 subs $cnt,$cnt,#2 2467 aese $dat0,$rndpang 2468 aesmc $dat0,$dat0 2469 vld1.32 {$rndpang},[$key_],#16 2470 b.gt .Lctr32_tail_1_unroll 2471 2472 aese $dat0,$rndping 2473 aesmc $dat0,$dat0 2474 vld1.32 {$rndping},[$key_],#16 2475 aese $dat0,$rndpang 2476 aesmc $dat0,$dat0 2477 vld1.32 {$rndpang},[$key_],#16 2478 vld1.8 {$in0},[$inp] 2479 aese $dat0,$rndping 2480 aesmc $dat0,$dat0 2481 vld1.32 {$rndping},[$key_],#16 2482 aese $dat0,$rndpang 2483 aesmc $dat0,$dat0 2484 vld1.32 {$rndpang},[$key_],#16 2485 aese $dat0,$rndping 2486 aesmc $dat0,$dat0 2487 aese $dat0,$rndpang 2488 2489 eor3 $in0,$in0,$rndlast,$dat0 2490 vst1.8 {$in0},[$out],#16 2491 2492.Lctr32_done_unroll: 2493 ldp d8,d9,[sp, #16] 2494 ldp d10,d11,[sp, #32] 2495 ldp d12,d13,[sp, #48] 2496 ldp d15,d16,[sp, #64] 2497 ldr x29,[sp],#80 2498 ret 2499.size ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3 2500___ 2501}}} 2502 2503{{{ 2504my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); 2505my ($rounds,$cnt,$key_)=("w5","w6","x7"); 2506my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); 2507my $step="x12"; # aliases with $tctr2 2508 2509my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 2510my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 2511 2512# used only in 64-bit mode... 2513my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); 2514 2515my ($dat,$tmp)=($dat0,$tmp0); 2516 2517### q8-q15 preloaded key schedule 2518 2519$code.=<<___; 2520.globl ${prefix}_ctr32_encrypt_blocks 2521.type ${prefix}_ctr32_encrypt_blocks,%function 2522.align 5 2523${prefix}_ctr32_encrypt_blocks: 2524___ 2525$code.=<<___ if ($flavour =~ /64/); 2526 AARCH64_VALID_CALL_TARGET 2527 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 2528 stp x29,x30,[sp,#-16]! 
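# The CTR path above keeps the IV block in a vector register and
# re-materializes only lane 3 per block: the low 32 bits of the counter
# are incremented as an integer, byte-reversed with "rev" (the counter is
# big-endian on the wire) and re-inserted with "vmov.32". Below is a
# hedged Perl reference model of that keystream schedule; it is never
# called by this script, and the one-block $aes_encrypt callback is an
# assumption for illustration only.
sub _ctr32_keystream_ref {
	my ($aes_encrypt, $ivec, $nblocks) = @_;
	my $prefix = substr($ivec, 0, 12);		# bytes 0-11 carried unchanged
	my $ctr = unpack("N", substr($ivec, 12, 4));	# big-endian 32-bit counter
	my @ks;
	for my $i (0 .. $nblocks - 1) {
		# wrap modulo 2^32, matching the add/rev arithmetic above
		push @ks, $aes_encrypt->($prefix . pack("N", ($ctr + $i) % 2**32));
	}
	return @ks;					# caller XORs these with the plaintext
}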
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ctr32

	add	w13,$ctr,#1
	add	w14,$ctr,#2
	vorr	$dat3,$dat0,$dat0
	rev	w13,w13
	vorr	$dat4,$dat0,$dat0
	rev	w14,w14
	vmov.32	${dat3}[3],w13
	sub	$len,$len,#2		// bias
	vmov.32	${dat4}[3],w14
	add	$ctr,$ctr,#2
	b	.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ctr32

	mov	$key_,$key
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	w13,$ctr,#4
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	add	w14,$ctr,#5
	rev	$tctr0,$tctr0
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,q12
	aesmc	$dat4,$dat4
	rev	w13,w13
	rev	w14,w14

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q14
	aesmc	$dat4,$dat4
	vld1.8	{$in4},[$inp],#16

	aese	$dat0,q15
	veor	$in0,$in0,$rndlast
	aese	$dat1,q15
	veor	$in1,$in1,$rndlast
	aese	$dat2,q15
	veor	$in2,$in2,$rndlast
	aese	$dat3,q15
	veor	$in3,$in3,$rndlast
	aese	$dat4,q15
	veor	$in4,$in4,$rndlast

	veor	$in0,$in0,$dat0
	vorr	$dat0,$ivec,$ivec
	veor	$in1,$in1,$dat1
	vorr	$dat1,$ivec,$ivec
	veor	$in2,$in2,$dat2
	vorr	$dat2,$ivec,$ivec
	veor	$in3,$in3,$dat3
	vorr	$dat3,$ivec,$ivec
	veor	$in4,$in4,$dat4
	vorr	$dat4,$ivec,$ivec

	vst1.8	{$in0},[$out],#16
	vmov.32	${dat0}[3],$tctr0
	vst1.8	{$in1},[$out],#16
	vmov.32	${dat1}[3],$tctr1
	vst1.8	{$in2},[$out],#16
	vmov.32	${dat2}[3],$tctr2
	vst1.8	{$in3},[$out],#16
	vmov.32	${dat3}[3],w13
	vst1.8	{$in4},[$out],#16
	vmov.32	${dat4}[3],w14

	mov	$cnt,$rounds
	cbz	$len,.Lctr32_done

	add	$ctr,$ctr,#5
	subs	$len,$len,#5
	b.hs	.Loop5x_ctr32

	add	$len,$len,#5
	sub	$ctr,$ctr,#5

	cmp	$len,#2
	mov	$step,#16
	cclr	$step,lo
	b.ls	.Lctr32_tail

	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
___
$code.=<<___;
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	rev	$tctr0,$tctr0
___
$code.=<<___;
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___;
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
___
$code.=<<___;
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___;
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Performance in cycles per byte for AES-XTS with different key sizes,
# before/after this optimization:
#
#		AES-128-XTS		AES-256-XTS
# Cortex-A57	3.36/1.09		4.02/1.37
# Cortex-A72	3.03/1.02		3.28/1.33

# The optimization is implemented with loop unrolling and interleaving.
# The unrolling factor is 5; if the input is smaller than 5 blocks but
# not smaller than 3 blocks, an unrolling factor of 3 is used instead.
# If the input size dsize >= 5*16 bytes, each iteration processes 5
# blocks and the remaining size lsize is reduced by 5*16 per loop.
# Once lsize < 5*16 bytes, the remainder is treated as the tail. Note:
# a leftover of exactly 4*16 bytes is handled specially, folded back
# into the 5*16-byte loop for efficiency.
# One special case: if the original input size dsize = 16 bytes, it is
# handled separately for performance, by an independent code block with
# no LR/FP load and store.
# Encryption processes the first (length - tailcnt) bytes as described
# above, then encrypts the composite block as the second-to-last
# ciphertext block.
# Decryption processes all but the tail and the last full block as
# described above, then decrypts the second-to-last ciphertext block to
# recover the tail plaintext, and decrypts the composite block as the
# second-to-last plaintext block.
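# The tweak chaining that the XTS code below interleaves with the AES
# rounds (the recurring extr/and/eor/fmov groups on $ivl/$ivh) is a
# multiplication of the 128-bit tweak by x in GF(2^128) with the XTS
# polynomial constant 0x87. Below is a hedged Perl reference model of
# one such step; it is never called by this script and assumes a 64-bit
# Perl, with the tweak held as two little-endian 64-bit halves, like
# $ivl/$ivh:
sub _xts_next_tweak_ref {
	my ($lo, $hi) = @_;
	my $carry  = ($hi >> 63) & 1;		# bit shifted out of the top
	my $new_hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
	my $new_lo = (($lo << 1) ^ ($carry ? 0x87 : 0)) & 0xffffffffffffffff;
	return ($new_lo, $new_hi);
}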
{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($tmpin)=("v26.16b");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7		last round key
# q10-q15, q7	last 7 round keys
# q8-q9		preloaded round keys except the last 7 keys, for big sizes
# q20, q21, q8-q9	preloaded round keys except the last 7 keys, for the 16-byte-only case


my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_encrypt
.type	${prefix}_xts_encrypt,%function
.align	5
${prefix}_xts_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp	$len,#16
	// If the input size is bigger than 16, jump to the big-size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_enc_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_enc_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aese	$dat0,q20
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aese	$dat0,q21
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$dat0,$iv0
	vst1.8	{$dat0},[$out]
	b	.Lxts_enc_final_abort

.align	4
.Lxts_enc_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	// tailcnt stores the tail length (length % 16).
	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_abort
	csel	$step,xzr,$step,eq

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl - iv(low), $ivh - iv(high)
	// the five ivs stored into $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// next starting point
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key1,#32
	mov	$rounds,$rounds0

	// Encryption
.Lxts_enc:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_enc_tail
	veor	$dat,$dat,$iv0			// before encryption, xor with iv
	veor	$dat2,$dat2,$iv1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh


	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// the third block
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	vld1.8	{$dat3},[$inp],#16
	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_enc

.align	4
.Loop5x_xts_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_xts_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6/w6; "gt" is not a typo
	mov	$key_,$key1

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with the last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aese	$dat0,q15
	// The iv for the first block of the next iteration
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_enc_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_enc


	// If 4 blocks are left, borrow the five-block processing path.
	cmn	$len,#0x10
	b.ne	.Loop5x_enc_after
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_enc_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_enc_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_enc_tail

.align	4
.Lxts_enc_tail4x:
	add	$inp,$inp,#16
	veor	$tmp1,$dat1,$tmp1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b	.Lxts_enc_done
.align	4
.Lxts_outer_enc_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_enc_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	//mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6/w6 is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset
	mov	$key_,$key1

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	add	$rounds,$rounds0,#2
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	cmn	$len,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_enc_tail:
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_enc_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_enc_tail_loop:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_enc_tail_loop

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lxts_enc_one
	veor	$tmp1,$tmp1,$dat1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv1,$iv1
	vst1.8	{$tmp2},[$out],#16
	fmov	$ivl,$ivd10
	fmov	$ivh,$ivd11
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done

.Lxts_enc_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv0,$iv0
	vst1.8	{$tmp1},[$out],#16
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	$tailcnt,#0xf
	b.eq	.Lxts_abort

	mov	$tmpinp,$inp
	mov	$tmpoutp,$out
	sub	$out,$out,#16
.composite_enc_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Encrypt the composite block to get the second-to-last ciphertext block
	ldr	$rounds,[$key1,#240]		// load key schedule...
	vld1.32	{$dat},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
.Loop_final_enc:
	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16
	subs	$rounds,$rounds,#2
	aese	$tmpin,$dat1
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16
	b.gt	.Loop_final_enc

	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aese	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
___

}}}
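# For reference, the ciphertext stealing performed by .composite_enc_loop
# above swaps bytes between the last full ciphertext block and the
# partial tail: the displaced ciphertext bytes become the short final
# block, and the tail plaintext plus the remaining ciphertext bytes form
# the composite block that is re-encrypted as the second-to-last block.
# A hedged Perl sketch of that byte shuffle, never called by this script:
sub _xts_steal_ref {
	my ($prev_ct, $tail_pt) = @_;	# 16-byte block, then 1..15 tail bytes
	my $t = length($tail_pt);
	my $short_ct  = substr($prev_ct, 0, $t);		# emitted last, truncated
	my $composite = $tail_pt . substr($prev_ct, $t);	# re-encrypted in place of
	return ($composite, $short_ct);				# the second-to-last block
}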
{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7		last round key
# q10-q15, q7	last 7 round keys
# q8-q9		preloaded round keys except the last 7 keys, for big sizes
# q20, q21, q8-q9	preloaded round keys except the last 7 keys, for the 16-byte-only case

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#16
	// If the input size is bigger than 16, jump to the big-size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl - iv(low), $ivh - iv(high)
	// the five ivs stored into $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decryption, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// third block xor'ed with third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec

.align	4
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6/w6; "gt" is not a typo
	mov	$key_,$key1

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with the last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration.
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec

	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If x2 ($len) equals -0x10, 4 blocks are left.
	// After this special handling, reuse the five-block processing path.
	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	tst	$tailcnt,#0xf
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b.eq	.Lxts_dec_abort
	vld1.8	{$dat0},[$inp],#16
	b	.Lxts_done
.align	4
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6/w6 is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data

	mov	$key_,$key1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32

.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Process the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	vld1.8	{$dat0},[$inp],#16

	// Decrypt the second-to-last ciphertext block to get the last plaintext block
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16

	// Merge the tailcnt bytes of the unaligned tail block into the
	// second-to-last plaintext block to form the composite block,
	// which stands in for the last encrypted block.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the second-to-last plaintext block
	ldr	$rounds,[$key_,#240]
	vld1.32	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
}
}}}
$code.=<<___;
#endif
___
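# The EOR3 (SHA3) three-way XOR used by the unroll12 CTR path is emitted
# as a raw .inst word by unsha3() below, since not every assembler
# accepts the mnemonic. Worked example (illustration only):
# "eor3 v0.16b,v1.16b,v2.16b,v3.16b" encodes as
# 0xce000000 | 0 | (1<<5) | (2<<16) | (3<<10) = 0xce020c20.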
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=> 0x4e285800,	"aese"	=> 0x4e284800,
	"aesimc"=> 0x4e287800,	"aesmc"	=> 0x4e286800,
	"eor3"	=> 0xce000000,	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
		$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
		$mnemonic,$arg;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo;	# old->new registers
	s/\bq_([0-9]+)\b/"q".$1/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;
	s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=> 0xf3b00340,	"aese"	=> 0xf3b00300,
	"aesimc"=> 0xf3b003c0,	"aesmc"	=> 0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian.
	    # The correct solution is to use the .inst directive, but
	    # older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";