#! /usr/bin/env perl
# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It also supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# April 2019
#
# Key to performance of parallelize-able modes is round instruction
# interleaving. But which factor to use? There is an optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing the interleave factor doesn't pay off. On the cons
# side we have code size increase and resource waste on platforms for
# which the interleave factor is too high.
# In other words you want it to
# be just right. So far an interleave factor of 3x was serving well all
# platforms. But for ThunderX2 the optimal interleave factor was measured
# to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator next to this script or in the
# shared perlasm directory; it turns the flavour-neutral mnemonics
# below into real 32- or 64-bit assembler.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator; STDOUT is rebound so
# that plain print/$code emission below reaches the .s output file.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";

# Windows (armasm) spells the byte-emitting directive differently.
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit gets the crypto extension enabled directly; 32-bit needs the
# INST() macro so hand-encoded AES opcodes work in both ARM and Thumb-2.
$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64.
# Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# --- Key schedule generators: ${prefix}_set_encrypt_key and
# --- ${prefix}_set_decrypt_key.
# Register/vector aliases; on 32-bit we avoid q4-q7 (callee-saved d8-d15).
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


#
# This file generates .s file for 64-bit and 32-bit CPUs.
# We don't implement .rodata on 32-bit CPUs yet.
#
$code.=".rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
___
$code.=".previous\n" if ($flavour =~ /64/);

$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument validation: NULL pointers return -1, unsupported bit length
# (outside 128..256 or not a multiple of 64) returns -2.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
# Materialize the address of the round-constant table; adrp/:lo12: on
# 64-bit (PIC-friendly), plain adr on 32-bit.
$code.=<<___ if ($flavour =~ /64/);
	adrp	$ptr,.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___ if ($flavour =~ /32/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Decrypt schedule: run the encrypt schedule, then swap round keys
# end-for-end applying AESIMC (InvMixColumns) to the interior ones.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# --- Single-block generators: emits both ${prefix}_encrypt and
# --- ${prefix}_decrypt from one template; $e/$mc pick the AES
# --- round/mix mnemonic pair (aese/aesmc vs aesd/aesimc).
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

# Performance in cycles per byte.
# Processed with AES-ECB different key size.
# It shows the value before and after optimization as below:
# (before/after):
#
#		AES-128-ECB		AES-192-ECB		AES-256-ECB
# Cortex-A57	1.85/0.82		2.16/0.96		2.47/1.10
# Cortex-A72	1.64/0.85		1.82/0.99		2.13/1.14

# Optimization is implemented by loop unrolling and interleaving.
# Commonly, we choose the unrolling factor as 5, if the input
# data size smaller than 5 blocks, but not smaller than 3 blocks,
# choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks
# as one iteration, every loop the left size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
# every loop lsize -=3*16.
# If lsize < 3*16 bytes, treat them as the tail, interleave the
# two blocks AES instructions.
# There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
# performance: one independent code block without LR, FP load and
# store, just looks like what the original ECB implementation does.

{{{
# --- ${prefix}_ecb_encrypt generator: one routine handles both
# --- encryption and decryption ($enc selects), with a frameless
# --- single-block fast path (64-bit only) and 5x/3x interleaved loops.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
# 64-bit only: frameless single-block fast path (dsize == 16).
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32	// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16	// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16	// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32	// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10	// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16	// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16	// load key schedule...
	subs	$rounds,$rounds,#2	// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32	// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32	// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32	// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}			@ ABI specification says so
	ldmia	ip,{r4-r5}			@ load remaining args
	subs	$len,$len,#16
___
# Common setup: preload first two and last seven round keys.
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved encrypt loop.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x-interleaved encrypt loop plus 1/2-block tail.
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

# Decrypt side mirrors the encrypt structure with aesd/aesimc.
$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# 64-bit only: 5x-interleaved decrypt loop.
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# Both flavours: 3x-interleaved decrypt loop plus 1/2-block tail.
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
# --- ${prefix}_cbc_encrypt generator (continues below this chunk).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
1258.type ${prefix}_cbc_encrypt,%function 1259.align 5 1260${prefix}_cbc_encrypt: 1261___ 1262$code.=<<___ if ($flavour =~ /64/); 1263 AARCH64_VALID_CALL_TARGET 1264 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1265 stp x29,x30,[sp,#-16]! 1266 add x29,sp,#0 1267___ 1268$code.=<<___ if ($flavour !~ /64/); 1269 mov ip,sp 1270 stmdb sp!,{r4-r8,lr} 1271 vstmdb sp!,{d8-d15} @ ABI specification says so 1272 ldmia ip,{r4-r5} @ load remaining args 1273___ 1274$code.=<<___; 1275 subs $len,$len,#16 1276 mov $step,#16 1277 b.lo .Lcbc_abort 1278 cclr $step,eq 1279 1280 cmp $enc,#0 // en- or decrypting? 1281 ldr $rounds,[$key,#240] 1282 and $len,$len,#-16 1283 vld1.8 {$ivec},[$ivp] 1284 vld1.8 {$dat},[$inp],$step 1285 1286 vld1.32 {q8-q9},[$key] // load key schedule... 1287 sub $rounds,$rounds,#6 1288 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 1289 sub $rounds,$rounds,#2 1290 vld1.32 {q10-q11},[$key_],#32 1291 vld1.32 {q12-q13},[$key_],#32 1292 vld1.32 {q14-q15},[$key_],#32 1293 vld1.32 {$rndlast},[$key_] 1294 1295 add $key_,$key,#32 1296 mov $cnt,$rounds 1297 b.eq .Lcbc_dec 1298 1299 cmp $rounds,#2 1300 veor $dat,$dat,$ivec 1301 veor $rndzero_n_last,q8,$rndlast 1302 b.eq .Lcbc_enc128 1303 1304 vld1.32 {$in0-$in1},[$key_] 1305 add $key_,$key,#16 1306 add $key4,$key,#16*4 1307 add $key5,$key,#16*5 1308 aese $dat,q8 1309 aesmc $dat,$dat 1310 add $key6,$key,#16*6 1311 add $key7,$key,#16*7 1312 b .Lenter_cbc_enc 1313 1314.align 4 1315.Loop_cbc_enc: 1316 aese $dat,q8 1317 aesmc $dat,$dat 1318 vst1.8 {$ivec},[$out],#16 1319.Lenter_cbc_enc: 1320 aese $dat,q9 1321 aesmc $dat,$dat 1322 aese $dat,$in0 1323 aesmc $dat,$dat 1324 vld1.32 {q8},[$key4] 1325 cmp $rounds,#4 1326 aese $dat,$in1 1327 aesmc $dat,$dat 1328 vld1.32 {q9},[$key5] 1329 b.eq .Lcbc_enc192 1330 1331 aese $dat,q8 1332 aesmc $dat,$dat 1333 vld1.32 {q8},[$key6] 1334 aese $dat,q9 1335 aesmc $dat,$dat 1336 vld1.32 {q9},[$key7] 1337 nop 1338 1339.Lcbc_enc192: 1340 aese $dat,q8 1341 
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
# Decrypt path uses extra data registers for interleaving; in 64-bit mode
# remap them onto q16-q23 so 5x interleave fits without spilling.
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

# CBC decrypt setup: prime 2-3 blocks and keep the previous ciphertexts
# ($in*) around, since each plaintext is XORed with the prior ciphertext.
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
1430$code.=<<___ if ($flavour =~ /64/); 1431 cmp $len,#32 1432 b.lo .Loop3x_cbc_dec 1433 1434 vld1.8 {$dat3},[$inp],#16 1435 vld1.8 {$dat4},[$inp],#16 1436 sub $len,$len,#32 // bias 1437 mov $cnt,$rounds 1438 vorr $in3,$dat3,$dat3 1439 vorr $in4,$dat4,$dat4 1440 1441.Loop5x_cbc_dec: 1442 aesd $dat0,q8 1443 aesimc $dat0,$dat0 1444 aesd $dat1,q8 1445 aesimc $dat1,$dat1 1446 aesd $dat2,q8 1447 aesimc $dat2,$dat2 1448 aesd $dat3,q8 1449 aesimc $dat3,$dat3 1450 aesd $dat4,q8 1451 aesimc $dat4,$dat4 1452 vld1.32 {q8},[$key_],#16 1453 subs $cnt,$cnt,#2 1454 aesd $dat0,q9 1455 aesimc $dat0,$dat0 1456 aesd $dat1,q9 1457 aesimc $dat1,$dat1 1458 aesd $dat2,q9 1459 aesimc $dat2,$dat2 1460 aesd $dat3,q9 1461 aesimc $dat3,$dat3 1462 aesd $dat4,q9 1463 aesimc $dat4,$dat4 1464 vld1.32 {q9},[$key_],#16 1465 b.gt .Loop5x_cbc_dec 1466 1467 aesd $dat0,q8 1468 aesimc $dat0,$dat0 1469 aesd $dat1,q8 1470 aesimc $dat1,$dat1 1471 aesd $dat2,q8 1472 aesimc $dat2,$dat2 1473 aesd $dat3,q8 1474 aesimc $dat3,$dat3 1475 aesd $dat4,q8 1476 aesimc $dat4,$dat4 1477 cmp $len,#0x40 // because .Lcbc_tail4x 1478 sub $len,$len,#0x50 1479 1480 aesd $dat0,q9 1481 aesimc $dat0,$dat0 1482 aesd $dat1,q9 1483 aesimc $dat1,$dat1 1484 aesd $dat2,q9 1485 aesimc $dat2,$dat2 1486 aesd $dat3,q9 1487 aesimc $dat3,$dat3 1488 aesd $dat4,q9 1489 aesimc $dat4,$dat4 1490 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo 1491 mov $key_,$key 1492 1493 aesd $dat0,q10 1494 aesimc $dat0,$dat0 1495 aesd $dat1,q10 1496 aesimc $dat1,$dat1 1497 aesd $dat2,q10 1498 aesimc $dat2,$dat2 1499 aesd $dat3,q10 1500 aesimc $dat3,$dat3 1501 aesd $dat4,q10 1502 aesimc $dat4,$dat4 1503 add $inp,$inp,x6 // $inp is adjusted in such way that 1504 // at exit from the loop $dat1-$dat4 1505 // are loaded with last "words" 1506 add x6,$len,#0x60 // because .Lcbc_tail4x 1507 1508 aesd $dat0,q11 1509 aesimc $dat0,$dat0 1510 aesd $dat1,q11 1511 aesimc $dat1,$dat1 1512 aesd $dat2,q11 1513 aesimc $dat2,$dat2 1514 aesd $dat3,q11 1515 aesimc 
$dat3,$dat3 1516 aesd $dat4,q11 1517 aesimc $dat4,$dat4 1518 1519 aesd $dat0,q12 1520 aesimc $dat0,$dat0 1521 aesd $dat1,q12 1522 aesimc $dat1,$dat1 1523 aesd $dat2,q12 1524 aesimc $dat2,$dat2 1525 aesd $dat3,q12 1526 aesimc $dat3,$dat3 1527 aesd $dat4,q12 1528 aesimc $dat4,$dat4 1529 1530 aesd $dat0,q13 1531 aesimc $dat0,$dat0 1532 aesd $dat1,q13 1533 aesimc $dat1,$dat1 1534 aesd $dat2,q13 1535 aesimc $dat2,$dat2 1536 aesd $dat3,q13 1537 aesimc $dat3,$dat3 1538 aesd $dat4,q13 1539 aesimc $dat4,$dat4 1540 1541 aesd $dat0,q14 1542 aesimc $dat0,$dat0 1543 aesd $dat1,q14 1544 aesimc $dat1,$dat1 1545 aesd $dat2,q14 1546 aesimc $dat2,$dat2 1547 aesd $dat3,q14 1548 aesimc $dat3,$dat3 1549 aesd $dat4,q14 1550 aesimc $dat4,$dat4 1551 1552 veor $tmp0,$ivec,$rndlast 1553 aesd $dat0,q15 1554 veor $tmp1,$in0,$rndlast 1555 vld1.8 {$in0},[$inp],#16 1556 aesd $dat1,q15 1557 veor $tmp2,$in1,$rndlast 1558 vld1.8 {$in1},[$inp],#16 1559 aesd $dat2,q15 1560 veor $tmp3,$in2,$rndlast 1561 vld1.8 {$in2},[$inp],#16 1562 aesd $dat3,q15 1563 veor $tmp4,$in3,$rndlast 1564 vld1.8 {$in3},[$inp],#16 1565 aesd $dat4,q15 1566 vorr $ivec,$in4,$in4 1567 vld1.8 {$in4},[$inp],#16 1568 cbz x6,.Lcbc_tail4x 1569 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1570 veor $tmp0,$tmp0,$dat0 1571 vorr $dat0,$in0,$in0 1572 veor $tmp1,$tmp1,$dat1 1573 vorr $dat1,$in1,$in1 1574 veor $tmp2,$tmp2,$dat2 1575 vorr $dat2,$in2,$in2 1576 veor $tmp3,$tmp3,$dat3 1577 vorr $dat3,$in3,$in3 1578 veor $tmp4,$tmp4,$dat4 1579 vst1.8 {$tmp0},[$out],#16 1580 vorr $dat4,$in4,$in4 1581 vst1.8 {$tmp1},[$out],#16 1582 mov $cnt,$rounds 1583 vst1.8 {$tmp2},[$out],#16 1584 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1585 vst1.8 {$tmp3},[$out],#16 1586 vst1.8 {$tmp4},[$out],#16 1587 b.hs .Loop5x_cbc_dec 1588 1589 add $len,$len,#0x50 1590 cbz $len,.Lcbc_done 1591 1592 add $cnt,$rounds,#2 1593 subs $len,$len,#0x30 1594 vorr $dat0,$in2,$in2 1595 vorr $in0,$in2,$in2 1596 vorr $dat1,$in3,$in3 1597 vorr $in1,$in3,$in3 1598 vorr 
$dat2,$in4,$in4 1599 vorr $in2,$in4,$in4 1600 b.lo .Lcbc_dec_tail 1601 1602 b .Loop3x_cbc_dec 1603 1604.align 4 1605.Lcbc_tail4x: 1606 veor $tmp1,$tmp0,$dat1 1607 veor $tmp2,$tmp2,$dat2 1608 veor $tmp3,$tmp3,$dat3 1609 veor $tmp4,$tmp4,$dat4 1610 vst1.8 {$tmp1},[$out],#16 1611 vst1.8 {$tmp2},[$out],#16 1612 vst1.8 {$tmp3},[$out],#16 1613 vst1.8 {$tmp4},[$out],#16 1614 1615 b .Lcbc_done 1616.align 4 1617___ 1618$code.=<<___; 1619.Loop3x_cbc_dec: 1620 aesd $dat0,q8 1621 aesimc $dat0,$dat0 1622 aesd $dat1,q8 1623 aesimc $dat1,$dat1 1624 aesd $dat2,q8 1625 aesimc $dat2,$dat2 1626 vld1.32 {q8},[$key_],#16 1627 subs $cnt,$cnt,#2 1628 aesd $dat0,q9 1629 aesimc $dat0,$dat0 1630 aesd $dat1,q9 1631 aesimc $dat1,$dat1 1632 aesd $dat2,q9 1633 aesimc $dat2,$dat2 1634 vld1.32 {q9},[$key_],#16 1635 b.gt .Loop3x_cbc_dec 1636 1637 aesd $dat0,q8 1638 aesimc $dat0,$dat0 1639 aesd $dat1,q8 1640 aesimc $dat1,$dat1 1641 aesd $dat2,q8 1642 aesimc $dat2,$dat2 1643 veor $tmp0,$ivec,$rndlast 1644 subs $len,$len,#0x30 1645 veor $tmp1,$in0,$rndlast 1646 mov.lo x6,$len // x6, $cnt, is zero at this point 1647 aesd $dat0,q9 1648 aesimc $dat0,$dat0 1649 aesd $dat1,q9 1650 aesimc $dat1,$dat1 1651 aesd $dat2,q9 1652 aesimc $dat2,$dat2 1653 veor $tmp2,$in1,$rndlast 1654 add $inp,$inp,x6 // $inp is adjusted in such way that 1655 // at exit from the loop $dat1-$dat2 1656 // are loaded with last "words" 1657 vorr $ivec,$in2,$in2 1658 mov $key_,$key 1659 aesd $dat0,q12 1660 aesimc $dat0,$dat0 1661 aesd $dat1,q12 1662 aesimc $dat1,$dat1 1663 aesd $dat2,q12 1664 aesimc $dat2,$dat2 1665 vld1.8 {$in0},[$inp],#16 1666 aesd $dat0,q13 1667 aesimc $dat0,$dat0 1668 aesd $dat1,q13 1669 aesimc $dat1,$dat1 1670 aesd $dat2,q13 1671 aesimc $dat2,$dat2 1672 vld1.8 {$in1},[$inp],#16 1673 aesd $dat0,q14 1674 aesimc $dat0,$dat0 1675 aesd $dat1,q14 1676 aesimc $dat1,$dat1 1677 aesd $dat2,q14 1678 aesimc $dat2,$dat2 1679 vld1.8 {$in2},[$inp],#16 1680 aesd $dat0,q15 1681 aesd $dat1,q15 1682 aesd $dat2,q15 1683 vld1.32 
{q8},[$key_],#16 // re-pre-load rndkey[0] 1684 add $cnt,$rounds,#2 1685 veor $tmp0,$tmp0,$dat0 1686 veor $tmp1,$tmp1,$dat1 1687 veor $dat2,$dat2,$tmp2 1688 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1689 vst1.8 {$tmp0},[$out],#16 1690 vorr $dat0,$in0,$in0 1691 vst1.8 {$tmp1},[$out],#16 1692 vorr $dat1,$in1,$in1 1693 vst1.8 {$dat2},[$out],#16 1694 vorr $dat2,$in2,$in2 1695 b.hs .Loop3x_cbc_dec 1696 1697 cmn $len,#0x30 1698 b.eq .Lcbc_done 1699 nop 1700 1701.Lcbc_dec_tail: 1702 aesd $dat1,q8 1703 aesimc $dat1,$dat1 1704 aesd $dat2,q8 1705 aesimc $dat2,$dat2 1706 vld1.32 {q8},[$key_],#16 1707 subs $cnt,$cnt,#2 1708 aesd $dat1,q9 1709 aesimc $dat1,$dat1 1710 aesd $dat2,q9 1711 aesimc $dat2,$dat2 1712 vld1.32 {q9},[$key_],#16 1713 b.gt .Lcbc_dec_tail 1714 1715 aesd $dat1,q8 1716 aesimc $dat1,$dat1 1717 aesd $dat2,q8 1718 aesimc $dat2,$dat2 1719 aesd $dat1,q9 1720 aesimc $dat1,$dat1 1721 aesd $dat2,q9 1722 aesimc $dat2,$dat2 1723 aesd $dat1,q12 1724 aesimc $dat1,$dat1 1725 aesd $dat2,q12 1726 aesimc $dat2,$dat2 1727 cmn $len,#0x20 1728 aesd $dat1,q13 1729 aesimc $dat1,$dat1 1730 aesd $dat2,q13 1731 aesimc $dat2,$dat2 1732 veor $tmp1,$ivec,$rndlast 1733 aesd $dat1,q14 1734 aesimc $dat1,$dat1 1735 aesd $dat2,q14 1736 aesimc $dat2,$dat2 1737 veor $tmp2,$in1,$rndlast 1738 aesd $dat1,q15 1739 aesd $dat2,q15 1740 b.eq .Lcbc_dec_one 1741 veor $tmp1,$tmp1,$dat1 1742 veor $tmp2,$tmp2,$dat2 1743 vorr $ivec,$in2,$in2 1744 vst1.8 {$tmp1},[$out],#16 1745 vst1.8 {$tmp2},[$out],#16 1746 b .Lcbc_done 1747 1748.Lcbc_dec_one: 1749 veor $tmp1,$tmp1,$dat2 1750 vorr $ivec,$in2,$in2 1751 vst1.8 {$tmp1},[$out],#16 1752 1753.Lcbc_done: 1754 vst1.8 {$ivec},[$ivp] 1755.Lcbc_abort: 1756___ 1757} 1758$code.=<<___ if ($flavour !~ /64/); 1759 vldmia sp!,{d8-d15} 1760 ldmia sp!,{r4-r8,pc} 1761___ 1762$code.=<<___ if ($flavour =~ /64/); 1763 ldr x29,[sp],#16 1764 ret 1765___ 1766$code.=<<___; 1767.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt 1768___ 1769}}} 1770 1771{{{ 1772my 
($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); 1773my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7"); 1774my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); 1775my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15)); 1776my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23)); 1777 1778# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15 1779my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3)); 1780my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9)); 1781my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15)); 1782my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21)); 1783my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27)); 1784my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27)); 1785 1786#q_X => qX, for ldp & stp 1787my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7)); 1788my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23)); 1789 1790my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11)); 1791 1792$code.=<<___ if ($flavour =~ /64/); 1793.globl ${prefix}_ctr32_encrypt_blocks_unroll12_eor3 1794.type ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function 1795.align 5 1796${prefix}_ctr32_encrypt_blocks_unroll12_eor3: 1797 AARCH64_VALID_CALL_TARGET 1798 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1799 stp x29,x30,[sp,#-80]! 1800 stp d8,d9,[sp, #16] 1801 stp d10,d11,[sp, #32] 1802 stp d12,d13,[sp, #48] 1803 stp d14,d15,[sp, #64] 1804 add x29,sp,#0 1805 1806 ldr $rounds,[$key,#240] 1807 1808 ldr $ctr, [$ivp, #12] 1809#ifdef __AARCH64EB__ 1810 vld1.8 {$dat0},[$ivp] 1811#else 1812 vld1.32 {$dat0},[$ivp] 1813#endif 1814 vld1.32 {$rndping-$rndpang},[$key] // load key schedule... 
	sub	$rounds,$rounds,#4
	cmp	$len,#2
	add	$key_,$key,$roundsx,lsl#4	// pointer to last round key
	sub	$rounds,$rounds,#2
	add	$key_, $key_, #64
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
#ifndef __AARCH64EB__
	rev	$ctr, $ctr
#endif

	// Prime counter blocks: copy IV block and patch the big-endian
	// 32-bit counter into lane [3] of each successive block.
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail_unroll	// <=2 blocks: no interleaving
	cmp	$len,#6
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b.lo	.Loop3x_ctr32_unroll
	cmp	$len,#9
	vorr	$dat3,$dat0,$dat0
	add	$tctr3, $ctr, #1
	vorr	$dat4,$dat0,$dat0
	add	$tctr4, $ctr, #2
	rev	$tctr3, $tctr3
	vorr	$dat5,$dat0,$dat0
	add	$ctr, $ctr, #3
	rev	$tctr4, $tctr4
	vmov.32	${dat3}[3],$tctr3
	rev	$tctr5, $ctr
	vmov.32	${dat4}[3],$tctr4
	vmov.32	${dat5}[3],$tctr5
	sub	$len,$len,#3
	b.lo	.Loop6x_ctr32_unroll

	// push regs to stack when 12 data chunks are interleaved
	stp	x19,x20,[sp,#-16]!
	stp	x21,x22,[sp,#-16]!
	stp	x23,x24,[sp,#-16]!
	stp	$dat8d,$dat9d,[sp,#-32]!
	stp	$dat10d,$dat11d,[sp,#-32]!
1862 1863 add $tctr6,$ctr,#1 1864 add $tctr7,$ctr,#2 1865 add $tctr8,$ctr,#3 1866 add $tctr9,$ctr,#4 1867 add $tctr10,$ctr,#5 1868 add $ctr,$ctr,#6 1869 vorr $dat6,$dat0,$dat0 1870 rev $tctr6,$tctr6 1871 vorr $dat7,$dat0,$dat0 1872 rev $tctr7,$tctr7 1873 vorr $dat8,$dat0,$dat0 1874 rev $tctr8,$tctr8 1875 vorr $dat9,$dat0,$dat0 1876 rev $tctr9,$tctr9 1877 vorr $dat10,$dat0,$dat0 1878 rev $tctr10,$tctr10 1879 vorr $dat11,$dat0,$dat0 1880 rev $tctr11,$ctr 1881 1882 sub $len,$len,#6 // bias 1883 vmov.32 ${dat6}[3],$tctr6 1884 vmov.32 ${dat7}[3],$tctr7 1885 vmov.32 ${dat8}[3],$tctr8 1886 vmov.32 ${dat9}[3],$tctr9 1887 vmov.32 ${dat10}[3],$tctr10 1888 vmov.32 ${dat11}[3],$tctr11 1889 b .Loop12x_ctr32_unroll 1890 1891.align 4 1892.Loop12x_ctr32_unroll: 1893 aese $dat0,$rndping 1894 aesmc $dat0,$dat0 1895 aese $dat1,$rndping 1896 aesmc $dat1,$dat1 1897 aese $dat2,$rndping 1898 aesmc $dat2,$dat2 1899 aese $dat3,$rndping 1900 aesmc $dat3,$dat3 1901 aese $dat4,$rndping 1902 aesmc $dat4,$dat4 1903 aese $dat5,$rndping 1904 aesmc $dat5,$dat5 1905 aese $dat6,$rndping 1906 aesmc $dat6,$dat6 1907 aese $dat7,$rndping 1908 aesmc $dat7,$dat7 1909 aese $dat8,$rndping 1910 aesmc $dat8,$dat8 1911 aese $dat9,$rndping 1912 aesmc $dat9,$dat9 1913 aese $dat10,$rndping 1914 aesmc $dat10,$dat10 1915 aese $dat11,$rndping 1916 aesmc $dat11,$dat11 1917 vld1.32 {$rndping},[$key_],#16 1918 subs $cnt,$cnt,#2 1919 aese $dat0,$rndpang 1920 aesmc $dat0,$dat0 1921 aese $dat1,$rndpang 1922 aesmc $dat1,$dat1 1923 aese $dat2,$rndpang 1924 aesmc $dat2,$dat2 1925 aese $dat3,$rndpang 1926 aesmc $dat3,$dat3 1927 aese $dat4,$rndpang 1928 aesmc $dat4,$dat4 1929 aese $dat5,$rndpang 1930 aesmc $dat5,$dat5 1931 aese $dat6,$rndpang 1932 aesmc $dat6,$dat6 1933 aese $dat7,$rndpang 1934 aesmc $dat7,$dat7 1935 aese $dat8,$rndpang 1936 aesmc $dat8,$dat8 1937 aese $dat9,$rndpang 1938 aesmc $dat9,$dat9 1939 aese $dat10,$rndpang 1940 aesmc $dat10,$dat10 1941 aese $dat11,$rndpang 1942 aesmc $dat11,$dat11 1943 vld1.32 
{$rndpang},[$key_],#16 1944 b.gt .Loop12x_ctr32_unroll 1945 1946 aese $dat0,$rndping 1947 aesmc $dat0,$dat0 1948 aese $dat1,$rndping 1949 aesmc $dat1,$dat1 1950 aese $dat2,$rndping 1951 aesmc $dat2,$dat2 1952 aese $dat3,$rndping 1953 aesmc $dat3,$dat3 1954 aese $dat4,$rndping 1955 aesmc $dat4,$dat4 1956 aese $dat5,$rndping 1957 aesmc $dat5,$dat5 1958 aese $dat6,$rndping 1959 aesmc $dat6,$dat6 1960 aese $dat7,$rndping 1961 aesmc $dat7,$dat7 1962 aese $dat8,$rndping 1963 aesmc $dat8,$dat8 1964 aese $dat9,$rndping 1965 aesmc $dat9,$dat9 1966 aese $dat10,$rndping 1967 aesmc $dat10,$dat10 1968 aese $dat11,$rndping 1969 aesmc $dat11,$dat11 1970 vld1.32 {$rndping},[$key_],#16 1971 1972 aese $dat0,$rndpang 1973 aesmc $dat0,$dat0 1974 aese $dat1,$rndpang 1975 aesmc $dat1,$dat1 1976 aese $dat2,$rndpang 1977 aesmc $dat2,$dat2 1978 aese $dat3,$rndpang 1979 aesmc $dat3,$dat3 1980 aese $dat4,$rndpang 1981 aesmc $dat4,$dat4 1982 aese $dat5,$rndpang 1983 aesmc $dat5,$dat5 1984 aese $dat6,$rndpang 1985 aesmc $dat6,$dat6 1986 aese $dat7,$rndpang 1987 aesmc $dat7,$dat7 1988 aese $dat8,$rndpang 1989 aesmc $dat8,$dat8 1990 aese $dat9,$rndpang 1991 aesmc $dat9,$dat9 1992 aese $dat10,$rndpang 1993 aesmc $dat10,$dat10 1994 aese $dat11,$rndpang 1995 aesmc $dat11,$dat11 1996 vld1.32 {$rndpang},[$key_],#16 1997 1998 aese $dat0,$rndping 1999 aesmc $dat0,$dat0 2000 add $tctr0,$ctr,#1 2001 add $tctr1,$ctr,#2 2002 aese $dat1,$rndping 2003 aesmc $dat1,$dat1 2004 add $tctr2,$ctr,#3 2005 add $tctr3,$ctr,#4 2006 aese $dat2,$rndping 2007 aesmc $dat2,$dat2 2008 add $tctr4,$ctr,#5 2009 add $tctr5,$ctr,#6 2010 rev $tctr0,$tctr0 2011 aese $dat3,$rndping 2012 aesmc $dat3,$dat3 2013 add $tctr6,$ctr,#7 2014 add $tctr7,$ctr,#8 2015 rev $tctr1,$tctr1 2016 rev $tctr2,$tctr2 2017 aese $dat4,$rndping 2018 aesmc $dat4,$dat4 2019 add $tctr8,$ctr,#9 2020 add $tctr9,$ctr,#10 2021 rev $tctr3,$tctr3 2022 rev $tctr4,$tctr4 2023 aese $dat5,$rndping 2024 aesmc $dat5,$dat5 2025 add $tctr10,$ctr,#11 2026 add 
$tctr11,$ctr,#12 2027 rev $tctr5,$tctr5 2028 rev $tctr6,$tctr6 2029 aese $dat6,$rndping 2030 aesmc $dat6,$dat6 2031 rev $tctr7,$tctr7 2032 rev $tctr8,$tctr8 2033 aese $dat7,$rndping 2034 aesmc $dat7,$dat7 2035 rev $tctr9,$tctr9 2036 rev $tctr10,$tctr10 2037 aese $dat8,$rndping 2038 aesmc $dat8,$dat8 2039 rev $tctr11,$tctr11 2040 aese $dat9,$rndping 2041 aesmc $dat9,$dat9 2042 aese $dat10,$rndping 2043 aesmc $dat10,$dat10 2044 aese $dat11,$rndping 2045 aesmc $dat11,$dat11 2046 vld1.32 {$rndping},[$key_],#16 2047 2048 aese $dat0,$rndpang 2049 aesmc $dat0,$dat0 2050 aese $dat1,$rndpang 2051 aesmc $dat1,$dat1 2052 aese $dat2,$rndpang 2053 aesmc $dat2,$dat2 2054 aese $dat3,$rndpang 2055 aesmc $dat3,$dat3 2056 vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64 2057 aese $dat4,$rndpang 2058 aesmc $dat4,$dat4 2059 aese $dat5,$rndpang 2060 aesmc $dat5,$dat5 2061 aese $dat6,$rndpang 2062 aesmc $dat6,$dat6 2063 aese $dat7,$rndpang 2064 aesmc $dat7,$dat7 2065 vld1.8 {$in4,$in5,$in6,$in7},[$inp],#64 2066 aese $dat8,$rndpang 2067 aesmc $dat8,$dat8 2068 aese $dat9,$rndpang 2069 aesmc $dat9,$dat9 2070 aese $dat10,$rndpang 2071 aesmc $dat10,$dat10 2072 aese $dat11,$rndpang 2073 aesmc $dat11,$dat11 2074 vld1.8 {$in8,$in9,$in10,$in11},[$inp],#64 2075 vld1.32 {$rndpang},[$key_],#16 2076 2077 mov $key_, $key 2078 aese $dat0,$rndping 2079 aesmc $dat0,$dat0 2080 aese $dat1,$rndping 2081 aesmc $dat1,$dat1 2082 aese $dat2,$rndping 2083 aesmc $dat2,$dat2 2084 aese $dat3,$rndping 2085 aesmc $dat3,$dat3 2086 aese $dat4,$rndping 2087 aesmc $dat4,$dat4 2088 aese $dat5,$rndping 2089 aesmc $dat5,$dat5 2090 aese $dat6,$rndping 2091 aesmc $dat6,$dat6 2092 aese $dat7,$rndping 2093 aesmc $dat7,$dat7 2094 aese $dat8,$rndping 2095 aesmc $dat8,$dat8 2096 aese $dat9,$rndping 2097 aesmc $dat9,$dat9 2098 aese $dat10,$rndping 2099 aesmc $dat10,$dat10 2100 aese $dat11,$rndping 2101 aesmc $dat11,$dat11 2102 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] 2103 2104 aese $dat0,$rndpang 2105 eor3 
$in0,$in0,$rndlast,$dat0 2106 vorr $dat0,$ivec,$ivec 2107 aese $dat1,$rndpang 2108 eor3 $in1,$in1,$rndlast,$dat1 2109 vorr $dat1,$ivec,$ivec 2110 aese $dat2,$rndpang 2111 eor3 $in2,$in2,$rndlast,$dat2 2112 vorr $dat2,$ivec,$ivec 2113 aese $dat3,$rndpang 2114 eor3 $in3,$in3,$rndlast,$dat3 2115 vorr $dat3,$ivec,$ivec 2116 aese $dat4,$rndpang 2117 eor3 $in4,$in4,$rndlast,$dat4 2118 vorr $dat4,$ivec,$ivec 2119 aese $dat5,$rndpang 2120 eor3 $in5,$in5,$rndlast,$dat5 2121 vorr $dat5,$ivec,$ivec 2122 aese $dat6,$rndpang 2123 eor3 $in6,$in6,$rndlast,$dat6 2124 vorr $dat6,$ivec,$ivec 2125 aese $dat7,$rndpang 2126 eor3 $in7,$in7,$rndlast,$dat7 2127 vorr $dat7,$ivec,$ivec 2128 aese $dat8,$rndpang 2129 eor3 $in8,$in8,$rndlast,$dat8 2130 vorr $dat8,$ivec,$ivec 2131 aese $dat9,$rndpang 2132 eor3 $in9,$in9,$rndlast,$dat9 2133 vorr $dat9,$ivec,$ivec 2134 aese $dat10,$rndpang 2135 eor3 $in10,$in10,$rndlast,$dat10 2136 vorr $dat10,$ivec,$ivec 2137 aese $dat11,$rndpang 2138 eor3 $in11,$in11,$rndlast,$dat11 2139 vorr $dat11,$ivec,$ivec 2140 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] 2141 2142 vmov.32 ${dat0}[3],$tctr0 2143 vmov.32 ${dat1}[3],$tctr1 2144 vmov.32 ${dat2}[3],$tctr2 2145 vmov.32 ${dat3}[3],$tctr3 2146 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64 2147 vmov.32 ${dat4}[3],$tctr4 2148 vmov.32 ${dat5}[3],$tctr5 2149 vmov.32 ${dat6}[3],$tctr6 2150 vmov.32 ${dat7}[3],$tctr7 2151 vst1.8 {$in4,$in5,$in6,$in7},[$out],#64 2152 vmov.32 ${dat8}[3],$tctr8 2153 vmov.32 ${dat9}[3],$tctr9 2154 vmov.32 ${dat10}[3],$tctr10 2155 vmov.32 ${dat11}[3],$tctr11 2156 vst1.8 {$in8,$in9,$in10,$in11},[$out],#64 2157 2158 mov $cnt,$rounds 2159 2160 add $ctr,$ctr,#12 2161 subs $len,$len,#12 2162 b.hs .Loop12x_ctr32_unroll 2163 2164 // pop regs from stack when 12 data chunks are interleaved 2165 ldp $dat10d,$dat11d,[sp],#32 2166 ldp $dat8d,$dat9d,[sp],#32 2167 ldp x23,x24,[sp],#16 2168 ldp x21,x22,[sp],#16 2169 ldp x19,x20,[sp],#16 2170 2171 add $len,$len,#12 2172 cbz $len,.Lctr32_done_unroll 
2173 sub $ctr,$ctr,#12 2174 2175 cmp $len,#2 2176 b.ls .Lctr32_tail_unroll 2177 2178 cmp $len,#6 2179 sub $len,$len,#3 // bias 2180 add $ctr,$ctr,#3 2181 b.lo .Loop3x_ctr32_unroll 2182 2183 sub $len,$len,#3 2184 add $ctr,$ctr,#3 2185 b.lo .Loop6x_ctr32_unroll 2186 2187.align 4 2188.Loop6x_ctr32_unroll: 2189 aese $dat0,$rndping 2190 aesmc $dat0,$dat0 2191 aese $dat1,$rndping 2192 aesmc $dat1,$dat1 2193 aese $dat2,$rndping 2194 aesmc $dat2,$dat2 2195 aese $dat3,$rndping 2196 aesmc $dat3,$dat3 2197 aese $dat4,$rndping 2198 aesmc $dat4,$dat4 2199 aese $dat5,$rndping 2200 aesmc $dat5,$dat5 2201 vld1.32 {$rndping},[$key_],#16 2202 subs $cnt,$cnt,#2 2203 aese $dat0,$rndpang 2204 aesmc $dat0,$dat0 2205 aese $dat1,$rndpang 2206 aesmc $dat1,$dat1 2207 aese $dat2,$rndpang 2208 aesmc $dat2,$dat2 2209 aese $dat3,$rndpang 2210 aesmc $dat3,$dat3 2211 aese $dat4,$rndpang 2212 aesmc $dat4,$dat4 2213 aese $dat5,$rndpang 2214 aesmc $dat5,$dat5 2215 vld1.32 {$rndpang},[$key_],#16 2216 b.gt .Loop6x_ctr32_unroll 2217 2218 aese $dat0,$rndping 2219 aesmc $dat0,$dat0 2220 aese $dat1,$rndping 2221 aesmc $dat1,$dat1 2222 aese $dat2,$rndping 2223 aesmc $dat2,$dat2 2224 aese $dat3,$rndping 2225 aesmc $dat3,$dat3 2226 aese $dat4,$rndping 2227 aesmc $dat4,$dat4 2228 aese $dat5,$rndping 2229 aesmc $dat5,$dat5 2230 vld1.32 {$rndping},[$key_],#16 2231 2232 aese $dat0,$rndpang 2233 aesmc $dat0,$dat0 2234 aese $dat1,$rndpang 2235 aesmc $dat1,$dat1 2236 aese $dat2,$rndpang 2237 aesmc $dat2,$dat2 2238 aese $dat3,$rndpang 2239 aesmc $dat3,$dat3 2240 aese $dat4,$rndpang 2241 aesmc $dat4,$dat4 2242 aese $dat5,$rndpang 2243 aesmc $dat5,$dat5 2244 vld1.32 {$rndpang},[$key_],#16 2245 2246 aese $dat0,$rndping 2247 aesmc $dat0,$dat0 2248 add $tctr0,$ctr,#1 2249 add $tctr1,$ctr,#2 2250 aese $dat1,$rndping 2251 aesmc $dat1,$dat1 2252 add $tctr2,$ctr,#3 2253 add $tctr3,$ctr,#4 2254 aese $dat2,$rndping 2255 aesmc $dat2,$dat2 2256 add $tctr4,$ctr,#5 2257 add $tctr5,$ctr,#6 2258 rev $tctr0,$tctr0 2259 aese 
$dat3,$rndping 2260 aesmc $dat3,$dat3 2261 rev $tctr1,$tctr1 2262 rev $tctr2,$tctr2 2263 aese $dat4,$rndping 2264 aesmc $dat4,$dat4 2265 rev $tctr3,$tctr3 2266 rev $tctr4,$tctr4 2267 aese $dat5,$rndping 2268 aesmc $dat5,$dat5 2269 rev $tctr5,$tctr5 2270 vld1.32 {$rndping},[$key_],#16 2271 2272 aese $dat0,$rndpang 2273 aesmc $dat0,$dat0 2274 aese $dat1,$rndpang 2275 aesmc $dat1,$dat1 2276 vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64 2277 aese $dat2,$rndpang 2278 aesmc $dat2,$dat2 2279 aese $dat3,$rndpang 2280 aesmc $dat3,$dat3 2281 vld1.8 {$in4,$in5},[$inp],#32 2282 aese $dat4,$rndpang 2283 aesmc $dat4,$dat4 2284 aese $dat5,$rndpang 2285 aesmc $dat5,$dat5 2286 vld1.32 {$rndpang},[$key_],#16 2287 2288 mov $key_, $key 2289 aese $dat0,$rndping 2290 aesmc $dat0,$dat0 2291 aese $dat1,$rndping 2292 aesmc $dat1,$dat1 2293 aese $dat2,$rndping 2294 aesmc $dat2,$dat2 2295 aese $dat3,$rndping 2296 aesmc $dat3,$dat3 2297 aese $dat4,$rndping 2298 aesmc $dat4,$dat4 2299 aese $dat5,$rndping 2300 aesmc $dat5,$dat5 2301 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0] 2302 2303 aese $dat0,$rndpang 2304 eor3 $in0,$in0,$rndlast,$dat0 2305 aese $dat1,$rndpang 2306 eor3 $in1,$in1,$rndlast,$dat1 2307 aese $dat2,$rndpang 2308 eor3 $in2,$in2,$rndlast,$dat2 2309 aese $dat3,$rndpang 2310 eor3 $in3,$in3,$rndlast,$dat3 2311 aese $dat4,$rndpang 2312 eor3 $in4,$in4,$rndlast,$dat4 2313 aese $dat5,$rndpang 2314 eor3 $in5,$in5,$rndlast,$dat5 2315 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] 2316 2317 vorr $dat0,$ivec,$ivec 2318 vorr $dat1,$ivec,$ivec 2319 vorr $dat2,$ivec,$ivec 2320 vorr $dat3,$ivec,$ivec 2321 vorr $dat4,$ivec,$ivec 2322 vorr $dat5,$ivec,$ivec 2323 2324 vmov.32 ${dat0}[3],$tctr0 2325 vmov.32 ${dat1}[3],$tctr1 2326 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64 2327 vmov.32 ${dat2}[3],$tctr2 2328 vmov.32 ${dat3}[3],$tctr3 2329 vst1.8 {$in4,$in5},[$out],#32 2330 vmov.32 ${dat4}[3],$tctr4 2331 vmov.32 ${dat5}[3],$tctr5 2332 2333 cbz $len,.Lctr32_done_unroll 2334 mov $cnt,$rounds 
2335 2336 cmp $len,#2 2337 b.ls .Lctr32_tail_unroll 2338 2339 sub $len,$len,#3 // bias 2340 add $ctr,$ctr,#3 2341 b .Loop3x_ctr32_unroll 2342 2343.align 4 2344.Loop3x_ctr32_unroll: 2345 aese $dat0,$rndping 2346 aesmc $dat0,$dat0 2347 aese $dat1,$rndping 2348 aesmc $dat1,$dat1 2349 aese $dat2,$rndping 2350 aesmc $dat2,$dat2 2351 vld1.32 {$rndping},[$key_],#16 2352 subs $cnt,$cnt,#2 2353 aese $dat0,$rndpang 2354 aesmc $dat0,$dat0 2355 aese $dat1,$rndpang 2356 aesmc $dat1,$dat1 2357 aese $dat2,$rndpang 2358 aesmc $dat2,$dat2 2359 vld1.32 {$rndpang},[$key_],#16 2360 b.gt .Loop3x_ctr32_unroll 2361 2362 aese $dat0,$rndping 2363 aesmc $tmp0,$dat0 2364 aese $dat1,$rndping 2365 aesmc $tmp1,$dat1 2366 vld1.8 {$in0,$in1,$in2},[$inp],#48 2367 vorr $dat0,$ivec,$ivec 2368 aese $dat2,$rndping 2369 aesmc $dat2,$dat2 2370 vld1.32 {$rndping},[$key_],#16 2371 vorr $dat1,$ivec,$ivec 2372 aese $tmp0,$rndpang 2373 aesmc $tmp0,$tmp0 2374 aese $tmp1,$rndpang 2375 aesmc $tmp1,$tmp1 2376 aese $dat2,$rndpang 2377 aesmc $tmp2,$dat2 2378 vld1.32 {$rndpang},[$key_],#16 2379 vorr $dat2,$ivec,$ivec 2380 add $tctr0,$ctr,#1 2381 aese $tmp0,$rndping 2382 aesmc $tmp0,$tmp0 2383 aese $tmp1,$rndping 2384 aesmc $tmp1,$tmp1 2385 add $tctr1,$ctr,#2 2386 aese $tmp2,$rndping 2387 aesmc $tmp2,$tmp2 2388 vld1.32 {$rndping},[$key_],#16 2389 add $ctr,$ctr,#3 2390 aese $tmp0,$rndpang 2391 aesmc $tmp0,$tmp0 2392 aese $tmp1,$rndpang 2393 aesmc $tmp1,$tmp1 2394 2395 rev $tctr0,$tctr0 2396 aese $tmp2,$rndpang 2397 aesmc $tmp2,$tmp2 2398 vld1.32 {$rndpang},[$key_],#16 2399 vmov.32 ${dat0}[3], $tctr0 2400 mov $key_,$key 2401 rev $tctr1,$tctr1 2402 aese $tmp0,$rndping 2403 aesmc $tmp0,$tmp0 2404 2405 aese $tmp1,$rndping 2406 aesmc $tmp1,$tmp1 2407 vmov.32 ${dat1}[3], $tctr1 2408 rev $tctr2,$ctr 2409 aese $tmp2,$rndping 2410 aesmc $tmp2,$tmp2 2411 vmov.32 ${dat2}[3], $tctr2 2412 2413 aese $tmp0,$rndpang 2414 aese $tmp1,$rndpang 2415 aese $tmp2,$rndpang 2416 2417 eor3 $in0,$in0,$rndlast,$tmp0 2418 vld1.32 
{$rndping},[$key_],#16 // re-pre-load rndkey[0] 2419 eor3 $in1,$in1,$rndlast,$tmp1 2420 mov $cnt,$rounds 2421 eor3 $in2,$in2,$rndlast,$tmp2 2422 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1] 2423 vst1.8 {$in0,$in1,$in2},[$out],#48 2424 2425 cbz $len,.Lctr32_done_unroll 2426 2427.Lctr32_tail_unroll: 2428 cmp $len,#1 2429 b.eq .Lctr32_tail_1_unroll 2430 2431.Lctr32_tail_2_unroll: 2432 aese $dat0,$rndping 2433 aesmc $dat0,$dat0 2434 aese $dat1,$rndping 2435 aesmc $dat1,$dat1 2436 vld1.32 {$rndping},[$key_],#16 2437 subs $cnt,$cnt,#2 2438 aese $dat0,$rndpang 2439 aesmc $dat0,$dat0 2440 aese $dat1,$rndpang 2441 aesmc $dat1,$dat1 2442 vld1.32 {$rndpang},[$key_],#16 2443 b.gt .Lctr32_tail_2_unroll 2444 2445 aese $dat0,$rndping 2446 aesmc $dat0,$dat0 2447 aese $dat1,$rndping 2448 aesmc $dat1,$dat1 2449 vld1.32 {$rndping},[$key_],#16 2450 aese $dat0,$rndpang 2451 aesmc $dat0,$dat0 2452 aese $dat1,$rndpang 2453 aesmc $dat1,$dat1 2454 vld1.32 {$rndpang},[$key_],#16 2455 vld1.8 {$in0,$in1},[$inp],#32 2456 aese $dat0,$rndping 2457 aesmc $dat0,$dat0 2458 aese $dat1,$rndping 2459 aesmc $dat1,$dat1 2460 vld1.32 {$rndping},[$key_],#16 2461 aese $dat0,$rndpang 2462 aesmc $dat0,$dat0 2463 aese $dat1,$rndpang 2464 aesmc $dat1,$dat1 2465 vld1.32 {$rndpang},[$key_],#16 2466 aese $dat0,$rndping 2467 aesmc $dat0,$dat0 2468 aese $dat1,$rndping 2469 aesmc $dat1,$dat1 2470 aese $dat0,$rndpang 2471 aese $dat1,$rndpang 2472 2473 eor3 $in0,$in0,$rndlast,$dat0 2474 eor3 $in1,$in1,$rndlast,$dat1 2475 vst1.8 {$in0,$in1},[$out],#32 2476 b .Lctr32_done_unroll 2477 2478.Lctr32_tail_1_unroll: 2479 aese $dat0,$rndping 2480 aesmc $dat0,$dat0 2481 vld1.32 {$rndping},[$key_],#16 2482 subs $cnt,$cnt,#2 2483 aese $dat0,$rndpang 2484 aesmc $dat0,$dat0 2485 vld1.32 {$rndpang},[$key_],#16 2486 b.gt .Lctr32_tail_1_unroll 2487 2488 aese $dat0,$rndping 2489 aesmc $dat0,$dat0 2490 vld1.32 {$rndping},[$key_],#16 2491 aese $dat0,$rndpang 2492 aesmc $dat0,$dat0 2493 vld1.32 {$rndpang},[$key_],#16 2494 
	// Single-block tail: finish the remaining rounds for $dat0 only.
	vld1.8	{$in0},[$inp]
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	vld1.32	{$rndpang},[$key_],#16
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat0,$rndpang

	eor3	$in0,$in0,$rndlast,$dat0
	vst1.8	{$in0},[$out],#16

.Lctr32_done_unroll:
	ldp	d8,d9,[sp, #16]
	ldp	d10,d11,[sp, #32]
	ldp	d12,d13,[sp, #48]
	ldp	d14,d15,[sp, #64]
	ldr	x29,[sp],#80
	ret
.size	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
___
}}}

# Plain CTR mode entry point (no EOR3), shared between 32- and 64-bit flavours.
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
# NOTE(review): map yields q16..q23 but only the first four are bound here.
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
# AArch64 prologue (x30 saved, intentionally not restored; PAuth note).
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# AArch32 prologue: save callee-saved regs and pick up the 5th (stack) arg.
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldr	r4, [ip]	@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
2563 sub $rounds,$rounds,#4 2564 mov $step,#16 2565 cmp $len,#2 2566 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys 2567 sub $rounds,$rounds,#2 2568 vld1.32 {q12-q13},[$key_],#32 2569 vld1.32 {q14-q15},[$key_],#32 2570 vld1.32 {$rndlast},[$key_] 2571 add $key_,$key,#32 2572 mov $cnt,$rounds 2573 cclr $step,lo 2574#ifndef __ARMEB__ 2575 rev $ctr, $ctr 2576#endif 2577___ 2578$code.=<<___ if ($flavour =~ /64/); 2579 vorr $dat1,$dat0,$dat0 2580 add $tctr1, $ctr, #1 2581 vorr $dat2,$dat0,$dat0 2582 add $ctr, $ctr, #2 2583 vorr $ivec,$dat0,$dat0 2584 rev $tctr1, $tctr1 2585 vmov.32 ${dat1}[3],$tctr1 2586 b.ls .Lctr32_tail 2587 rev $tctr2, $ctr 2588 sub $len,$len,#3 // bias 2589 vmov.32 ${dat2}[3],$tctr2 2590___ 2591$code.=<<___ if ($flavour !~ /64/); 2592 add $tctr1, $ctr, #1 2593 vorr $ivec,$dat0,$dat0 2594 rev $tctr1, $tctr1 2595 vmov.32 ${ivec}[3],$tctr1 2596 add $ctr, $ctr, #2 2597 vorr $dat1,$ivec,$ivec 2598 b.ls .Lctr32_tail 2599 rev $tctr2, $ctr 2600 vmov.32 ${ivec}[3],$tctr2 2601 sub $len,$len,#3 // bias 2602 vorr $dat2,$ivec,$ivec 2603___ 2604$code.=<<___ if ($flavour =~ /64/); 2605 cmp $len,#32 2606 b.lo .Loop3x_ctr32 2607 2608 add w13,$ctr,#1 2609 add w14,$ctr,#2 2610 vorr $dat3,$dat0,$dat0 2611 rev w13,w13 2612 vorr $dat4,$dat0,$dat0 2613 rev w14,w14 2614 vmov.32 ${dat3}[3],w13 2615 sub $len,$len,#2 // bias 2616 vmov.32 ${dat4}[3],w14 2617 add $ctr,$ctr,#2 2618 b .Loop5x_ctr32 2619 2620.align 4 2621.Loop5x_ctr32: 2622 aese $dat0,q8 2623 aesmc $dat0,$dat0 2624 aese $dat1,q8 2625 aesmc $dat1,$dat1 2626 aese $dat2,q8 2627 aesmc $dat2,$dat2 2628 aese $dat3,q8 2629 aesmc $dat3,$dat3 2630 aese $dat4,q8 2631 aesmc $dat4,$dat4 2632 vld1.32 {q8},[$key_],#16 2633 subs $cnt,$cnt,#2 2634 aese $dat0,q9 2635 aesmc $dat0,$dat0 2636 aese $dat1,q9 2637 aesmc $dat1,$dat1 2638 aese $dat2,q9 2639 aesmc $dat2,$dat2 2640 aese $dat3,q9 2641 aesmc $dat3,$dat3 2642 aese $dat4,q9 2643 aesmc $dat4,$dat4 2644 vld1.32 {q9},[$key_],#16 2645 b.gt .Loop5x_ctr32 2646 2647 mov 
$key_,$key 2648 aese $dat0,q8 2649 aesmc $dat0,$dat0 2650 aese $dat1,q8 2651 aesmc $dat1,$dat1 2652 aese $dat2,q8 2653 aesmc $dat2,$dat2 2654 aese $dat3,q8 2655 aesmc $dat3,$dat3 2656 aese $dat4,q8 2657 aesmc $dat4,$dat4 2658 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 2659 2660 aese $dat0,q9 2661 aesmc $dat0,$dat0 2662 aese $dat1,q9 2663 aesmc $dat1,$dat1 2664 aese $dat2,q9 2665 aesmc $dat2,$dat2 2666 aese $dat3,q9 2667 aesmc $dat3,$dat3 2668 aese $dat4,q9 2669 aesmc $dat4,$dat4 2670 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 2671 2672 aese $dat0,q12 2673 aesmc $dat0,$dat0 2674 add $tctr0,$ctr,#1 2675 add $tctr1,$ctr,#2 2676 aese $dat1,q12 2677 aesmc $dat1,$dat1 2678 add $tctr2,$ctr,#3 2679 add w13,$ctr,#4 2680 aese $dat2,q12 2681 aesmc $dat2,$dat2 2682 add w14,$ctr,#5 2683 rev $tctr0,$tctr0 2684 aese $dat3,q12 2685 aesmc $dat3,$dat3 2686 rev $tctr1,$tctr1 2687 rev $tctr2,$tctr2 2688 aese $dat4,q12 2689 aesmc $dat4,$dat4 2690 rev w13,w13 2691 rev w14,w14 2692 2693 aese $dat0,q13 2694 aesmc $dat0,$dat0 2695 aese $dat1,q13 2696 aesmc $dat1,$dat1 2697 aese $dat2,q13 2698 aesmc $dat2,$dat2 2699 aese $dat3,q13 2700 aesmc $dat3,$dat3 2701 aese $dat4,q13 2702 aesmc $dat4,$dat4 2703 2704 aese $dat0,q14 2705 aesmc $dat0,$dat0 2706 vld1.8 {$in0},[$inp],#16 2707 aese $dat1,q14 2708 aesmc $dat1,$dat1 2709 vld1.8 {$in1},[$inp],#16 2710 aese $dat2,q14 2711 aesmc $dat2,$dat2 2712 vld1.8 {$in2},[$inp],#16 2713 aese $dat3,q14 2714 aesmc $dat3,$dat3 2715 vld1.8 {$in3},[$inp],#16 2716 aese $dat4,q14 2717 aesmc $dat4,$dat4 2718 vld1.8 {$in4},[$inp],#16 2719 2720 aese $dat0,q15 2721 veor $in0,$in0,$rndlast 2722 aese $dat1,q15 2723 veor $in1,$in1,$rndlast 2724 aese $dat2,q15 2725 veor $in2,$in2,$rndlast 2726 aese $dat3,q15 2727 veor $in3,$in3,$rndlast 2728 aese $dat4,q15 2729 veor $in4,$in4,$rndlast 2730 2731 veor $in0,$in0,$dat0 2732 vorr $dat0,$ivec,$ivec 2733 veor $in1,$in1,$dat1 2734 vorr $dat1,$ivec,$ivec 2735 veor $in2,$in2,$dat2 2736 vorr $dat2,$ivec,$ivec 2737 veor 
$in3,$in3,$dat3 2738 vorr $dat3,$ivec,$ivec 2739 veor $in4,$in4,$dat4 2740 vorr $dat4,$ivec,$ivec 2741 2742 vst1.8 {$in0},[$out],#16 2743 vmov.32 ${dat0}[3],$tctr0 2744 vst1.8 {$in1},[$out],#16 2745 vmov.32 ${dat1}[3],$tctr1 2746 vst1.8 {$in2},[$out],#16 2747 vmov.32 ${dat2}[3],$tctr2 2748 vst1.8 {$in3},[$out],#16 2749 vmov.32 ${dat3}[3],w13 2750 vst1.8 {$in4},[$out],#16 2751 vmov.32 ${dat4}[3],w14 2752 2753 mov $cnt,$rounds 2754 cbz $len,.Lctr32_done 2755 2756 add $ctr,$ctr,#5 2757 subs $len,$len,#5 2758 b.hs .Loop5x_ctr32 2759 2760 add $len,$len,#5 2761 sub $ctr,$ctr,#5 2762 2763 cmp $len,#2 2764 mov $step,#16 2765 cclr $step,lo 2766 b.ls .Lctr32_tail 2767 2768 sub $len,$len,#3 // bias 2769 add $ctr,$ctr,#3 2770___ 2771$code.=<<___; 2772 b .Loop3x_ctr32 2773 2774.align 4 2775.Loop3x_ctr32: 2776 aese $dat0,q8 2777 aesmc $dat0,$dat0 2778 aese $dat1,q8 2779 aesmc $dat1,$dat1 2780 aese $dat2,q8 2781 aesmc $dat2,$dat2 2782 vld1.32 {q8},[$key_],#16 2783 subs $cnt,$cnt,#2 2784 aese $dat0,q9 2785 aesmc $dat0,$dat0 2786 aese $dat1,q9 2787 aesmc $dat1,$dat1 2788 aese $dat2,q9 2789 aesmc $dat2,$dat2 2790 vld1.32 {q9},[$key_],#16 2791 b.gt .Loop3x_ctr32 2792 2793 aese $dat0,q8 2794 aesmc $tmp0,$dat0 2795 aese $dat1,q8 2796 aesmc $tmp1,$dat1 2797 vld1.8 {$in0},[$inp],#16 2798___ 2799$code.=<<___ if ($flavour =~ /64/); 2800 vorr $dat0,$ivec,$ivec 2801___ 2802$code.=<<___ if ($flavour !~ /64/); 2803 add $tctr0,$ctr,#1 2804___ 2805$code.=<<___; 2806 aese $dat2,q8 2807 aesmc $dat2,$dat2 2808 vld1.8 {$in1},[$inp],#16 2809___ 2810$code.=<<___ if ($flavour =~ /64/); 2811 vorr $dat1,$ivec,$ivec 2812___ 2813$code.=<<___ if ($flavour !~ /64/); 2814 rev $tctr0,$tctr0 2815___ 2816$code.=<<___; 2817 aese $tmp0,q9 2818 aesmc $tmp0,$tmp0 2819 aese $tmp1,q9 2820 aesmc $tmp1,$tmp1 2821 vld1.8 {$in2},[$inp],#16 2822 mov $key_,$key 2823 aese $dat2,q9 2824 aesmc $tmp2,$dat2 2825___ 2826$code.=<<___ if ($flavour =~ /64/); 2827 vorr $dat2,$ivec,$ivec 2828 add $tctr0,$ctr,#1 2829___ 
2830$code.=<<___; 2831 aese $tmp0,q12 2832 aesmc $tmp0,$tmp0 2833 aese $tmp1,q12 2834 aesmc $tmp1,$tmp1 2835 veor $in0,$in0,$rndlast 2836 add $tctr1,$ctr,#2 2837 aese $tmp2,q12 2838 aesmc $tmp2,$tmp2 2839 veor $in1,$in1,$rndlast 2840 add $ctr,$ctr,#3 2841 aese $tmp0,q13 2842 aesmc $tmp0,$tmp0 2843 aese $tmp1,q13 2844 aesmc $tmp1,$tmp1 2845 veor $in2,$in2,$rndlast 2846___ 2847$code.=<<___ if ($flavour =~ /64/); 2848 rev $tctr0,$tctr0 2849 aese $tmp2,q13 2850 aesmc $tmp2,$tmp2 2851 vmov.32 ${dat0}[3], $tctr0 2852___ 2853$code.=<<___ if ($flavour !~ /64/); 2854 vmov.32 ${ivec}[3], $tctr0 2855 aese $tmp2,q13 2856 aesmc $tmp2,$tmp2 2857 vorr $dat0,$ivec,$ivec 2858___ 2859$code.=<<___; 2860 rev $tctr1,$tctr1 2861 aese $tmp0,q14 2862 aesmc $tmp0,$tmp0 2863___ 2864$code.=<<___ if ($flavour !~ /64/); 2865 vmov.32 ${ivec}[3], $tctr1 2866 rev $tctr2,$ctr 2867___ 2868$code.=<<___; 2869 aese $tmp1,q14 2870 aesmc $tmp1,$tmp1 2871___ 2872$code.=<<___ if ($flavour =~ /64/); 2873 vmov.32 ${dat1}[3], $tctr1 2874 rev $tctr2,$ctr 2875 aese $tmp2,q14 2876 aesmc $tmp2,$tmp2 2877 vmov.32 ${dat2}[3], $tctr2 2878___ 2879$code.=<<___ if ($flavour !~ /64/); 2880 vorr $dat1,$ivec,$ivec 2881 vmov.32 ${ivec}[3], $tctr2 2882 aese $tmp2,q14 2883 aesmc $tmp2,$tmp2 2884 vorr $dat2,$ivec,$ivec 2885___ 2886$code.=<<___; 2887 subs $len,$len,#3 2888 aese $tmp0,q15 2889 aese $tmp1,q15 2890 aese $tmp2,q15 2891 2892 veor $in0,$in0,$tmp0 2893 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 2894 vst1.8 {$in0},[$out],#16 2895 veor $in1,$in1,$tmp1 2896 mov $cnt,$rounds 2897 vst1.8 {$in1},[$out],#16 2898 veor $in2,$in2,$tmp2 2899 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 2900 vst1.8 {$in2},[$out],#16 2901 b.hs .Loop3x_ctr32 2902 2903 adds $len,$len,#3 2904 b.eq .Lctr32_done 2905 cmp $len,#1 2906 mov $step,#16 2907 cclr $step,eq 2908 2909.Lctr32_tail: 2910 aese $dat0,q8 2911 aesmc $dat0,$dat0 2912 aese $dat1,q8 2913 aesmc $dat1,$dat1 2914 vld1.32 {q8},[$key_],#16 2915 subs $cnt,$cnt,#2 2916 aese 
$dat0,q9 2917 aesmc $dat0,$dat0 2918 aese $dat1,q9 2919 aesmc $dat1,$dat1 2920 vld1.32 {q9},[$key_],#16 2921 b.gt .Lctr32_tail 2922 2923 aese $dat0,q8 2924 aesmc $dat0,$dat0 2925 aese $dat1,q8 2926 aesmc $dat1,$dat1 2927 aese $dat0,q9 2928 aesmc $dat0,$dat0 2929 aese $dat1,q9 2930 aesmc $dat1,$dat1 2931 vld1.8 {$in0},[$inp],$step 2932 aese $dat0,q12 2933 aesmc $dat0,$dat0 2934 aese $dat1,q12 2935 aesmc $dat1,$dat1 2936 vld1.8 {$in1},[$inp] 2937 aese $dat0,q13 2938 aesmc $dat0,$dat0 2939 aese $dat1,q13 2940 aesmc $dat1,$dat1 2941 veor $in0,$in0,$rndlast 2942 aese $dat0,q14 2943 aesmc $dat0,$dat0 2944 aese $dat1,q14 2945 aesmc $dat1,$dat1 2946 veor $in1,$in1,$rndlast 2947 aese $dat0,q15 2948 aese $dat1,q15 2949 2950 cmp $len,#1 2951 veor $in0,$in0,$dat0 2952 veor $in1,$in1,$dat1 2953 vst1.8 {$in0},[$out],#16 2954 b.eq .Lctr32_done 2955 vst1.8 {$in1},[$out] 2956 2957.Lctr32_done: 2958___ 2959$code.=<<___ if ($flavour !~ /64/); 2960 vldmia sp!,{d8-d15} 2961 ldmia sp!,{r4-r10,pc} 2962___ 2963$code.=<<___ if ($flavour =~ /64/); 2964 ldr x29,[sp],#16 2965 ret 2966___ 2967$code.=<<___; 2968.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks 2969___ 2970}}} 2971# Performance in cycles per byte. 2972# Processed with AES-XTS different key size. 2973# It shows the value before and after optimization as below: 2974# (before/after): 2975# 2976# AES-128-XTS AES-256-XTS 2977# Cortex-A57 3.36/1.09 4.02/1.37 2978# Cortex-A72 3.03/1.02 3.28/1.33 2979 2980# Optimization is implemented by loop unrolling and interleaving. 2981# Commonly, we choose the unrolling factor as 5, if the input 2982# data size smaller than 5 blocks, but not smaller than 3 blocks, 2983# choose 3 as the unrolling factor. 2984# If the input data size dsize >= 5*16 bytes, then take 5 blocks 2985# as one iteration, every loop the left size lsize -= 5*16. 2986# If lsize < 5*16 bytes, treat them as the tail. 
Note: left 4*16 bytes 2987# will be processed specially, which be integrated into the 5*16 bytes 2988# loop to improve the efficiency. 2989# There is one special case, if the original input data size dsize 2990# = 16 bytes, we will treat it separately to improve the 2991# performance: one independent code block without LR, FP load and 2992# store. 2993# Encryption will process the (length -tailcnt) bytes as mentioned 2994# previously, then encrypt the composite block as last second 2995# cipher block. 2996# Decryption will process the (length -tailcnt -1) bytes as mentioned 2997# previously, then decrypt the last second cipher block to get the 2998# last plain block(tail), decrypt the composite block as last second 2999# plain text block. 3000 3001{{{ 3002my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); 3003my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); 3004my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); 3005my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); 3006my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); 3007my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 3008my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); 3009my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); 3010my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); 3011 3012my ($tmpin)=("v26.16b"); 3013my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 3014 3015# q7 last round key 3016# q10-q15, q7 Last 7 round keys 3017# q8-q9 preloaded round keys except last 7 keys for big size 3018# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte 3019 3020 3021my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 3022 3023my ($dat3,$in3,$tmp3); # used only in 64-bit mode 3024my ($dat4,$in4,$tmp4); 3025if ($flavour =~ /64/) { 3026 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 3027} 
3028 3029$code.=<<___ if ($flavour =~ /64/); 3030.globl ${prefix}_xts_encrypt 3031.type ${prefix}_xts_encrypt,%function 3032.align 5 3033${prefix}_xts_encrypt: 3034___ 3035$code.=<<___ if ($flavour =~ /64/); 3036 AARCH64_VALID_CALL_TARGET 3037 cmp $len,#16 3038 // Original input data size bigger than 16, jump to big size processing. 3039 b.ne .Lxts_enc_big_size 3040 // Encrypt the iv with key2, as the first XEX iv. 3041 ldr $rounds,[$key2,#240] 3042 vld1.32 {$dat},[$key2],#16 3043 vld1.8 {$iv0},[$ivp] 3044 sub $rounds,$rounds,#2 3045 vld1.32 {$dat1},[$key2],#16 3046 3047.Loop_enc_iv_enc: 3048 aese $iv0,$dat 3049 aesmc $iv0,$iv0 3050 vld1.32 {$dat},[$key2],#16 3051 subs $rounds,$rounds,#2 3052 aese $iv0,$dat1 3053 aesmc $iv0,$iv0 3054 vld1.32 {$dat1},[$key2],#16 3055 b.gt .Loop_enc_iv_enc 3056 3057 aese $iv0,$dat 3058 aesmc $iv0,$iv0 3059 vld1.32 {$dat},[$key2] 3060 aese $iv0,$dat1 3061 veor $iv0,$iv0,$dat 3062 3063 vld1.8 {$dat0},[$inp] 3064 veor $dat0,$iv0,$dat0 3065 3066 ldr $rounds,[$key1,#240] 3067 vld1.32 {q20-q21},[$key1],#32 // load key schedule... 3068 3069 aese $dat0,q20 3070 aesmc $dat0,$dat0 3071 vld1.32 {q8-q9},[$key1],#32 // load key schedule... 3072 aese $dat0,q21 3073 aesmc $dat0,$dat0 3074 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing 3075 b.eq .Lxts_128_enc 3076.Lxts_enc_round_loop: 3077 aese $dat0,q8 3078 aesmc $dat0,$dat0 3079 vld1.32 {q8},[$key1],#16 // load key schedule... 3080 aese $dat0,q9 3081 aesmc $dat0,$dat0 3082 vld1.32 {q9},[$key1],#16 // load key schedule... 3083 subs $rounds,$rounds,#2 // bias 3084 b.gt .Lxts_enc_round_loop 3085.Lxts_128_enc: 3086 vld1.32 {q10-q11},[$key1],#32 // load key schedule... 3087 aese $dat0,q8 3088 aesmc $dat0,$dat0 3089 aese $dat0,q9 3090 aesmc $dat0,$dat0 3091 vld1.32 {q12-q13},[$key1],#32 // load key schedule... 3092 aese $dat0,q10 3093 aesmc $dat0,$dat0 3094 aese $dat0,q11 3095 aesmc $dat0,$dat0 3096 vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
3097 aese $dat0,q12 3098 aesmc $dat0,$dat0 3099 aese $dat0,q13 3100 aesmc $dat0,$dat0 3101 vld1.32 {$rndlast},[$key1] 3102 aese $dat0,q14 3103 aesmc $dat0,$dat0 3104 aese $dat0,q15 3105 veor $dat0,$dat0,$rndlast 3106 veor $dat0,$dat0,$iv0 3107 vst1.8 {$dat0},[$out] 3108 b .Lxts_enc_final_abort 3109 3110.align 4 3111.Lxts_enc_big_size: 3112___ 3113$code.=<<___ if ($flavour =~ /64/); 3114 stp $constnumx,$tmpinp,[sp,#-64]! 3115 stp $tailcnt,$midnumx,[sp,#48] 3116 stp $ivd10,$ivd20,[sp,#32] 3117 stp $ivd30,$ivd40,[sp,#16] 3118 3119 // tailcnt store the tail value of length%16. 3120 and $tailcnt,$len,#0xf 3121 and $len,$len,#-16 3122 subs $len,$len,#16 3123 mov $step,#16 3124 b.lo .Lxts_abort 3125 csel $step,xzr,$step,eq 3126 3127 // Firstly, encrypt the iv with key2, as the first iv of XEX. 3128 ldr $rounds,[$key2,#240] 3129 vld1.32 {$dat},[$key2],#16 3130 vld1.8 {$iv0},[$ivp] 3131 sub $rounds,$rounds,#2 3132 vld1.32 {$dat1},[$key2],#16 3133 3134.Loop_iv_enc: 3135 aese $iv0,$dat 3136 aesmc $iv0,$iv0 3137 vld1.32 {$dat},[$key2],#16 3138 subs $rounds,$rounds,#2 3139 aese $iv0,$dat1 3140 aesmc $iv0,$iv0 3141 vld1.32 {$dat1},[$key2],#16 3142 b.gt .Loop_iv_enc 3143 3144 aese $iv0,$dat 3145 aesmc $iv0,$iv0 3146 vld1.32 {$dat},[$key2] 3147 aese $iv0,$dat1 3148 veor $iv0,$iv0,$dat 3149 3150 // The iv for second block 3151 // $ivl- iv(low), $ivh - iv(high) 3152 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 3153 fmov $ivl,$ivd00 3154 fmov $ivh,$ivd01 3155 mov $constnum,#0x87 3156 extr $midnumx,$ivh,$ivh,#32 3157 extr $ivh,$ivh,$ivl,#63 3158 and $tmpmw,$constnum,$midnum,asr#31 3159 eor $ivl,$tmpmx,$ivl,lsl#1 3160 fmov $ivd10,$ivl 3161 fmov $ivd11,$ivh 3162 3163 ldr $rounds0,[$key1,#240] // next starting point 3164 vld1.8 {$dat},[$inp],$step 3165 3166 vld1.32 {q8-q9},[$key1] // load key schedule... 
3167 sub $rounds0,$rounds0,#6 3168 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys 3169 sub $rounds0,$rounds0,#2 3170 vld1.32 {q10-q11},[$key_],#32 3171 vld1.32 {q12-q13},[$key_],#32 3172 vld1.32 {q14-q15},[$key_],#32 3173 vld1.32 {$rndlast},[$key_] 3174 3175 add $key_,$key1,#32 3176 mov $rounds,$rounds0 3177 3178 // Encryption 3179.Lxts_enc: 3180 vld1.8 {$dat2},[$inp],#16 3181 subs $len,$len,#32 // bias 3182 add $rounds,$rounds0,#2 3183 vorr $in1,$dat,$dat 3184 vorr $dat1,$dat,$dat 3185 vorr $in3,$dat,$dat 3186 vorr $in2,$dat2,$dat2 3187 vorr $in4,$dat2,$dat2 3188 b.lo .Lxts_inner_enc_tail 3189 veor $dat,$dat,$iv0 // before encryption, xor with iv 3190 veor $dat2,$dat2,$iv1 3191 3192 // The iv for third block 3193 extr $midnumx,$ivh,$ivh,#32 3194 extr $ivh,$ivh,$ivl,#63 3195 and $tmpmw,$constnum,$midnum,asr#31 3196 eor $ivl,$tmpmx,$ivl,lsl#1 3197 fmov $ivd20,$ivl 3198 fmov $ivd21,$ivh 3199 3200 3201 vorr $dat1,$dat2,$dat2 3202 vld1.8 {$dat2},[$inp],#16 3203 vorr $in0,$dat,$dat 3204 vorr $in1,$dat1,$dat1 3205 veor $in2,$dat2,$iv2 // the third block 3206 veor $dat2,$dat2,$iv2 3207 cmp $len,#32 3208 b.lo .Lxts_outer_enc_tail 3209 3210 // The iv for fourth block 3211 extr $midnumx,$ivh,$ivh,#32 3212 extr $ivh,$ivh,$ivl,#63 3213 and $tmpmw,$constnum,$midnum,asr#31 3214 eor $ivl,$tmpmx,$ivl,lsl#1 3215 fmov $ivd30,$ivl 3216 fmov $ivd31,$ivh 3217 3218 vld1.8 {$dat3},[$inp],#16 3219 // The iv for fifth block 3220 extr $midnumx,$ivh,$ivh,#32 3221 extr $ivh,$ivh,$ivl,#63 3222 and $tmpmw,$constnum,$midnum,asr#31 3223 eor $ivl,$tmpmx,$ivl,lsl#1 3224 fmov $ivd40,$ivl 3225 fmov $ivd41,$ivh 3226 3227 vld1.8 {$dat4},[$inp],#16 3228 veor $dat3,$dat3,$iv3 // the fourth block 3229 veor $dat4,$dat4,$iv4 3230 sub $len,$len,#32 // bias 3231 mov $rounds,$rounds0 3232 b .Loop5x_xts_enc 3233 3234.align 4 3235.Loop5x_xts_enc: 3236 aese $dat0,q8 3237 aesmc $dat0,$dat0 3238 aese $dat1,q8 3239 aesmc $dat1,$dat1 3240 aese $dat2,q8 3241 aesmc $dat2,$dat2 3242 aese $dat3,q8 3243 aesmc 
$dat3,$dat3 3244 aese $dat4,q8 3245 aesmc $dat4,$dat4 3246 vld1.32 {q8},[$key_],#16 3247 subs $rounds,$rounds,#2 3248 aese $dat0,q9 3249 aesmc $dat0,$dat0 3250 aese $dat1,q9 3251 aesmc $dat1,$dat1 3252 aese $dat2,q9 3253 aesmc $dat2,$dat2 3254 aese $dat3,q9 3255 aesmc $dat3,$dat3 3256 aese $dat4,q9 3257 aesmc $dat4,$dat4 3258 vld1.32 {q9},[$key_],#16 3259 b.gt .Loop5x_xts_enc 3260 3261 aese $dat0,q8 3262 aesmc $dat0,$dat0 3263 aese $dat1,q8 3264 aesmc $dat1,$dat1 3265 aese $dat2,q8 3266 aesmc $dat2,$dat2 3267 aese $dat3,q8 3268 aesmc $dat3,$dat3 3269 aese $dat4,q8 3270 aesmc $dat4,$dat4 3271 subs $len,$len,#0x50 // because .Lxts_enc_tail4x 3272 3273 aese $dat0,q9 3274 aesmc $dat0,$dat0 3275 aese $dat1,q9 3276 aesmc $dat1,$dat1 3277 aese $dat2,q9 3278 aesmc $dat2,$dat2 3279 aese $dat3,q9 3280 aesmc $dat3,$dat3 3281 aese $dat4,q9 3282 aesmc $dat4,$dat4 3283 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo 3284 mov $key_,$key1 3285 3286 aese $dat0,q10 3287 aesmc $dat0,$dat0 3288 aese $dat1,q10 3289 aesmc $dat1,$dat1 3290 aese $dat2,q10 3291 aesmc $dat2,$dat2 3292 aese $dat3,q10 3293 aesmc $dat3,$dat3 3294 aese $dat4,q10 3295 aesmc $dat4,$dat4 3296 add $inp,$inp,$xoffset // x0 is adjusted in such way that 3297 // at exit from the loop v1.16b-v26.16b 3298 // are loaded with last "words" 3299 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x 3300 3301 aese $dat0,q11 3302 aesmc $dat0,$dat0 3303 aese $dat1,q11 3304 aesmc $dat1,$dat1 3305 aese $dat2,q11 3306 aesmc $dat2,$dat2 3307 aese $dat3,q11 3308 aesmc $dat3,$dat3 3309 aese $dat4,q11 3310 aesmc $dat4,$dat4 3311 3312 aese $dat0,q12 3313 aesmc $dat0,$dat0 3314 aese $dat1,q12 3315 aesmc $dat1,$dat1 3316 aese $dat2,q12 3317 aesmc $dat2,$dat2 3318 aese $dat3,q12 3319 aesmc $dat3,$dat3 3320 aese $dat4,q12 3321 aesmc $dat4,$dat4 3322 3323 aese $dat0,q13 3324 aesmc $dat0,$dat0 3325 aese $dat1,q13 3326 aesmc $dat1,$dat1 3327 aese $dat2,q13 3328 aesmc $dat2,$dat2 3329 aese $dat3,q13 3330 aesmc $dat3,$dat3 3331 aese 
$dat4,q13 3332 aesmc $dat4,$dat4 3333 3334 aese $dat0,q14 3335 aesmc $dat0,$dat0 3336 aese $dat1,q14 3337 aesmc $dat1,$dat1 3338 aese $dat2,q14 3339 aesmc $dat2,$dat2 3340 aese $dat3,q14 3341 aesmc $dat3,$dat3 3342 aese $dat4,q14 3343 aesmc $dat4,$dat4 3344 3345 veor $tmp0,$rndlast,$iv0 3346 aese $dat0,q15 3347 // The iv for first block of one iteration 3348 extr $midnumx,$ivh,$ivh,#32 3349 extr $ivh,$ivh,$ivl,#63 3350 and $tmpmw,$constnum,$midnum,asr#31 3351 eor $ivl,$tmpmx,$ivl,lsl#1 3352 fmov $ivd00,$ivl 3353 fmov $ivd01,$ivh 3354 veor $tmp1,$rndlast,$iv1 3355 vld1.8 {$in0},[$inp],#16 3356 aese $dat1,q15 3357 // The iv for second block 3358 extr $midnumx,$ivh,$ivh,#32 3359 extr $ivh,$ivh,$ivl,#63 3360 and $tmpmw,$constnum,$midnum,asr#31 3361 eor $ivl,$tmpmx,$ivl,lsl#1 3362 fmov $ivd10,$ivl 3363 fmov $ivd11,$ivh 3364 veor $tmp2,$rndlast,$iv2 3365 vld1.8 {$in1},[$inp],#16 3366 aese $dat2,q15 3367 // The iv for third block 3368 extr $midnumx,$ivh,$ivh,#32 3369 extr $ivh,$ivh,$ivl,#63 3370 and $tmpmw,$constnum,$midnum,asr#31 3371 eor $ivl,$tmpmx,$ivl,lsl#1 3372 fmov $ivd20,$ivl 3373 fmov $ivd21,$ivh 3374 veor $tmp3,$rndlast,$iv3 3375 vld1.8 {$in2},[$inp],#16 3376 aese $dat3,q15 3377 // The iv for fourth block 3378 extr $midnumx,$ivh,$ivh,#32 3379 extr $ivh,$ivh,$ivl,#63 3380 and $tmpmw,$constnum,$midnum,asr#31 3381 eor $ivl,$tmpmx,$ivl,lsl#1 3382 fmov $ivd30,$ivl 3383 fmov $ivd31,$ivh 3384 veor $tmp4,$rndlast,$iv4 3385 vld1.8 {$in3},[$inp],#16 3386 aese $dat4,q15 3387 3388 // The iv for fifth block 3389 extr $midnumx,$ivh,$ivh,#32 3390 extr $ivh,$ivh,$ivl,#63 3391 and $tmpmw,$constnum,$midnum,asr #31 3392 eor $ivl,$tmpmx,$ivl,lsl #1 3393 fmov $ivd40,$ivl 3394 fmov $ivd41,$ivh 3395 3396 vld1.8 {$in4},[$inp],#16 3397 cbz $xoffset,.Lxts_enc_tail4x 3398 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 3399 veor $tmp0,$tmp0,$dat0 3400 veor $dat0,$in0,$iv0 3401 veor $tmp1,$tmp1,$dat1 3402 veor $dat1,$in1,$iv1 3403 veor $tmp2,$tmp2,$dat2 3404 veor $dat2,$in2,$iv2 3405 
veor $tmp3,$tmp3,$dat3 3406 veor $dat3,$in3,$iv3 3407 veor $tmp4,$tmp4,$dat4 3408 vst1.8 {$tmp0},[$out],#16 3409 veor $dat4,$in4,$iv4 3410 vst1.8 {$tmp1},[$out],#16 3411 mov $rounds,$rounds0 3412 vst1.8 {$tmp2},[$out],#16 3413 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 3414 vst1.8 {$tmp3},[$out],#16 3415 vst1.8 {$tmp4},[$out],#16 3416 b.hs .Loop5x_xts_enc 3417 3418 3419 // If left 4 blocks, borrow the five block's processing. 3420 cmn $len,#0x10 3421 b.ne .Loop5x_enc_after 3422 vorr $iv4,$iv3,$iv3 3423 vorr $iv3,$iv2,$iv2 3424 vorr $iv2,$iv1,$iv1 3425 vorr $iv1,$iv0,$iv0 3426 fmov $ivl,$ivd40 3427 fmov $ivh,$ivd41 3428 veor $dat0,$iv0,$in0 3429 veor $dat1,$iv1,$in1 3430 veor $dat2,$in2,$iv2 3431 veor $dat3,$in3,$iv3 3432 veor $dat4,$in4,$iv4 3433 b.eq .Loop5x_xts_enc 3434 3435.Loop5x_enc_after: 3436 add $len,$len,#0x50 3437 cbz $len,.Lxts_enc_done 3438 3439 add $rounds,$rounds0,#2 3440 subs $len,$len,#0x30 3441 b.lo .Lxts_inner_enc_tail 3442 3443 veor $dat0,$iv0,$in2 3444 veor $dat1,$iv1,$in3 3445 veor $dat2,$in4,$iv2 3446 b .Lxts_outer_enc_tail 3447 3448.align 4 3449.Lxts_enc_tail4x: 3450 add $inp,$inp,#16 3451 veor $tmp1,$dat1,$tmp1 3452 vst1.8 {$tmp1},[$out],#16 3453 veor $tmp2,$dat2,$tmp2 3454 vst1.8 {$tmp2},[$out],#16 3455 veor $tmp3,$dat3,$tmp3 3456 veor $tmp4,$dat4,$tmp4 3457 vst1.8 {$tmp3-$tmp4},[$out],#32 3458 3459 b .Lxts_enc_done 3460.align 4 3461.Lxts_outer_enc_tail: 3462 aese $dat0,q8 3463 aesmc $dat0,$dat0 3464 aese $dat1,q8 3465 aesmc $dat1,$dat1 3466 aese $dat2,q8 3467 aesmc $dat2,$dat2 3468 vld1.32 {q8},[$key_],#16 3469 subs $rounds,$rounds,#2 3470 aese $dat0,q9 3471 aesmc $dat0,$dat0 3472 aese $dat1,q9 3473 aesmc $dat1,$dat1 3474 aese $dat2,q9 3475 aesmc $dat2,$dat2 3476 vld1.32 {q9},[$key_],#16 3477 b.gt .Lxts_outer_enc_tail 3478 3479 aese $dat0,q8 3480 aesmc $dat0,$dat0 3481 aese $dat1,q8 3482 aesmc $dat1,$dat1 3483 aese $dat2,q8 3484 aesmc $dat2,$dat2 3485 veor $tmp0,$iv0,$rndlast 3486 subs $len,$len,#0x30 3487 // The iv for first block 
3488 fmov $ivl,$ivd20 3489 fmov $ivh,$ivd21 3490 //mov $constnum,#0x87 3491 extr $midnumx,$ivh,$ivh,#32 3492 extr $ivh,$ivh,$ivl,#63 3493 and $tmpmw,$constnum,$midnum,asr#31 3494 eor $ivl,$tmpmx,$ivl,lsl#1 3495 fmov $ivd00,$ivl 3496 fmov $ivd01,$ivh 3497 veor $tmp1,$iv1,$rndlast 3498 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point 3499 aese $dat0,q9 3500 aesmc $dat0,$dat0 3501 aese $dat1,q9 3502 aesmc $dat1,$dat1 3503 aese $dat2,q9 3504 aesmc $dat2,$dat2 3505 veor $tmp2,$iv2,$rndlast 3506 3507 add $xoffset,$xoffset,#0x20 3508 add $inp,$inp,$xoffset 3509 mov $key_,$key1 3510 3511 aese $dat0,q12 3512 aesmc $dat0,$dat0 3513 aese $dat1,q12 3514 aesmc $dat1,$dat1 3515 aese $dat2,q12 3516 aesmc $dat2,$dat2 3517 aese $dat0,q13 3518 aesmc $dat0,$dat0 3519 aese $dat1,q13 3520 aesmc $dat1,$dat1 3521 aese $dat2,q13 3522 aesmc $dat2,$dat2 3523 aese $dat0,q14 3524 aesmc $dat0,$dat0 3525 aese $dat1,q14 3526 aesmc $dat1,$dat1 3527 aese $dat2,q14 3528 aesmc $dat2,$dat2 3529 aese $dat0,q15 3530 aese $dat1,q15 3531 aese $dat2,q15 3532 vld1.8 {$in2},[$inp],#16 3533 add $rounds,$rounds0,#2 3534 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 3535 veor $tmp0,$tmp0,$dat0 3536 veor $tmp1,$tmp1,$dat1 3537 veor $dat2,$dat2,$tmp2 3538 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 3539 vst1.8 {$tmp0},[$out],#16 3540 vst1.8 {$tmp1},[$out],#16 3541 vst1.8 {$dat2},[$out],#16 3542 cmn $len,#0x30 3543 b.eq .Lxts_enc_done 3544.Lxts_encxor_one: 3545 vorr $in3,$in1,$in1 3546 vorr $in4,$in2,$in2 3547 nop 3548 3549.Lxts_inner_enc_tail: 3550 cmn $len,#0x10 3551 veor $dat1,$in3,$iv0 3552 veor $dat2,$in4,$iv1 3553 b.eq .Lxts_enc_tail_loop 3554 veor $dat2,$in4,$iv0 3555.Lxts_enc_tail_loop: 3556 aese $dat1,q8 3557 aesmc $dat1,$dat1 3558 aese $dat2,q8 3559 aesmc $dat2,$dat2 3560 vld1.32 {q8},[$key_],#16 3561 subs $rounds,$rounds,#2 3562 aese $dat1,q9 3563 aesmc $dat1,$dat1 3564 aese $dat2,q9 3565 aesmc $dat2,$dat2 3566 vld1.32 {q9},[$key_],#16 3567 b.gt .Lxts_enc_tail_loop 3568 3569 
aese $dat1,q8 3570 aesmc $dat1,$dat1 3571 aese $dat2,q8 3572 aesmc $dat2,$dat2 3573 aese $dat1,q9 3574 aesmc $dat1,$dat1 3575 aese $dat2,q9 3576 aesmc $dat2,$dat2 3577 aese $dat1,q12 3578 aesmc $dat1,$dat1 3579 aese $dat2,q12 3580 aesmc $dat2,$dat2 3581 cmn $len,#0x20 3582 aese $dat1,q13 3583 aesmc $dat1,$dat1 3584 aese $dat2,q13 3585 aesmc $dat2,$dat2 3586 veor $tmp1,$iv0,$rndlast 3587 aese $dat1,q14 3588 aesmc $dat1,$dat1 3589 aese $dat2,q14 3590 aesmc $dat2,$dat2 3591 veor $tmp2,$iv1,$rndlast 3592 aese $dat1,q15 3593 aese $dat2,q15 3594 b.eq .Lxts_enc_one 3595 veor $tmp1,$tmp1,$dat1 3596 vst1.8 {$tmp1},[$out],#16 3597 veor $tmp2,$tmp2,$dat2 3598 vorr $iv0,$iv1,$iv1 3599 vst1.8 {$tmp2},[$out],#16 3600 fmov $ivl,$ivd10 3601 fmov $ivh,$ivd11 3602 mov $constnum,#0x87 3603 extr $midnumx,$ivh,$ivh,#32 3604 extr $ivh,$ivh,$ivl,#63 3605 and $tmpmw,$constnum,$midnum,asr #31 3606 eor $ivl,$tmpmx,$ivl,lsl #1 3607 fmov $ivd00,$ivl 3608 fmov $ivd01,$ivh 3609 b .Lxts_enc_done 3610 3611.Lxts_enc_one: 3612 veor $tmp1,$tmp1,$dat2 3613 vorr $iv0,$iv0,$iv0 3614 vst1.8 {$tmp1},[$out],#16 3615 fmov $ivl,$ivd00 3616 fmov $ivh,$ivd01 3617 mov $constnum,#0x87 3618 extr $midnumx,$ivh,$ivh,#32 3619 extr $ivh,$ivh,$ivl,#63 3620 and $tmpmw,$constnum,$midnum,asr #31 3621 eor $ivl,$tmpmx,$ivl,lsl #1 3622 fmov $ivd00,$ivl 3623 fmov $ivd01,$ivh 3624 b .Lxts_enc_done 3625.align 5 3626.Lxts_enc_done: 3627 // Process the tail block with cipher stealing. 
3628 tst $tailcnt,#0xf 3629 b.eq .Lxts_abort 3630 3631 mov $tmpinp,$inp 3632 mov $tmpoutp,$out 3633 sub $out,$out,#16 3634.composite_enc_loop: 3635 subs $tailcnt,$tailcnt,#1 3636 ldrb $l2outp,[$out,$tailcnt] 3637 ldrb $loutp,[$tmpinp,$tailcnt] 3638 strb $l2outp,[$tmpoutp,$tailcnt] 3639 strb $loutp,[$out,$tailcnt] 3640 b.gt .composite_enc_loop 3641.Lxts_enc_load_done: 3642 vld1.8 {$tmpin},[$out] 3643 veor $tmpin,$tmpin,$iv0 3644 3645 // Encrypt the composite block to get the last second encrypted text block 3646 ldr $rounds,[$key1,#240] // load key schedule... 3647 vld1.32 {$dat},[$key1],#16 3648 sub $rounds,$rounds,#2 3649 vld1.32 {$dat1},[$key1],#16 // load key schedule... 3650.Loop_final_enc: 3651 aese $tmpin,$dat0 3652 aesmc $tmpin,$tmpin 3653 vld1.32 {$dat0},[$key1],#16 3654 subs $rounds,$rounds,#2 3655 aese $tmpin,$dat1 3656 aesmc $tmpin,$tmpin 3657 vld1.32 {$dat1},[$key1],#16 3658 b.gt .Loop_final_enc 3659 3660 aese $tmpin,$dat0 3661 aesmc $tmpin,$tmpin 3662 vld1.32 {$dat0},[$key1] 3663 aese $tmpin,$dat1 3664 veor $tmpin,$tmpin,$dat0 3665 veor $tmpin,$tmpin,$iv0 3666 vst1.8 {$tmpin},[$out] 3667 3668.Lxts_abort: 3669 ldp $tailcnt,$midnumx,[sp,#48] 3670 ldp $ivd10,$ivd20,[sp,#32] 3671 ldp $ivd30,$ivd40,[sp,#16] 3672 ldp $constnumx,$tmpinp,[sp],#64 3673.Lxts_enc_final_abort: 3674 ret 3675.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt 3676___ 3677 3678}}} 3679{{{ 3680my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); 3681my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); 3682my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); 3683my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); 3684my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); 3685my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); 3686my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); 3687my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); 3688my 
($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); 3689 3690my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 3691 3692# q7 last round key 3693# q10-q15, q7 Last 7 round keys 3694# q8-q9 preloaded round keys except last 7 keys for big size 3695# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte 3696 3697{ 3698my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 3699 3700my ($dat3,$in3,$tmp3); # used only in 64-bit mode 3701my ($dat4,$in4,$tmp4); 3702if ($flavour =~ /64/) { 3703 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 3704} 3705 3706$code.=<<___ if ($flavour =~ /64/); 3707.globl ${prefix}_xts_decrypt 3708.type ${prefix}_xts_decrypt,%function 3709.align 5 3710${prefix}_xts_decrypt: 3711 AARCH64_VALID_CALL_TARGET 3712___ 3713$code.=<<___ if ($flavour =~ /64/); 3714 cmp $len,#16 3715 // Original input data size bigger than 16, jump to big size processing. 3716 b.ne .Lxts_dec_big_size 3717 // Encrypt the iv with key2, as the first XEX iv. 3718 ldr $rounds,[$key2,#240] 3719 vld1.32 {$dat},[$key2],#16 3720 vld1.8 {$iv0},[$ivp] 3721 sub $rounds,$rounds,#2 3722 vld1.32 {$dat1},[$key2],#16 3723 3724.Loop_dec_small_iv_enc: 3725 aese $iv0,$dat 3726 aesmc $iv0,$iv0 3727 vld1.32 {$dat},[$key2],#16 3728 subs $rounds,$rounds,#2 3729 aese $iv0,$dat1 3730 aesmc $iv0,$iv0 3731 vld1.32 {$dat1},[$key2],#16 3732 b.gt .Loop_dec_small_iv_enc 3733 3734 aese $iv0,$dat 3735 aesmc $iv0,$iv0 3736 vld1.32 {$dat},[$key2] 3737 aese $iv0,$dat1 3738 veor $iv0,$iv0,$dat 3739 3740 vld1.8 {$dat0},[$inp] 3741 veor $dat0,$iv0,$dat0 3742 3743 ldr $rounds,[$key1,#240] 3744 vld1.32 {q20-q21},[$key1],#32 // load key schedule... 3745 3746 aesd $dat0,q20 3747 aesimc $dat0,$dat0 3748 vld1.32 {q8-q9},[$key1],#32 // load key schedule... 
3749 aesd $dat0,q21 3750 aesimc $dat0,$dat0 3751 subs $rounds,$rounds,#10 // bias 3752 b.eq .Lxts_128_dec 3753.Lxts_dec_round_loop: 3754 aesd $dat0,q8 3755 aesimc $dat0,$dat0 3756 vld1.32 {q8},[$key1],#16 // load key schedule... 3757 aesd $dat0,q9 3758 aesimc $dat0,$dat0 3759 vld1.32 {q9},[$key1],#16 // load key schedule... 3760 subs $rounds,$rounds,#2 // bias 3761 b.gt .Lxts_dec_round_loop 3762.Lxts_128_dec: 3763 vld1.32 {q10-q11},[$key1],#32 // load key schedule... 3764 aesd $dat0,q8 3765 aesimc $dat0,$dat0 3766 aesd $dat0,q9 3767 aesimc $dat0,$dat0 3768 vld1.32 {q12-q13},[$key1],#32 // load key schedule... 3769 aesd $dat0,q10 3770 aesimc $dat0,$dat0 3771 aesd $dat0,q11 3772 aesimc $dat0,$dat0 3773 vld1.32 {q14-q15},[$key1],#32 // load key schedule... 3774 aesd $dat0,q12 3775 aesimc $dat0,$dat0 3776 aesd $dat0,q13 3777 aesimc $dat0,$dat0 3778 vld1.32 {$rndlast},[$key1] 3779 aesd $dat0,q14 3780 aesimc $dat0,$dat0 3781 aesd $dat0,q15 3782 veor $dat0,$dat0,$rndlast 3783 veor $dat0,$iv0,$dat0 3784 vst1.8 {$dat0},[$out] 3785 b .Lxts_dec_final_abort 3786.Lxts_dec_big_size: 3787___ 3788$code.=<<___ if ($flavour =~ /64/); 3789 stp $constnumx,$tmpinp,[sp,#-64]! 
3790 stp $tailcnt,$midnumx,[sp,#48] 3791 stp $ivd10,$ivd20,[sp,#32] 3792 stp $ivd30,$ivd40,[sp,#16] 3793 3794 and $tailcnt,$len,#0xf 3795 and $len,$len,#-16 3796 subs $len,$len,#16 3797 mov $step,#16 3798 b.lo .Lxts_dec_abort 3799 3800 // Encrypt the iv with key2, as the first XEX iv 3801 ldr $rounds,[$key2,#240] 3802 vld1.32 {$dat},[$key2],#16 3803 vld1.8 {$iv0},[$ivp] 3804 sub $rounds,$rounds,#2 3805 vld1.32 {$dat1},[$key2],#16 3806 3807.Loop_dec_iv_enc: 3808 aese $iv0,$dat 3809 aesmc $iv0,$iv0 3810 vld1.32 {$dat},[$key2],#16 3811 subs $rounds,$rounds,#2 3812 aese $iv0,$dat1 3813 aesmc $iv0,$iv0 3814 vld1.32 {$dat1},[$key2],#16 3815 b.gt .Loop_dec_iv_enc 3816 3817 aese $iv0,$dat 3818 aesmc $iv0,$iv0 3819 vld1.32 {$dat},[$key2] 3820 aese $iv0,$dat1 3821 veor $iv0,$iv0,$dat 3822 3823 // The iv for second block 3824 // $ivl- iv(low), $ivh - iv(high) 3825 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 3826 fmov $ivl,$ivd00 3827 fmov $ivh,$ivd01 3828 mov $constnum,#0x87 3829 extr $midnumx,$ivh,$ivh,#32 3830 extr $ivh,$ivh,$ivl,#63 3831 and $tmpmw,$constnum,$midnum,asr #31 3832 eor $ivl,$tmpmx,$ivl,lsl #1 3833 fmov $ivd10,$ivl 3834 fmov $ivd11,$ivh 3835 3836 ldr $rounds0,[$key1,#240] // load rounds number 3837 3838 // The iv for third block 3839 extr $midnumx,$ivh,$ivh,#32 3840 extr $ivh,$ivh,$ivl,#63 3841 and $tmpmw,$constnum,$midnum,asr #31 3842 eor $ivl,$tmpmx,$ivl,lsl #1 3843 fmov $ivd20,$ivl 3844 fmov $ivd21,$ivh 3845 3846 vld1.32 {q8-q9},[$key1] // load key schedule... 3847 sub $rounds0,$rounds0,#6 3848 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys 3849 sub $rounds0,$rounds0,#2 3850 vld1.32 {q10-q11},[$key_],#32 // load key schedule... 
3851 vld1.32 {q12-q13},[$key_],#32 3852 vld1.32 {q14-q15},[$key_],#32 3853 vld1.32 {$rndlast},[$key_] 3854 3855 // The iv for fourth block 3856 extr $midnumx,$ivh,$ivh,#32 3857 extr $ivh,$ivh,$ivl,#63 3858 and $tmpmw,$constnum,$midnum,asr #31 3859 eor $ivl,$tmpmx,$ivl,lsl #1 3860 fmov $ivd30,$ivl 3861 fmov $ivd31,$ivh 3862 3863 add $key_,$key1,#32 3864 mov $rounds,$rounds0 3865 b .Lxts_dec 3866 3867 // Decryption 3868.align 5 3869.Lxts_dec: 3870 tst $tailcnt,#0xf 3871 b.eq .Lxts_dec_begin 3872 subs $len,$len,#16 3873 csel $step,xzr,$step,eq 3874 vld1.8 {$dat},[$inp],#16 3875 b.lo .Lxts_done 3876 sub $inp,$inp,#16 3877.Lxts_dec_begin: 3878 vld1.8 {$dat},[$inp],$step 3879 subs $len,$len,#32 // bias 3880 add $rounds,$rounds0,#2 3881 vorr $in1,$dat,$dat 3882 vorr $dat1,$dat,$dat 3883 vorr $in3,$dat,$dat 3884 vld1.8 {$dat2},[$inp],#16 3885 vorr $in2,$dat2,$dat2 3886 vorr $in4,$dat2,$dat2 3887 b.lo .Lxts_inner_dec_tail 3888 veor $dat,$dat,$iv0 // before decryt, xor with iv 3889 veor $dat2,$dat2,$iv1 3890 3891 vorr $dat1,$dat2,$dat2 3892 vld1.8 {$dat2},[$inp],#16 3893 vorr $in0,$dat,$dat 3894 vorr $in1,$dat1,$dat1 3895 veor $in2,$dat2,$iv2 // third block xox with third iv 3896 veor $dat2,$dat2,$iv2 3897 cmp $len,#32 3898 b.lo .Lxts_outer_dec_tail 3899 3900 vld1.8 {$dat3},[$inp],#16 3901 3902 // The iv for fifth block 3903 extr $midnumx,$ivh,$ivh,#32 3904 extr $ivh,$ivh,$ivl,#63 3905 and $tmpmw,$constnum,$midnum,asr #31 3906 eor $ivl,$tmpmx,$ivl,lsl #1 3907 fmov $ivd40,$ivl 3908 fmov $ivd41,$ivh 3909 3910 vld1.8 {$dat4},[$inp],#16 3911 veor $dat3,$dat3,$iv3 // the fourth block 3912 veor $dat4,$dat4,$iv4 3913 sub $len,$len,#32 // bias 3914 mov $rounds,$rounds0 3915 b .Loop5x_xts_dec 3916 3917.align 4 3918.Loop5x_xts_dec: 3919 aesd $dat0,q8 3920 aesimc $dat0,$dat0 3921 aesd $dat1,q8 3922 aesimc $dat1,$dat1 3923 aesd $dat2,q8 3924 aesimc $dat2,$dat2 3925 aesd $dat3,q8 3926 aesimc $dat3,$dat3 3927 aesd $dat4,q8 3928 aesimc $dat4,$dat4 3929 vld1.32 {q8},[$key_],#16 // load key 
schedule... 3930 subs $rounds,$rounds,#2 3931 aesd $dat0,q9 3932 aesimc $dat0,$dat0 3933 aesd $dat1,q9 3934 aesimc $dat1,$dat1 3935 aesd $dat2,q9 3936 aesimc $dat2,$dat2 3937 aesd $dat3,q9 3938 aesimc $dat3,$dat3 3939 aesd $dat4,q9 3940 aesimc $dat4,$dat4 3941 vld1.32 {q9},[$key_],#16 // load key schedule... 3942 b.gt .Loop5x_xts_dec 3943 3944 aesd $dat0,q8 3945 aesimc $dat0,$dat0 3946 aesd $dat1,q8 3947 aesimc $dat1,$dat1 3948 aesd $dat2,q8 3949 aesimc $dat2,$dat2 3950 aesd $dat3,q8 3951 aesimc $dat3,$dat3 3952 aesd $dat4,q8 3953 aesimc $dat4,$dat4 3954 subs $len,$len,#0x50 // because .Lxts_dec_tail4x 3955 3956 aesd $dat0,q9 3957 aesimc $dat0,$dat 3958 aesd $dat1,q9 3959 aesimc $dat1,$dat1 3960 aesd $dat2,q9 3961 aesimc $dat2,$dat2 3962 aesd $dat3,q9 3963 aesimc $dat3,$dat3 3964 aesd $dat4,q9 3965 aesimc $dat4,$dat4 3966 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo 3967 mov $key_,$key1 3968 3969 aesd $dat0,q10 3970 aesimc $dat0,$dat0 3971 aesd $dat1,q10 3972 aesimc $dat1,$dat1 3973 aesd $dat2,q10 3974 aesimc $dat2,$dat2 3975 aesd $dat3,q10 3976 aesimc $dat3,$dat3 3977 aesd $dat4,q10 3978 aesimc $dat4,$dat4 3979 add $inp,$inp,$xoffset // x0 is adjusted in such way that 3980 // at exit from the loop v1.16b-v26.16b 3981 // are loaded with last "words" 3982 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x 3983 3984 aesd $dat0,q11 3985 aesimc $dat0,$dat0 3986 aesd $dat1,q11 3987 aesimc $dat1,$dat1 3988 aesd $dat2,q11 3989 aesimc $dat2,$dat2 3990 aesd $dat3,q11 3991 aesimc $dat3,$dat3 3992 aesd $dat4,q11 3993 aesimc $dat4,$dat4 3994 3995 aesd $dat0,q12 3996 aesimc $dat0,$dat0 3997 aesd $dat1,q12 3998 aesimc $dat1,$dat1 3999 aesd $dat2,q12 4000 aesimc $dat2,$dat2 4001 aesd $dat3,q12 4002 aesimc $dat3,$dat3 4003 aesd $dat4,q12 4004 aesimc $dat4,$dat4 4005 4006 aesd $dat0,q13 4007 aesimc $dat0,$dat0 4008 aesd $dat1,q13 4009 aesimc $dat1,$dat1 4010 aesd $dat2,q13 4011 aesimc $dat2,$dat2 4012 aesd $dat3,q13 4013 aesimc $dat3,$dat3 4014 aesd $dat4,q13 4015 
aesimc $dat4,$dat4 4016 4017 aesd $dat0,q14 4018 aesimc $dat0,$dat0 4019 aesd $dat1,q14 4020 aesimc $dat1,$dat1 4021 aesd $dat2,q14 4022 aesimc $dat2,$dat2 4023 aesd $dat3,q14 4024 aesimc $dat3,$dat3 4025 aesd $dat4,q14 4026 aesimc $dat4,$dat4 4027 4028 veor $tmp0,$rndlast,$iv0 4029 aesd $dat0,q15 4030 // The iv for first block of next iteration. 4031 extr $midnumx,$ivh,$ivh,#32 4032 extr $ivh,$ivh,$ivl,#63 4033 and $tmpmw,$constnum,$midnum,asr #31 4034 eor $ivl,$tmpmx,$ivl,lsl #1 4035 fmov $ivd00,$ivl 4036 fmov $ivd01,$ivh 4037 veor $tmp1,$rndlast,$iv1 4038 vld1.8 {$in0},[$inp],#16 4039 aesd $dat1,q15 4040 // The iv for second block 4041 extr $midnumx,$ivh,$ivh,#32 4042 extr $ivh,$ivh,$ivl,#63 4043 and $tmpmw,$constnum,$midnum,asr #31 4044 eor $ivl,$tmpmx,$ivl,lsl #1 4045 fmov $ivd10,$ivl 4046 fmov $ivd11,$ivh 4047 veor $tmp2,$rndlast,$iv2 4048 vld1.8 {$in1},[$inp],#16 4049 aesd $dat2,q15 4050 // The iv for third block 4051 extr $midnumx,$ivh,$ivh,#32 4052 extr $ivh,$ivh,$ivl,#63 4053 and $tmpmw,$constnum,$midnum,asr #31 4054 eor $ivl,$tmpmx,$ivl,lsl #1 4055 fmov $ivd20,$ivl 4056 fmov $ivd21,$ivh 4057 veor $tmp3,$rndlast,$iv3 4058 vld1.8 {$in2},[$inp],#16 4059 aesd $dat3,q15 4060 // The iv for fourth block 4061 extr $midnumx,$ivh,$ivh,#32 4062 extr $ivh,$ivh,$ivl,#63 4063 and $tmpmw,$constnum,$midnum,asr #31 4064 eor $ivl,$tmpmx,$ivl,lsl #1 4065 fmov $ivd30,$ivl 4066 fmov $ivd31,$ivh 4067 veor $tmp4,$rndlast,$iv4 4068 vld1.8 {$in3},[$inp],#16 4069 aesd $dat4,q15 4070 4071 // The iv for fifth block 4072 extr $midnumx,$ivh,$ivh,#32 4073 extr $ivh,$ivh,$ivl,#63 4074 and $tmpmw,$constnum,$midnum,asr #31 4075 eor $ivl,$tmpmx,$ivl,lsl #1 4076 fmov $ivd40,$ivl 4077 fmov $ivd41,$ivh 4078 4079 vld1.8 {$in4},[$inp],#16 4080 cbz $xoffset,.Lxts_dec_tail4x 4081 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 4082 veor $tmp0,$tmp0,$dat0 4083 veor $dat0,$in0,$iv0 4084 veor $tmp1,$tmp1,$dat1 4085 veor $dat1,$in1,$iv1 4086 veor $tmp2,$tmp2,$dat2 4087 veor $dat2,$in2,$iv2 4088 
veor $tmp3,$tmp3,$dat3 4089 veor $dat3,$in3,$iv3 4090 veor $tmp4,$tmp4,$dat4 4091 vst1.8 {$tmp0},[$out],#16 4092 veor $dat4,$in4,$iv4 4093 vst1.8 {$tmp1},[$out],#16 4094 mov $rounds,$rounds0 4095 vst1.8 {$tmp2},[$out],#16 4096 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 4097 vst1.8 {$tmp3},[$out],#16 4098 vst1.8 {$tmp4},[$out],#16 4099 b.hs .Loop5x_xts_dec 4100 4101 cmn $len,#0x10 4102 b.ne .Loop5x_dec_after 4103 // If x2($len) equal to -0x10, the left blocks is 4. 4104 // After specially processing, utilize the five blocks processing again. 4105 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 4106 vorr $iv4,$iv3,$iv3 4107 vorr $iv3,$iv2,$iv2 4108 vorr $iv2,$iv1,$iv1 4109 vorr $iv1,$iv0,$iv0 4110 fmov $ivl,$ivd40 4111 fmov $ivh,$ivd41 4112 veor $dat0,$iv0,$in0 4113 veor $dat1,$iv1,$in1 4114 veor $dat2,$in2,$iv2 4115 veor $dat3,$in3,$iv3 4116 veor $dat4,$in4,$iv4 4117 b.eq .Loop5x_xts_dec 4118 4119.Loop5x_dec_after: 4120 add $len,$len,#0x50 4121 cbz $len,.Lxts_done 4122 4123 add $rounds,$rounds0,#2 4124 subs $len,$len,#0x30 4125 b.lo .Lxts_inner_dec_tail 4126 4127 veor $dat0,$iv0,$in2 4128 veor $dat1,$iv1,$in3 4129 veor $dat2,$in4,$iv2 4130 b .Lxts_outer_dec_tail 4131 4132.align 4 4133.Lxts_dec_tail4x: 4134 add $inp,$inp,#16 4135 tst $tailcnt,#0xf 4136 veor $tmp1,$dat1,$tmp0 4137 vst1.8 {$tmp1},[$out],#16 4138 veor $tmp2,$dat2,$tmp2 4139 vst1.8 {$tmp2},[$out],#16 4140 veor $tmp3,$dat3,$tmp3 4141 veor $tmp4,$dat4,$tmp4 4142 vst1.8 {$tmp3-$tmp4},[$out],#32 4143 4144 b.eq .Lxts_dec_abort 4145 vld1.8 {$dat0},[$inp],#16 4146 b .Lxts_done 4147.align 4 4148.Lxts_outer_dec_tail: 4149 aesd $dat0,q8 4150 aesimc $dat0,$dat0 4151 aesd $dat1,q8 4152 aesimc $dat1,$dat1 4153 aesd $dat2,q8 4154 aesimc $dat2,$dat2 4155 vld1.32 {q8},[$key_],#16 4156 subs $rounds,$rounds,#2 4157 aesd $dat0,q9 4158 aesimc $dat0,$dat0 4159 aesd $dat1,q9 4160 aesimc $dat1,$dat1 4161 aesd $dat2,q9 4162 aesimc $dat2,$dat2 4163 vld1.32 {q9},[$key_],#16 4164 b.gt .Lxts_outer_dec_tail 4165 
4166 aesd $dat0,q8 4167 aesimc $dat0,$dat0 4168 aesd $dat1,q8 4169 aesimc $dat1,$dat1 4170 aesd $dat2,q8 4171 aesimc $dat2,$dat2 4172 veor $tmp0,$iv0,$rndlast 4173 subs $len,$len,#0x30 4174 // The iv for first block 4175 fmov $ivl,$ivd20 4176 fmov $ivh,$ivd21 4177 mov $constnum,#0x87 4178 extr $midnumx,$ivh,$ivh,#32 4179 extr $ivh,$ivh,$ivl,#63 4180 and $tmpmw,$constnum,$midnum,asr #31 4181 eor $ivl,$tmpmx,$ivl,lsl #1 4182 fmov $ivd00,$ivl 4183 fmov $ivd01,$ivh 4184 veor $tmp1,$iv1,$rndlast 4185 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point 4186 aesd $dat0,q9 4187 aesimc $dat0,$dat0 4188 aesd $dat1,q9 4189 aesimc $dat1,$dat1 4190 aesd $dat2,q9 4191 aesimc $dat2,$dat2 4192 veor $tmp2,$iv2,$rndlast 4193 // The iv for second block 4194 extr $midnumx,$ivh,$ivh,#32 4195 extr $ivh,$ivh,$ivl,#63 4196 and $tmpmw,$constnum,$midnum,asr #31 4197 eor $ivl,$tmpmx,$ivl,lsl #1 4198 fmov $ivd10,$ivl 4199 fmov $ivd11,$ivh 4200 4201 add $xoffset,$xoffset,#0x20 4202 add $inp,$inp,$xoffset // $inp is adjusted to the last data 4203 4204 mov $key_,$key1 4205 4206 // The iv for third block 4207 extr $midnumx,$ivh,$ivh,#32 4208 extr $ivh,$ivh,$ivl,#63 4209 and $tmpmw,$constnum,$midnum,asr #31 4210 eor $ivl,$tmpmx,$ivl,lsl #1 4211 fmov $ivd20,$ivl 4212 fmov $ivd21,$ivh 4213 4214 aesd $dat0,q12 4215 aesimc $dat0,$dat0 4216 aesd $dat1,q12 4217 aesimc $dat1,$dat1 4218 aesd $dat2,q12 4219 aesimc $dat2,$dat2 4220 aesd $dat0,q13 4221 aesimc $dat0,$dat0 4222 aesd $dat1,q13 4223 aesimc $dat1,$dat1 4224 aesd $dat2,q13 4225 aesimc $dat2,$dat2 4226 aesd $dat0,q14 4227 aesimc $dat0,$dat0 4228 aesd $dat1,q14 4229 aesimc $dat1,$dat1 4230 aesd $dat2,q14 4231 aesimc $dat2,$dat2 4232 vld1.8 {$in2},[$inp],#16 4233 aesd $dat0,q15 4234 aesd $dat1,q15 4235 aesd $dat2,q15 4236 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 4237 add $rounds,$rounds0,#2 4238 veor $tmp0,$tmp0,$dat0 4239 veor $tmp1,$tmp1,$dat1 4240 veor $dat2,$dat2,$tmp2 4241 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 
4242 vst1.8 {$tmp0},[$out],#16 4243 vst1.8 {$tmp1},[$out],#16 4244 vst1.8 {$dat2},[$out],#16 4245 4246 cmn $len,#0x30 4247 add $len,$len,#0x30 4248 b.eq .Lxts_done 4249 sub $len,$len,#0x30 4250 vorr $in3,$in1,$in1 4251 vorr $in4,$in2,$in2 4252 nop 4253 4254.Lxts_inner_dec_tail: 4255 // $len == -0x10 means two blocks left. 4256 cmn $len,#0x10 4257 veor $dat1,$in3,$iv0 4258 veor $dat2,$in4,$iv1 4259 b.eq .Lxts_dec_tail_loop 4260 veor $dat2,$in4,$iv0 4261.Lxts_dec_tail_loop: 4262 aesd $dat1,q8 4263 aesimc $dat1,$dat1 4264 aesd $dat2,q8 4265 aesimc $dat2,$dat2 4266 vld1.32 {q8},[$key_],#16 4267 subs $rounds,$rounds,#2 4268 aesd $dat1,q9 4269 aesimc $dat1,$dat1 4270 aesd $dat2,q9 4271 aesimc $dat2,$dat2 4272 vld1.32 {q9},[$key_],#16 4273 b.gt .Lxts_dec_tail_loop 4274 4275 aesd $dat1,q8 4276 aesimc $dat1,$dat1 4277 aesd $dat2,q8 4278 aesimc $dat2,$dat2 4279 aesd $dat1,q9 4280 aesimc $dat1,$dat1 4281 aesd $dat2,q9 4282 aesimc $dat2,$dat2 4283 aesd $dat1,q12 4284 aesimc $dat1,$dat1 4285 aesd $dat2,q12 4286 aesimc $dat2,$dat2 4287 cmn $len,#0x20 4288 aesd $dat1,q13 4289 aesimc $dat1,$dat1 4290 aesd $dat2,q13 4291 aesimc $dat2,$dat2 4292 veor $tmp1,$iv0,$rndlast 4293 aesd $dat1,q14 4294 aesimc $dat1,$dat1 4295 aesd $dat2,q14 4296 aesimc $dat2,$dat2 4297 veor $tmp2,$iv1,$rndlast 4298 aesd $dat1,q15 4299 aesd $dat2,q15 4300 b.eq .Lxts_dec_one 4301 veor $tmp1,$tmp1,$dat1 4302 veor $tmp2,$tmp2,$dat2 4303 vorr $iv0,$iv2,$iv2 4304 vorr $iv1,$iv3,$iv3 4305 vst1.8 {$tmp1},[$out],#16 4306 vst1.8 {$tmp2},[$out],#16 4307 add $len,$len,#16 4308 b .Lxts_done 4309 4310.Lxts_dec_one: 4311 veor $tmp1,$tmp1,$dat2 4312 vorr $iv0,$iv1,$iv1 4313 vorr $iv1,$iv2,$iv2 4314 vst1.8 {$tmp1},[$out],#16 4315 add $len,$len,#32 4316 4317.Lxts_done: 4318 tst $tailcnt,#0xf 4319 b.eq .Lxts_dec_abort 4320 // Processing the last two blocks with cipher stealing. 
4321 mov x7,x3 4322 cbnz x2,.Lxts_dec_1st_done 4323 vld1.8 {$dat0},[$inp],#16 4324 4325 // Decrypt the last second block to get the last plain text block 4326.Lxts_dec_1st_done: 4327 eor $tmpin,$dat0,$iv1 4328 ldr $rounds,[$key1,#240] 4329 vld1.32 {$dat0},[$key1],#16 4330 sub $rounds,$rounds,#2 4331 vld1.32 {$dat1},[$key1],#16 4332.Loop_final_2nd_dec: 4333 aesd $tmpin,$dat0 4334 aesimc $tmpin,$tmpin 4335 vld1.32 {$dat0},[$key1],#16 // load key schedule... 4336 subs $rounds,$rounds,#2 4337 aesd $tmpin,$dat1 4338 aesimc $tmpin,$tmpin 4339 vld1.32 {$dat1},[$key1],#16 // load key schedule... 4340 b.gt .Loop_final_2nd_dec 4341 4342 aesd $tmpin,$dat0 4343 aesimc $tmpin,$tmpin 4344 vld1.32 {$dat0},[$key1] 4345 aesd $tmpin,$dat1 4346 veor $tmpin,$tmpin,$dat0 4347 veor $tmpin,$tmpin,$iv1 4348 vst1.8 {$tmpin},[$out] 4349 4350 mov $tmpinp,$inp 4351 add $tmpoutp,$out,#16 4352 4353 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks 4354 // to get the last encrypted block. 4355.composite_dec_loop: 4356 subs $tailcnt,$tailcnt,#1 4357 ldrb $l2outp,[$out,$tailcnt] 4358 ldrb $loutp,[$tmpinp,$tailcnt] 4359 strb $l2outp,[$tmpoutp,$tailcnt] 4360 strb $loutp,[$out,$tailcnt] 4361 b.gt .composite_dec_loop 4362.Lxts_dec_load_done: 4363 vld1.8 {$tmpin},[$out] 4364 veor $tmpin,$tmpin,$iv0 4365 4366 // Decrypt the composite block to get the last second plain text block 4367 ldr $rounds,[$key_,#240] 4368 vld1.32 {$dat},[$key_],#16 4369 sub $rounds,$rounds,#2 4370 vld1.32 {$dat1},[$key_],#16 4371.Loop_final_dec: 4372 aesd $tmpin,$dat0 4373 aesimc $tmpin,$tmpin 4374 vld1.32 {$dat0},[$key_],#16 // load key schedule... 4375 subs $rounds,$rounds,#2 4376 aesd $tmpin,$dat1 4377 aesimc $tmpin,$tmpin 4378 vld1.32 {$dat1},[$key_],#16 // load key schedule... 
4379 b.gt .Loop_final_dec 4380 4381 aesd $tmpin,$dat0 4382 aesimc $tmpin,$tmpin 4383 vld1.32 {$dat0},[$key_] 4384 aesd $tmpin,$dat1 4385 veor $tmpin,$tmpin,$dat0 4386 veor $tmpin,$tmpin,$iv0 4387 vst1.8 {$tmpin},[$out] 4388 4389.Lxts_dec_abort: 4390 ldp $tailcnt,$midnumx,[sp,#48] 4391 ldp $ivd10,$ivd20,[sp,#32] 4392 ldp $ivd30,$ivd40,[sp,#16] 4393 ldp $constnumx,$tmpinp,[sp],#64 4394 4395.Lxts_dec_final_abort: 4396 ret 4397.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt 4398___ 4399} 4400}}} 4401$code.=<<___; 4402#endif 4403___ 4404######################################## 4405if ($flavour =~ /64/) { ######## 64-bit code 4406 my %opcode = ( 4407 "aesd" => 0x4e285800, "aese" => 0x4e284800, 4408 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800, 4409 "eor3" => 0xce000000, ); 4410 4411 local *unaes = sub { 4412 my ($mnemonic,$arg)=@_; 4413 4414 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && 4415 sprintf ".inst\t0x%08x\t//%s %s", 4416 $opcode{$mnemonic}|$1|($2<<5), 4417 $mnemonic,$arg; 4418 }; 4419 4420 sub unsha3 { 4421 my ($mnemonic,$arg)=@_; 4422 4423 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/ 4424 && 4425 sprintf ".inst\t0x%08x\t//%s %s", 4426 $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10), 4427 $mnemonic,$arg; 4428 } 4429 4430 foreach(split("\n",$code)) { 4431 s/\`([^\`]*)\`/eval($1)/geo; 4432 4433 s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo; # old->new registers 4434 s/\bq_([0-9]+)\b/"q".$1/geo; # old->new registers 4435 s/@\s/\/\//o; # old->new style commentary 4436 4437 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 4438 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 4439 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or 4440 s/vmov\.i8/movi/o or # fix up legacy mnemonics 4441 s/vext\.8/ext/o or 4442 s/vrev32\.8/rev32/o or 4443 s/vtst\.8/cmtst/o or 4444 s/vshr/ushr/o or 4445 s/^(\s+)v/$1/o or # strip off v prefix 4446 s/\bbx\s+lr\b/ret/o; 4447 
s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge; 4448 4449 # fix up remaining legacy suffixes 4450 s/\.[ui]?8//o; 4451 m/\],#8/o and s/\.16b/\.8b/go; 4452 s/\.[ui]?32//o and s/\.16b/\.4s/go; 4453 s/\.[ui]?64//o and s/\.16b/\.2d/go; 4454 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 4455 4456 # Switch preprocessor checks to aarch64 versions. 4457 s/__ARME([BL])__/__AARCH64E$1__/go; 4458 4459 print $_,"\n"; 4460 } 4461} else { ######## 32-bit code 4462 my %opcode = ( 4463 "aesd" => 0xf3b00340, "aese" => 0xf3b00300, 4464 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); 4465 4466 local *unaes = sub { 4467 my ($mnemonic,$arg)=@_; 4468 4469 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { 4470 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 4471 |(($2&7)<<1) |(($2&8)<<2); 4472 # since ARMv7 instructions are always encoded little-endian. 4473 # correct solution is to use .inst directive, but older 4474 # assemblers don't implement it:-( 4475 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", 4476 $word&0xff,($word>>8)&0xff, 4477 ($word>>16)&0xff,($word>>24)&0xff, 4478 $mnemonic,$arg; 4479 } 4480 }; 4481 4482 sub unvtbl { 4483 my $arg=shift; 4484 4485 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && 4486 sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 
4487 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 4488 } 4489 4490 sub unvdup32 { 4491 my $arg=shift; 4492 4493 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 4494 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 4495 } 4496 4497 sub unvmov32 { 4498 my $arg=shift; 4499 4500 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && 4501 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 4502 } 4503 4504 foreach(split("\n",$code)) { 4505 s/\`([^\`]*)\`/eval($1)/geo; 4506 4507 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 4508 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 4509 s/\/\/\s?/@ /o; # new->old style commentary 4510 4511 # fix up remaining new-style suffixes 4512 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or 4513 s/\],#[0-9]+/]!/o; 4514 4515 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 4516 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or 4517 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or 4518 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 4519 s/vmov\.32\s+(.*)/unvmov32($1)/geo or 4520 s/^(\s+)b\./$1b/o or 4521 s/^(\s+)ret/$1bx\tlr/o; 4522 4523 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { 4524 print " it $2\n"; 4525 } 4526 4527 print $_,"\n"; 4528 } 4529} 4530 4531close STDOUT or die "error closing STDOUT: $!"; 4532