#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. As does it support both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# April 2019
#
# Key to performance of parallelize-able modes is round instruction
# interleaving. But which factor to use? There is optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing interleave factor doesn't pay off. While on cons
# side we have code size increase and resource waste on platforms for
# which interleave factor is too high. In other words you want it to
# be just right. So far interleave factor of 3x was serving well all
# platforms. But for ThunderX2 optimal interleave factor was measured
# to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still same even for updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through arm-xlate.pl, which transliterates
# the mixed 32-/64-bit mnemonics below into the requested flavour.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";	# symbol prefix for every exported routine

$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex vodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"	if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block("en"|"de") emits a single-block encrypt or decrypt routine.
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

# Performance in cycles per byte.
# Processed with AES-ECB different key size.
# It shows the value before and after optimization as below:
# (before/after):
#
#		AES-128-ECB		AES-192-ECB		AES-256-ECB
# Cortex-A57	1.85/0.82		2.16/0.96		2.47/1.10
# Cortex-A72	1.64/0.85		1.82/0.99		2.13/1.14

# Optimization is implemented by loop unrolling and interleaving.
# Commonly, we choose the unrolling factor as 5, if the input
# data size smaller than 5 blocks, but not smaller than 3 blocks,
# choose 3 as the unrolling factor.
# If the input data size dsize >= 5*16 bytes, then take 5 blocks
# as one iteration, every loop the left size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
# every loop lsize -=3*16.
# If lsize < 3*16 bytes, treat them as the tail, interleave the
# two blocks AES instructions.
# There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
# performance: one independent code block without LR, FP load and
# store, just looks like what the original ECB implementation does.

{{{
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0					// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32			// load key schedule...

	b.eq	.Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32			// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// if rounds==10, jump to aes-128-ecb processing
	b.eq	.Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16				// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16				// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt	.Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32			// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// bias
	b.eq	.Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16				// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16				// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt	.Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}			@ ABI specification says so
	ldmia	ip,{r4-r5}			@ load remaining args
	subs	$len,$len,#16
___
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0					// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]				// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4			// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32				// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32			// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40			// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt			// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6			// $inp is adjusted in such way that
						// at exit from the loop $dat1-$dat4
						// are loaded with last "words"
	add	x6,$len,#0x60			// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len				// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6			// $inp is adjusted in such way that
						// at exit from the loop $dat1-$dat2
						// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32			// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40			// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt			// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6			// $inp is adjusted in such way that
						// at exit from the loop $dat1-$dat4
						// are loaded with last "words"
	add	x6,$len,#0x60			// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len				// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6			// $inp is adjusted in such way that
						// at exit from the loop $dat1-$dat2
						// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___	if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
1242.type ${prefix}_cbc_encrypt,%function 1243.align 5 1244${prefix}_cbc_encrypt: 1245___ 1246$code.=<<___ if ($flavour =~ /64/); 1247 AARCH64_VALID_CALL_TARGET 1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1249 stp x29,x30,[sp,#-16]! 1250 add x29,sp,#0 1251___ 1252$code.=<<___ if ($flavour !~ /64/); 1253 mov ip,sp 1254 stmdb sp!,{r4-r8,lr} 1255 vstmdb sp!,{d8-d15} @ ABI specification says so 1256 ldmia ip,{r4-r5} @ load remaining args 1257___ 1258$code.=<<___; 1259 subs $len,$len,#16 1260 mov $step,#16 1261 b.lo .Lcbc_abort 1262 cclr $step,eq 1263 1264 cmp $enc,#0 // en- or decrypting? 1265 ldr $rounds,[$key,#240] 1266 and $len,$len,#-16 1267 vld1.8 {$ivec},[$ivp] 1268 vld1.8 {$dat},[$inp],$step 1269 1270 vld1.32 {q8-q9},[$key] // load key schedule... 1271 sub $rounds,$rounds,#6 1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 1273 sub $rounds,$rounds,#2 1274 vld1.32 {q10-q11},[$key_],#32 1275 vld1.32 {q12-q13},[$key_],#32 1276 vld1.32 {q14-q15},[$key_],#32 1277 vld1.32 {$rndlast},[$key_] 1278 1279 add $key_,$key,#32 1280 mov $cnt,$rounds 1281 b.eq .Lcbc_dec 1282 1283 cmp $rounds,#2 1284 veor $dat,$dat,$ivec 1285 veor $rndzero_n_last,q8,$rndlast 1286 b.eq .Lcbc_enc128 1287 1288 vld1.32 {$in0-$in1},[$key_] 1289 add $key_,$key,#16 1290 add $key4,$key,#16*4 1291 add $key5,$key,#16*5 1292 aese $dat,q8 1293 aesmc $dat,$dat 1294 add $key6,$key,#16*6 1295 add $key7,$key,#16*7 1296 b .Lenter_cbc_enc 1297 1298.align 4 1299.Loop_cbc_enc: 1300 aese $dat,q8 1301 aesmc $dat,$dat 1302 vst1.8 {$ivec},[$out],#16 1303.Lenter_cbc_enc: 1304 aese $dat,q9 1305 aesmc $dat,$dat 1306 aese $dat,$in0 1307 aesmc $dat,$dat 1308 vld1.32 {q8},[$key4] 1309 cmp $rounds,#4 1310 aese $dat,$in1 1311 aesmc $dat,$dat 1312 vld1.32 {q9},[$key5] 1313 b.eq .Lcbc_enc192 1314 1315 aese $dat,q8 1316 aesmc $dat,$dat 1317 vld1.32 {q8},[$key6] 1318 aese $dat,q9 1319 aesmc $dat,$dat 1320 vld1.32 {q9},[$key7] 1321 nop 1322 1323.Lcbc_enc192: 1324 aese $dat,q8 1325 
aesmc $dat,$dat 1326 subs $len,$len,#16 1327 aese $dat,q9 1328 aesmc $dat,$dat 1329 cclr $step,eq 1330 aese $dat,q10 1331 aesmc $dat,$dat 1332 aese $dat,q11 1333 aesmc $dat,$dat 1334 vld1.8 {q8},[$inp],$step 1335 aese $dat,q12 1336 aesmc $dat,$dat 1337 veor q8,q8,$rndzero_n_last 1338 aese $dat,q13 1339 aesmc $dat,$dat 1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1] 1341 aese $dat,q14 1342 aesmc $dat,$dat 1343 aese $dat,q15 1344 veor $ivec,$dat,$rndlast 1345 b.hs .Loop_cbc_enc 1346 1347 vst1.8 {$ivec},[$out],#16 1348 b .Lcbc_done 1349 1350.align 5 1351.Lcbc_enc128: 1352 vld1.32 {$in0-$in1},[$key_] 1353 aese $dat,q8 1354 aesmc $dat,$dat 1355 b .Lenter_cbc_enc128 1356.Loop_cbc_enc128: 1357 aese $dat,q8 1358 aesmc $dat,$dat 1359 vst1.8 {$ivec},[$out],#16 1360.Lenter_cbc_enc128: 1361 aese $dat,q9 1362 aesmc $dat,$dat 1363 subs $len,$len,#16 1364 aese $dat,$in0 1365 aesmc $dat,$dat 1366 cclr $step,eq 1367 aese $dat,$in1 1368 aesmc $dat,$dat 1369 aese $dat,q10 1370 aesmc $dat,$dat 1371 aese $dat,q11 1372 aesmc $dat,$dat 1373 vld1.8 {q8},[$inp],$step 1374 aese $dat,q12 1375 aesmc $dat,$dat 1376 aese $dat,q13 1377 aesmc $dat,$dat 1378 aese $dat,q14 1379 aesmc $dat,$dat 1380 veor q8,q8,$rndzero_n_last 1381 aese $dat,q15 1382 veor $ivec,$dat,$rndlast 1383 b.hs .Loop_cbc_enc128 1384 1385 vst1.8 {$ivec},[$out],#16 1386 b .Lcbc_done 1387___ 1388{ 1389my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 1390 1391my ($dat3,$in3,$tmp3); # used only in 64-bit mode 1392my ($dat4,$in4,$tmp4); 1393if ($flavour =~ /64/) { 1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); 1395} 1396 1397$code.=<<___; 1398.align 5 1399.Lcbc_dec: 1400 vld1.8 {$dat2},[$inp],#16 1401 subs $len,$len,#32 // bias 1402 add $cnt,$rounds,#2 1403 vorr $in1,$dat,$dat 1404 vorr $dat1,$dat,$dat 1405 vorr $in2,$dat2,$dat2 1406 b.lo .Lcbc_dec_tail 1407 1408 vorr $dat1,$dat2,$dat2 1409 vld1.8 {$dat2},[$inp],#16 1410 vorr $in0,$dat,$dat 1411 vorr $in1,$dat1,$dat1 1412 vorr $in2,$dat2,$dat2 1413___ 
1414$code.=<<___ if ($flavour =~ /64/); 1415 cmp $len,#32 1416 b.lo .Loop3x_cbc_dec 1417 1418 vld1.8 {$dat3},[$inp],#16 1419 vld1.8 {$dat4},[$inp],#16 1420 sub $len,$len,#32 // bias 1421 mov $cnt,$rounds 1422 vorr $in3,$dat3,$dat3 1423 vorr $in4,$dat4,$dat4 1424 1425.Loop5x_cbc_dec: 1426 aesd $dat0,q8 1427 aesimc $dat0,$dat0 1428 aesd $dat1,q8 1429 aesimc $dat1,$dat1 1430 aesd $dat2,q8 1431 aesimc $dat2,$dat2 1432 aesd $dat3,q8 1433 aesimc $dat3,$dat3 1434 aesd $dat4,q8 1435 aesimc $dat4,$dat4 1436 vld1.32 {q8},[$key_],#16 1437 subs $cnt,$cnt,#2 1438 aesd $dat0,q9 1439 aesimc $dat0,$dat0 1440 aesd $dat1,q9 1441 aesimc $dat1,$dat1 1442 aesd $dat2,q9 1443 aesimc $dat2,$dat2 1444 aesd $dat3,q9 1445 aesimc $dat3,$dat3 1446 aesd $dat4,q9 1447 aesimc $dat4,$dat4 1448 vld1.32 {q9},[$key_],#16 1449 b.gt .Loop5x_cbc_dec 1450 1451 aesd $dat0,q8 1452 aesimc $dat0,$dat0 1453 aesd $dat1,q8 1454 aesimc $dat1,$dat1 1455 aesd $dat2,q8 1456 aesimc $dat2,$dat2 1457 aesd $dat3,q8 1458 aesimc $dat3,$dat3 1459 aesd $dat4,q8 1460 aesimc $dat4,$dat4 1461 cmp $len,#0x40 // because .Lcbc_tail4x 1462 sub $len,$len,#0x50 1463 1464 aesd $dat0,q9 1465 aesimc $dat0,$dat0 1466 aesd $dat1,q9 1467 aesimc $dat1,$dat1 1468 aesd $dat2,q9 1469 aesimc $dat2,$dat2 1470 aesd $dat3,q9 1471 aesimc $dat3,$dat3 1472 aesd $dat4,q9 1473 aesimc $dat4,$dat4 1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo 1475 mov $key_,$key 1476 1477 aesd $dat0,q10 1478 aesimc $dat0,$dat0 1479 aesd $dat1,q10 1480 aesimc $dat1,$dat1 1481 aesd $dat2,q10 1482 aesimc $dat2,$dat2 1483 aesd $dat3,q10 1484 aesimc $dat3,$dat3 1485 aesd $dat4,q10 1486 aesimc $dat4,$dat4 1487 add $inp,$inp,x6 // $inp is adjusted in such way that 1488 // at exit from the loop $dat1-$dat4 1489 // are loaded with last "words" 1490 add x6,$len,#0x60 // because .Lcbc_tail4x 1491 1492 aesd $dat0,q11 1493 aesimc $dat0,$dat0 1494 aesd $dat1,q11 1495 aesimc $dat1,$dat1 1496 aesd $dat2,q11 1497 aesimc $dat2,$dat2 1498 aesd $dat3,q11 1499 aesimc 
$dat3,$dat3 1500 aesd $dat4,q11 1501 aesimc $dat4,$dat4 1502 1503 aesd $dat0,q12 1504 aesimc $dat0,$dat0 1505 aesd $dat1,q12 1506 aesimc $dat1,$dat1 1507 aesd $dat2,q12 1508 aesimc $dat2,$dat2 1509 aesd $dat3,q12 1510 aesimc $dat3,$dat3 1511 aesd $dat4,q12 1512 aesimc $dat4,$dat4 1513 1514 aesd $dat0,q13 1515 aesimc $dat0,$dat0 1516 aesd $dat1,q13 1517 aesimc $dat1,$dat1 1518 aesd $dat2,q13 1519 aesimc $dat2,$dat2 1520 aesd $dat3,q13 1521 aesimc $dat3,$dat3 1522 aesd $dat4,q13 1523 aesimc $dat4,$dat4 1524 1525 aesd $dat0,q14 1526 aesimc $dat0,$dat0 1527 aesd $dat1,q14 1528 aesimc $dat1,$dat1 1529 aesd $dat2,q14 1530 aesimc $dat2,$dat2 1531 aesd $dat3,q14 1532 aesimc $dat3,$dat3 1533 aesd $dat4,q14 1534 aesimc $dat4,$dat4 1535 1536 veor $tmp0,$ivec,$rndlast 1537 aesd $dat0,q15 1538 veor $tmp1,$in0,$rndlast 1539 vld1.8 {$in0},[$inp],#16 1540 aesd $dat1,q15 1541 veor $tmp2,$in1,$rndlast 1542 vld1.8 {$in1},[$inp],#16 1543 aesd $dat2,q15 1544 veor $tmp3,$in2,$rndlast 1545 vld1.8 {$in2},[$inp],#16 1546 aesd $dat3,q15 1547 veor $tmp4,$in3,$rndlast 1548 vld1.8 {$in3},[$inp],#16 1549 aesd $dat4,q15 1550 vorr $ivec,$in4,$in4 1551 vld1.8 {$in4},[$inp],#16 1552 cbz x6,.Lcbc_tail4x 1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1554 veor $tmp0,$tmp0,$dat0 1555 vorr $dat0,$in0,$in0 1556 veor $tmp1,$tmp1,$dat1 1557 vorr $dat1,$in1,$in1 1558 veor $tmp2,$tmp2,$dat2 1559 vorr $dat2,$in2,$in2 1560 veor $tmp3,$tmp3,$dat3 1561 vorr $dat3,$in3,$in3 1562 veor $tmp4,$tmp4,$dat4 1563 vst1.8 {$tmp0},[$out],#16 1564 vorr $dat4,$in4,$in4 1565 vst1.8 {$tmp1},[$out],#16 1566 mov $cnt,$rounds 1567 vst1.8 {$tmp2},[$out],#16 1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1569 vst1.8 {$tmp3},[$out],#16 1570 vst1.8 {$tmp4},[$out],#16 1571 b.hs .Loop5x_cbc_dec 1572 1573 add $len,$len,#0x50 1574 cbz $len,.Lcbc_done 1575 1576 add $cnt,$rounds,#2 1577 subs $len,$len,#0x30 1578 vorr $dat0,$in2,$in2 1579 vorr $in0,$in2,$in2 1580 vorr $dat1,$in3,$in3 1581 vorr $in1,$in3,$in3 1582 vorr 
$dat2,$in4,$in4 1583 vorr $in2,$in4,$in4 1584 b.lo .Lcbc_dec_tail 1585 1586 b .Loop3x_cbc_dec 1587 1588.align 4 1589.Lcbc_tail4x: 1590 veor $tmp1,$tmp0,$dat1 1591 veor $tmp2,$tmp2,$dat2 1592 veor $tmp3,$tmp3,$dat3 1593 veor $tmp4,$tmp4,$dat4 1594 vst1.8 {$tmp1},[$out],#16 1595 vst1.8 {$tmp2},[$out],#16 1596 vst1.8 {$tmp3},[$out],#16 1597 vst1.8 {$tmp4},[$out],#16 1598 1599 b .Lcbc_done 1600.align 4 1601___ 1602$code.=<<___; 1603.Loop3x_cbc_dec: 1604 aesd $dat0,q8 1605 aesimc $dat0,$dat0 1606 aesd $dat1,q8 1607 aesimc $dat1,$dat1 1608 aesd $dat2,q8 1609 aesimc $dat2,$dat2 1610 vld1.32 {q8},[$key_],#16 1611 subs $cnt,$cnt,#2 1612 aesd $dat0,q9 1613 aesimc $dat0,$dat0 1614 aesd $dat1,q9 1615 aesimc $dat1,$dat1 1616 aesd $dat2,q9 1617 aesimc $dat2,$dat2 1618 vld1.32 {q9},[$key_],#16 1619 b.gt .Loop3x_cbc_dec 1620 1621 aesd $dat0,q8 1622 aesimc $dat0,$dat0 1623 aesd $dat1,q8 1624 aesimc $dat1,$dat1 1625 aesd $dat2,q8 1626 aesimc $dat2,$dat2 1627 veor $tmp0,$ivec,$rndlast 1628 subs $len,$len,#0x30 1629 veor $tmp1,$in0,$rndlast 1630 mov.lo x6,$len // x6, $cnt, is zero at this point 1631 aesd $dat0,q9 1632 aesimc $dat0,$dat0 1633 aesd $dat1,q9 1634 aesimc $dat1,$dat1 1635 aesd $dat2,q9 1636 aesimc $dat2,$dat2 1637 veor $tmp2,$in1,$rndlast 1638 add $inp,$inp,x6 // $inp is adjusted in such way that 1639 // at exit from the loop $dat1-$dat2 1640 // are loaded with last "words" 1641 vorr $ivec,$in2,$in2 1642 mov $key_,$key 1643 aesd $dat0,q12 1644 aesimc $dat0,$dat0 1645 aesd $dat1,q12 1646 aesimc $dat1,$dat1 1647 aesd $dat2,q12 1648 aesimc $dat2,$dat2 1649 vld1.8 {$in0},[$inp],#16 1650 aesd $dat0,q13 1651 aesimc $dat0,$dat0 1652 aesd $dat1,q13 1653 aesimc $dat1,$dat1 1654 aesd $dat2,q13 1655 aesimc $dat2,$dat2 1656 vld1.8 {$in1},[$inp],#16 1657 aesd $dat0,q14 1658 aesimc $dat0,$dat0 1659 aesd $dat1,q14 1660 aesimc $dat1,$dat1 1661 aesd $dat2,q14 1662 aesimc $dat2,$dat2 1663 vld1.8 {$in2},[$inp],#16 1664 aesd $dat0,q15 1665 aesd $dat1,q15 1666 aesd $dat2,q15 1667 vld1.32 
{q8},[$key_],#16 // re-pre-load rndkey[0] 1668 add $cnt,$rounds,#2 1669 veor $tmp0,$tmp0,$dat0 1670 veor $tmp1,$tmp1,$dat1 1671 veor $dat2,$dat2,$tmp2 1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1673 vst1.8 {$tmp0},[$out],#16 1674 vorr $dat0,$in0,$in0 1675 vst1.8 {$tmp1},[$out],#16 1676 vorr $dat1,$in1,$in1 1677 vst1.8 {$dat2},[$out],#16 1678 vorr $dat2,$in2,$in2 1679 b.hs .Loop3x_cbc_dec 1680 1681 cmn $len,#0x30 1682 b.eq .Lcbc_done 1683 nop 1684 1685.Lcbc_dec_tail: 1686 aesd $dat1,q8 1687 aesimc $dat1,$dat1 1688 aesd $dat2,q8 1689 aesimc $dat2,$dat2 1690 vld1.32 {q8},[$key_],#16 1691 subs $cnt,$cnt,#2 1692 aesd $dat1,q9 1693 aesimc $dat1,$dat1 1694 aesd $dat2,q9 1695 aesimc $dat2,$dat2 1696 vld1.32 {q9},[$key_],#16 1697 b.gt .Lcbc_dec_tail 1698 1699 aesd $dat1,q8 1700 aesimc $dat1,$dat1 1701 aesd $dat2,q8 1702 aesimc $dat2,$dat2 1703 aesd $dat1,q9 1704 aesimc $dat1,$dat1 1705 aesd $dat2,q9 1706 aesimc $dat2,$dat2 1707 aesd $dat1,q12 1708 aesimc $dat1,$dat1 1709 aesd $dat2,q12 1710 aesimc $dat2,$dat2 1711 cmn $len,#0x20 1712 aesd $dat1,q13 1713 aesimc $dat1,$dat1 1714 aesd $dat2,q13 1715 aesimc $dat2,$dat2 1716 veor $tmp1,$ivec,$rndlast 1717 aesd $dat1,q14 1718 aesimc $dat1,$dat1 1719 aesd $dat2,q14 1720 aesimc $dat2,$dat2 1721 veor $tmp2,$in1,$rndlast 1722 aesd $dat1,q15 1723 aesd $dat2,q15 1724 b.eq .Lcbc_dec_one 1725 veor $tmp1,$tmp1,$dat1 1726 veor $tmp2,$tmp2,$dat2 1727 vorr $ivec,$in2,$in2 1728 vst1.8 {$tmp1},[$out],#16 1729 vst1.8 {$tmp2},[$out],#16 1730 b .Lcbc_done 1731 1732.Lcbc_dec_one: 1733 veor $tmp1,$tmp1,$dat2 1734 vorr $ivec,$in2,$in2 1735 vst1.8 {$tmp1},[$out],#16 1736 1737.Lcbc_done: 1738 vst1.8 {$ivec},[$ivp] 1739.Lcbc_abort: 1740___ 1741} 1742$code.=<<___ if ($flavour !~ /64/); 1743 vldmia sp!,{d8-d15} 1744 ldmia sp!,{r4-r8,pc} 1745___ 1746$code.=<<___ if ($flavour =~ /64/); 1747 ldr x29,[sp],#16 1748 ret 1749___ 1750$code.=<<___; 1751.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt 1752___ 1753}}} 1754{{{ 1755my 
($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));	# function args: input, output, byte length, key schedule, IV/counter block
my ($rounds,$cnt,$key_)=("w5","w6","x7");	# round count, inner-loop counter, round-key pointer
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));	# 32-bit counter word and per-block copies
my $step="x12";	# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15 preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
1798 sub $rounds,$rounds,#4 1799 mov $step,#16 1800 cmp $len,#2 1801 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys 1802 sub $rounds,$rounds,#2 1803 vld1.32 {q12-q13},[$key_],#32 1804 vld1.32 {q14-q15},[$key_],#32 1805 vld1.32 {$rndlast},[$key_] 1806 add $key_,$key,#32 1807 mov $cnt,$rounds 1808 cclr $step,lo 1809#ifndef __ARMEB__ 1810 rev $ctr, $ctr 1811#endif 1812___ 1813$code.=<<___ if ($flavour =~ /64/); 1814 vorr $dat1,$dat0,$dat0 1815 add $tctr1, $ctr, #1 1816 vorr $dat2,$dat0,$dat0 1817 add $ctr, $ctr, #2 1818 vorr $ivec,$dat0,$dat0 1819 rev $tctr1, $tctr1 1820 vmov.32 ${dat1}[3],$tctr1 1821 b.ls .Lctr32_tail 1822 rev $tctr2, $ctr 1823 sub $len,$len,#3 // bias 1824 vmov.32 ${dat2}[3],$tctr2 1825___ 1826$code.=<<___ if ($flavour !~ /64/); 1827 add $tctr1, $ctr, #1 1828 vorr $ivec,$dat0,$dat0 1829 rev $tctr1, $tctr1 1830 vmov.32 ${ivec}[3],$tctr1 1831 add $ctr, $ctr, #2 1832 vorr $dat1,$ivec,$ivec 1833 b.ls .Lctr32_tail 1834 rev $tctr2, $ctr 1835 vmov.32 ${ivec}[3],$tctr2 1836 sub $len,$len,#3 // bias 1837 vorr $dat2,$ivec,$ivec 1838___ 1839$code.=<<___ if ($flavour =~ /64/); 1840 cmp $len,#32 1841 b.lo .Loop3x_ctr32 1842 1843 add w13,$ctr,#1 1844 add w14,$ctr,#2 1845 vorr $dat3,$dat0,$dat0 1846 rev w13,w13 1847 vorr $dat4,$dat0,$dat0 1848 rev w14,w14 1849 vmov.32 ${dat3}[3],w13 1850 sub $len,$len,#2 // bias 1851 vmov.32 ${dat4}[3],w14 1852 add $ctr,$ctr,#2 1853 b .Loop5x_ctr32 1854 1855.align 4 1856.Loop5x_ctr32: 1857 aese $dat0,q8 1858 aesmc $dat0,$dat0 1859 aese $dat1,q8 1860 aesmc $dat1,$dat1 1861 aese $dat2,q8 1862 aesmc $dat2,$dat2 1863 aese $dat3,q8 1864 aesmc $dat3,$dat3 1865 aese $dat4,q8 1866 aesmc $dat4,$dat4 1867 vld1.32 {q8},[$key_],#16 1868 subs $cnt,$cnt,#2 1869 aese $dat0,q9 1870 aesmc $dat0,$dat0 1871 aese $dat1,q9 1872 aesmc $dat1,$dat1 1873 aese $dat2,q9 1874 aesmc $dat2,$dat2 1875 aese $dat3,q9 1876 aesmc $dat3,$dat3 1877 aese $dat4,q9 1878 aesmc $dat4,$dat4 1879 vld1.32 {q9},[$key_],#16 1880 b.gt .Loop5x_ctr32 1881 1882 mov 
$key_,$key 1883 aese $dat0,q8 1884 aesmc $dat0,$dat0 1885 aese $dat1,q8 1886 aesmc $dat1,$dat1 1887 aese $dat2,q8 1888 aesmc $dat2,$dat2 1889 aese $dat3,q8 1890 aesmc $dat3,$dat3 1891 aese $dat4,q8 1892 aesmc $dat4,$dat4 1893 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 1894 1895 aese $dat0,q9 1896 aesmc $dat0,$dat0 1897 aese $dat1,q9 1898 aesmc $dat1,$dat1 1899 aese $dat2,q9 1900 aesmc $dat2,$dat2 1901 aese $dat3,q9 1902 aesmc $dat3,$dat3 1903 aese $dat4,q9 1904 aesmc $dat4,$dat4 1905 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 1906 1907 aese $dat0,q12 1908 aesmc $dat0,$dat0 1909 add $tctr0,$ctr,#1 1910 add $tctr1,$ctr,#2 1911 aese $dat1,q12 1912 aesmc $dat1,$dat1 1913 add $tctr2,$ctr,#3 1914 add w13,$ctr,#4 1915 aese $dat2,q12 1916 aesmc $dat2,$dat2 1917 add w14,$ctr,#5 1918 rev $tctr0,$tctr0 1919 aese $dat3,q12 1920 aesmc $dat3,$dat3 1921 rev $tctr1,$tctr1 1922 rev $tctr2,$tctr2 1923 aese $dat4,q12 1924 aesmc $dat4,$dat4 1925 rev w13,w13 1926 rev w14,w14 1927 1928 aese $dat0,q13 1929 aesmc $dat0,$dat0 1930 aese $dat1,q13 1931 aesmc $dat1,$dat1 1932 aese $dat2,q13 1933 aesmc $dat2,$dat2 1934 aese $dat3,q13 1935 aesmc $dat3,$dat3 1936 aese $dat4,q13 1937 aesmc $dat4,$dat4 1938 1939 aese $dat0,q14 1940 aesmc $dat0,$dat0 1941 vld1.8 {$in0},[$inp],#16 1942 aese $dat1,q14 1943 aesmc $dat1,$dat1 1944 vld1.8 {$in1},[$inp],#16 1945 aese $dat2,q14 1946 aesmc $dat2,$dat2 1947 vld1.8 {$in2},[$inp],#16 1948 aese $dat3,q14 1949 aesmc $dat3,$dat3 1950 vld1.8 {$in3},[$inp],#16 1951 aese $dat4,q14 1952 aesmc $dat4,$dat4 1953 vld1.8 {$in4},[$inp],#16 1954 1955 aese $dat0,q15 1956 veor $in0,$in0,$rndlast 1957 aese $dat1,q15 1958 veor $in1,$in1,$rndlast 1959 aese $dat2,q15 1960 veor $in2,$in2,$rndlast 1961 aese $dat3,q15 1962 veor $in3,$in3,$rndlast 1963 aese $dat4,q15 1964 veor $in4,$in4,$rndlast 1965 1966 veor $in0,$in0,$dat0 1967 vorr $dat0,$ivec,$ivec 1968 veor $in1,$in1,$dat1 1969 vorr $dat1,$ivec,$ivec 1970 veor $in2,$in2,$dat2 1971 vorr $dat2,$ivec,$ivec 1972 veor 
$in3,$in3,$dat3 1973 vorr $dat3,$ivec,$ivec 1974 veor $in4,$in4,$dat4 1975 vorr $dat4,$ivec,$ivec 1976 1977 vst1.8 {$in0},[$out],#16 1978 vmov.32 ${dat0}[3],$tctr0 1979 vst1.8 {$in1},[$out],#16 1980 vmov.32 ${dat1}[3],$tctr1 1981 vst1.8 {$in2},[$out],#16 1982 vmov.32 ${dat2}[3],$tctr2 1983 vst1.8 {$in3},[$out],#16 1984 vmov.32 ${dat3}[3],w13 1985 vst1.8 {$in4},[$out],#16 1986 vmov.32 ${dat4}[3],w14 1987 1988 mov $cnt,$rounds 1989 cbz $len,.Lctr32_done 1990 1991 add $ctr,$ctr,#5 1992 subs $len,$len,#5 1993 b.hs .Loop5x_ctr32 1994 1995 add $len,$len,#5 1996 sub $ctr,$ctr,#5 1997 1998 cmp $len,#2 1999 mov $step,#16 2000 cclr $step,lo 2001 b.ls .Lctr32_tail 2002 2003 sub $len,$len,#3 // bias 2004 add $ctr,$ctr,#3 2005___ 2006$code.=<<___; 2007 b .Loop3x_ctr32 2008 2009.align 4 2010.Loop3x_ctr32: 2011 aese $dat0,q8 2012 aesmc $dat0,$dat0 2013 aese $dat1,q8 2014 aesmc $dat1,$dat1 2015 aese $dat2,q8 2016 aesmc $dat2,$dat2 2017 vld1.32 {q8},[$key_],#16 2018 subs $cnt,$cnt,#2 2019 aese $dat0,q9 2020 aesmc $dat0,$dat0 2021 aese $dat1,q9 2022 aesmc $dat1,$dat1 2023 aese $dat2,q9 2024 aesmc $dat2,$dat2 2025 vld1.32 {q9},[$key_],#16 2026 b.gt .Loop3x_ctr32 2027 2028 aese $dat0,q8 2029 aesmc $tmp0,$dat0 2030 aese $dat1,q8 2031 aesmc $tmp1,$dat1 2032 vld1.8 {$in0},[$inp],#16 2033___ 2034$code.=<<___ if ($flavour =~ /64/); 2035 vorr $dat0,$ivec,$ivec 2036___ 2037$code.=<<___ if ($flavour !~ /64/); 2038 add $tctr0,$ctr,#1 2039___ 2040$code.=<<___; 2041 aese $dat2,q8 2042 aesmc $dat2,$dat2 2043 vld1.8 {$in1},[$inp],#16 2044___ 2045$code.=<<___ if ($flavour =~ /64/); 2046 vorr $dat1,$ivec,$ivec 2047___ 2048$code.=<<___ if ($flavour !~ /64/); 2049 rev $tctr0,$tctr0 2050___ 2051$code.=<<___; 2052 aese $tmp0,q9 2053 aesmc $tmp0,$tmp0 2054 aese $tmp1,q9 2055 aesmc $tmp1,$tmp1 2056 vld1.8 {$in2},[$inp],#16 2057 mov $key_,$key 2058 aese $dat2,q9 2059 aesmc $tmp2,$dat2 2060___ 2061$code.=<<___ if ($flavour =~ /64/); 2062 vorr $dat2,$ivec,$ivec 2063 add $tctr0,$ctr,#1 2064___ 
2065$code.=<<___; 2066 aese $tmp0,q12 2067 aesmc $tmp0,$tmp0 2068 aese $tmp1,q12 2069 aesmc $tmp1,$tmp1 2070 veor $in0,$in0,$rndlast 2071 add $tctr1,$ctr,#2 2072 aese $tmp2,q12 2073 aesmc $tmp2,$tmp2 2074 veor $in1,$in1,$rndlast 2075 add $ctr,$ctr,#3 2076 aese $tmp0,q13 2077 aesmc $tmp0,$tmp0 2078 aese $tmp1,q13 2079 aesmc $tmp1,$tmp1 2080 veor $in2,$in2,$rndlast 2081___ 2082$code.=<<___ if ($flavour =~ /64/); 2083 rev $tctr0,$tctr0 2084 aese $tmp2,q13 2085 aesmc $tmp2,$tmp2 2086 vmov.32 ${dat0}[3], $tctr0 2087___ 2088$code.=<<___ if ($flavour !~ /64/); 2089 vmov.32 ${ivec}[3], $tctr0 2090 aese $tmp2,q13 2091 aesmc $tmp2,$tmp2 2092 vorr $dat0,$ivec,$ivec 2093___ 2094$code.=<<___; 2095 rev $tctr1,$tctr1 2096 aese $tmp0,q14 2097 aesmc $tmp0,$tmp0 2098___ 2099$code.=<<___ if ($flavour !~ /64/); 2100 vmov.32 ${ivec}[3], $tctr1 2101 rev $tctr2,$ctr 2102___ 2103$code.=<<___; 2104 aese $tmp1,q14 2105 aesmc $tmp1,$tmp1 2106___ 2107$code.=<<___ if ($flavour =~ /64/); 2108 vmov.32 ${dat1}[3], $tctr1 2109 rev $tctr2,$ctr 2110 aese $tmp2,q14 2111 aesmc $tmp2,$tmp2 2112 vmov.32 ${dat2}[3], $tctr2 2113___ 2114$code.=<<___ if ($flavour !~ /64/); 2115 vorr $dat1,$ivec,$ivec 2116 vmov.32 ${ivec}[3], $tctr2 2117 aese $tmp2,q14 2118 aesmc $tmp2,$tmp2 2119 vorr $dat2,$ivec,$ivec 2120___ 2121$code.=<<___; 2122 subs $len,$len,#3 2123 aese $tmp0,q15 2124 aese $tmp1,q15 2125 aese $tmp2,q15 2126 2127 veor $in0,$in0,$tmp0 2128 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 2129 vst1.8 {$in0},[$out],#16 2130 veor $in1,$in1,$tmp1 2131 mov $cnt,$rounds 2132 vst1.8 {$in1},[$out],#16 2133 veor $in2,$in2,$tmp2 2134 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 2135 vst1.8 {$in2},[$out],#16 2136 b.hs .Loop3x_ctr32 2137 2138 adds $len,$len,#3 2139 b.eq .Lctr32_done 2140 cmp $len,#1 2141 mov $step,#16 2142 cclr $step,eq 2143 2144.Lctr32_tail: 2145 aese $dat0,q8 2146 aesmc $dat0,$dat0 2147 aese $dat1,q8 2148 aesmc $dat1,$dat1 2149 vld1.32 {q8},[$key_],#16 2150 subs $cnt,$cnt,#2 2151 aese 
$dat0,q9 2152 aesmc $dat0,$dat0 2153 aese $dat1,q9 2154 aesmc $dat1,$dat1 2155 vld1.32 {q9},[$key_],#16 2156 b.gt .Lctr32_tail 2157 2158 aese $dat0,q8 2159 aesmc $dat0,$dat0 2160 aese $dat1,q8 2161 aesmc $dat1,$dat1 2162 aese $dat0,q9 2163 aesmc $dat0,$dat0 2164 aese $dat1,q9 2165 aesmc $dat1,$dat1 2166 vld1.8 {$in0},[$inp],$step 2167 aese $dat0,q12 2168 aesmc $dat0,$dat0 2169 aese $dat1,q12 2170 aesmc $dat1,$dat1 2171 vld1.8 {$in1},[$inp] 2172 aese $dat0,q13 2173 aesmc $dat0,$dat0 2174 aese $dat1,q13 2175 aesmc $dat1,$dat1 2176 veor $in0,$in0,$rndlast 2177 aese $dat0,q14 2178 aesmc $dat0,$dat0 2179 aese $dat1,q14 2180 aesmc $dat1,$dat1 2181 veor $in1,$in1,$rndlast 2182 aese $dat0,q15 2183 aese $dat1,q15 2184 2185 cmp $len,#1 2186 veor $in0,$in0,$dat0 2187 veor $in1,$in1,$dat1 2188 vst1.8 {$in0},[$out],#16 2189 b.eq .Lctr32_done 2190 vst1.8 {$in1},[$out] 2191 2192.Lctr32_done: 2193___ 2194$code.=<<___ if ($flavour !~ /64/); 2195 vldmia sp!,{d8-d15} 2196 ldmia sp!,{r4-r10,pc} 2197___ 2198$code.=<<___ if ($flavour =~ /64/); 2199 ldr x29,[sp],#16 2200 ret 2201___ 2202$code.=<<___; 2203.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks 2204___ 2205}}} 2206# Performance in cycles per byte. 2207# Processed with AES-XTS different key size. 2208# It shows the value before and after optimization as below: 2209# (before/after): 2210# 2211# AES-128-XTS AES-256-XTS 2212# Cortex-A57 3.36/1.09 4.02/1.37 2213# Cortex-A72 3.03/1.02 3.28/1.33 2214 2215# Optimization is implemented by loop unrolling and interleaving. 2216# Commonly, we choose the unrolling factor as 5, if the input 2217# data size smaller than 5 blocks, but not smaller than 3 blocks, 2218# choose 3 as the unrolling factor. 2219# If the input data size dsize >= 5*16 bytes, then take 5 blocks 2220# as one iteration, every loop the left size lsize -= 5*16. 2221# If lsize < 5*16 bytes, treat them as the tail. 
Note: a remaining 4*16 bytes
# will be processed specially: it is folded into the 5*16-byte
# loop to improve the efficiency.
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve the
# performance: one independent code block without LR, FP load and
# store.
# Encryption will process the (length - tailcnt) bytes as mentioned
# previously, then encrypt the composite block as the second-to-last
# cipher block.
# Decryption will process the (length - tailcnt - 1) bytes as mentioned
# previously, then decrypt the second-to-last cipher block to get the
# last plain block (tail), and decrypt the composite block as the
# second-to-last plain-text block.

{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));	# args: in, out, length, data key, tweak key, IV
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");	# up to five XTS tweaks in flight
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");	# 64-bit halves of the tweak registers
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($tmpin)=("v26.16b");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7			last round key
# q10-q15, q7		last 7 round keys
# q8-q9			preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte


my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
2263 2264$code.=<<___ if ($flavour =~ /64/); 2265.globl ${prefix}_xts_encrypt 2266.type ${prefix}_xts_encrypt,%function 2267.align 5 2268${prefix}_xts_encrypt: 2269___ 2270$code.=<<___ if ($flavour =~ /64/); 2271 AARCH64_VALID_CALL_TARGET 2272 cmp $len,#16 2273 // Original input data size bigger than 16, jump to big size processing. 2274 b.ne .Lxts_enc_big_size 2275 // Encrypt the iv with key2, as the first XEX iv. 2276 ldr $rounds,[$key2,#240] 2277 vld1.8 {$dat},[$key2],#16 2278 vld1.8 {$iv0},[$ivp] 2279 sub $rounds,$rounds,#2 2280 vld1.8 {$dat1},[$key2],#16 2281 2282.Loop_enc_iv_enc: 2283 aese $iv0,$dat 2284 aesmc $iv0,$iv0 2285 vld1.32 {$dat},[$key2],#16 2286 subs $rounds,$rounds,#2 2287 aese $iv0,$dat1 2288 aesmc $iv0,$iv0 2289 vld1.32 {$dat1},[$key2],#16 2290 b.gt .Loop_enc_iv_enc 2291 2292 aese $iv0,$dat 2293 aesmc $iv0,$iv0 2294 vld1.32 {$dat},[$key2] 2295 aese $iv0,$dat1 2296 veor $iv0,$iv0,$dat 2297 2298 vld1.8 {$dat0},[$inp] 2299 veor $dat0,$iv0,$dat0 2300 2301 ldr $rounds,[$key1,#240] 2302 vld1.32 {q20-q21},[$key1],#32 // load key schedule... 2303 2304 aese $dat0,q20 2305 aesmc $dat0,$dat0 2306 vld1.32 {q8-q9},[$key1],#32 // load key schedule... 2307 aese $dat0,q21 2308 aesmc $dat0,$dat0 2309 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing 2310 b.eq .Lxts_128_enc 2311.Lxts_enc_round_loop: 2312 aese $dat0,q8 2313 aesmc $dat0,$dat0 2314 vld1.32 {q8},[$key1],#16 // load key schedule... 2315 aese $dat0,q9 2316 aesmc $dat0,$dat0 2317 vld1.32 {q9},[$key1],#16 // load key schedule... 2318 subs $rounds,$rounds,#2 // bias 2319 b.gt .Lxts_enc_round_loop 2320.Lxts_128_enc: 2321 vld1.32 {q10-q11},[$key1],#32 // load key schedule... 2322 aese $dat0,q8 2323 aesmc $dat0,$dat0 2324 aese $dat0,q9 2325 aesmc $dat0,$dat0 2326 vld1.32 {q12-q13},[$key1],#32 // load key schedule... 2327 aese $dat0,q10 2328 aesmc $dat0,$dat0 2329 aese $dat0,q11 2330 aesmc $dat0,$dat0 2331 vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
2332 aese $dat0,q12 2333 aesmc $dat0,$dat0 2334 aese $dat0,q13 2335 aesmc $dat0,$dat0 2336 vld1.32 {$rndlast},[$key1] 2337 aese $dat0,q14 2338 aesmc $dat0,$dat0 2339 aese $dat0,q15 2340 veor $dat0,$dat0,$rndlast 2341 veor $dat0,$dat0,$iv0 2342 vst1.8 {$dat0},[$out] 2343 b .Lxts_enc_final_abort 2344 2345.align 4 2346.Lxts_enc_big_size: 2347___ 2348$code.=<<___ if ($flavour =~ /64/); 2349 stp $constnumx,$tmpinp,[sp,#-64]! 2350 stp $tailcnt,$midnumx,[sp,#48] 2351 stp $ivd10,$ivd20,[sp,#32] 2352 stp $ivd30,$ivd40,[sp,#16] 2353 2354 // tailcnt store the tail value of length%16. 2355 and $tailcnt,$len,#0xf 2356 and $len,$len,#-16 2357 subs $len,$len,#16 2358 mov $step,#16 2359 b.lo .Lxts_abort 2360 csel $step,xzr,$step,eq 2361 2362 // Firstly, encrypt the iv with key2, as the first iv of XEX. 2363 ldr $rounds,[$key2,#240] 2364 vld1.32 {$dat},[$key2],#16 2365 vld1.8 {$iv0},[$ivp] 2366 sub $rounds,$rounds,#2 2367 vld1.32 {$dat1},[$key2],#16 2368 2369.Loop_iv_enc: 2370 aese $iv0,$dat 2371 aesmc $iv0,$iv0 2372 vld1.32 {$dat},[$key2],#16 2373 subs $rounds,$rounds,#2 2374 aese $iv0,$dat1 2375 aesmc $iv0,$iv0 2376 vld1.32 {$dat1},[$key2],#16 2377 b.gt .Loop_iv_enc 2378 2379 aese $iv0,$dat 2380 aesmc $iv0,$iv0 2381 vld1.32 {$dat},[$key2] 2382 aese $iv0,$dat1 2383 veor $iv0,$iv0,$dat 2384 2385 // The iv for second block 2386 // $ivl- iv(low), $ivh - iv(high) 2387 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 2388 fmov $ivl,$ivd00 2389 fmov $ivh,$ivd01 2390 mov $constnum,#0x87 2391 extr $midnumx,$ivh,$ivh,#32 2392 extr $ivh,$ivh,$ivl,#63 2393 and $tmpmw,$constnum,$midnum,asr#31 2394 eor $ivl,$tmpmx,$ivl,lsl#1 2395 fmov $ivd10,$ivl 2396 fmov $ivd11,$ivh 2397 2398 ldr $rounds0,[$key1,#240] // next starting point 2399 vld1.8 {$dat},[$inp],$step 2400 2401 vld1.32 {q8-q9},[$key1] // load key schedule... 
2402 sub $rounds0,$rounds0,#6 2403 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys 2404 sub $rounds0,$rounds0,#2 2405 vld1.32 {q10-q11},[$key_],#32 2406 vld1.32 {q12-q13},[$key_],#32 2407 vld1.32 {q14-q15},[$key_],#32 2408 vld1.32 {$rndlast},[$key_] 2409 2410 add $key_,$key1,#32 2411 mov $rounds,$rounds0 2412 2413 // Encryption 2414.Lxts_enc: 2415 vld1.8 {$dat2},[$inp],#16 2416 subs $len,$len,#32 // bias 2417 add $rounds,$rounds0,#2 2418 vorr $in1,$dat,$dat 2419 vorr $dat1,$dat,$dat 2420 vorr $in3,$dat,$dat 2421 vorr $in2,$dat2,$dat2 2422 vorr $in4,$dat2,$dat2 2423 b.lo .Lxts_inner_enc_tail 2424 veor $dat,$dat,$iv0 // before encryption, xor with iv 2425 veor $dat2,$dat2,$iv1 2426 2427 // The iv for third block 2428 extr $midnumx,$ivh,$ivh,#32 2429 extr $ivh,$ivh,$ivl,#63 2430 and $tmpmw,$constnum,$midnum,asr#31 2431 eor $ivl,$tmpmx,$ivl,lsl#1 2432 fmov $ivd20,$ivl 2433 fmov $ivd21,$ivh 2434 2435 2436 vorr $dat1,$dat2,$dat2 2437 vld1.8 {$dat2},[$inp],#16 2438 vorr $in0,$dat,$dat 2439 vorr $in1,$dat1,$dat1 2440 veor $in2,$dat2,$iv2 // the third block 2441 veor $dat2,$dat2,$iv2 2442 cmp $len,#32 2443 b.lo .Lxts_outer_enc_tail 2444 2445 // The iv for fourth block 2446 extr $midnumx,$ivh,$ivh,#32 2447 extr $ivh,$ivh,$ivl,#63 2448 and $tmpmw,$constnum,$midnum,asr#31 2449 eor $ivl,$tmpmx,$ivl,lsl#1 2450 fmov $ivd30,$ivl 2451 fmov $ivd31,$ivh 2452 2453 vld1.8 {$dat3},[$inp],#16 2454 // The iv for fifth block 2455 extr $midnumx,$ivh,$ivh,#32 2456 extr $ivh,$ivh,$ivl,#63 2457 and $tmpmw,$constnum,$midnum,asr#31 2458 eor $ivl,$tmpmx,$ivl,lsl#1 2459 fmov $ivd40,$ivl 2460 fmov $ivd41,$ivh 2461 2462 vld1.8 {$dat4},[$inp],#16 2463 veor $dat3,$dat3,$iv3 // the fourth block 2464 veor $dat4,$dat4,$iv4 2465 sub $len,$len,#32 // bias 2466 mov $rounds,$rounds0 2467 b .Loop5x_xts_enc 2468 2469.align 4 2470.Loop5x_xts_enc: 2471 aese $dat0,q8 2472 aesmc $dat0,$dat0 2473 aese $dat1,q8 2474 aesmc $dat1,$dat1 2475 aese $dat2,q8 2476 aesmc $dat2,$dat2 2477 aese $dat3,q8 2478 aesmc 
$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_xts_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aese	$dat0,q15
	// The iv for first block of one iteration
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_enc_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_enc


	// If left 4 blocks, borrow the five block's processing.
	cmn	$len,#0x10
	b.ne	.Loop5x_enc_after
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_enc_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_enc_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_enc_tail

.align	4
.Lxts_enc_tail4x:
	add	$inp,$inp,#16
	veor	$tmp1,$dat1,$tmp1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b	.Lxts_enc_done
.align	4
.Lxts_outer_enc_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_enc_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	//mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset
	mov	$key_,$key1

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	add	$rounds,$rounds0,#2
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	cmn	$len,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_enc_tail:
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_enc_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_enc_tail_loop:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_enc_tail_loop

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lxts_enc_one
	veor	$tmp1,$tmp1,$dat1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv1,$iv1
	vst1.8	{$tmp2},[$out],#16
	fmov	$ivl,$ivd10
	fmov	$ivh,$ivd11
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done

.Lxts_enc_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv0,$iv0
	vst1.8	{$tmp1},[$out],#16
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	$tailcnt,#0xf
	b.eq	.Lxts_abort

	mov	$tmpinp,$inp
	mov	$tmpoutp,$out
	sub	$out,$out,#16
.composite_enc_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Encrypt the composite block to get the last second encrypted text block
	ldr	$rounds,[$key1,#240]		// load key schedule...
	vld1.8	{$dat},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.8	{$dat1},[$key1],#16		// load key schedule...
.Loop_final_enc:
	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16
	subs	$rounds,$rounds,#2
	aese	$tmpin,$dat1
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16
	b.gt	.Loop_final_enc

	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aese	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
___

}}}
# XTS-decrypt section: a fresh lexical scope so the encrypt section's
# register aliases cannot leak in.  All names below are AArch64-style;
# the 32-bit flavour rewrites them in the post-processing pass at the
# end of the file.
{{{
# x0-x5: standard argument registers of ${prefix}_xts_decrypt.
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# Scratch integer registers: round counters, rotating key pointer,
# input step, and the 64-bit halves of the current XTS tweak.
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Byte-shuffling registers used by the cipher-stealing tail copy.
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# Tail byte count, tweak-update scratch, and the GF(2^128) constant 0x87.
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
# NEON data registers: plaintext/ciphertext blocks, staging copies and
# the last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# The five in-flight XTS tweaks plus the cipher-stealing scratch block.
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# Doubleword views of the tweak registers (low half / high half).
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my
($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

# $dat/$tmp are convenience aliases for the first block registers.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# Round-key register usage:
# q7	last round key
# q10-q15, q7	Last 7 round keys
# q8-q9	preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    # 64-bit mode has enough NEON registers for 5x interleaving.
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

# The XTS-decrypt entry point is emitted for the 64-bit flavour only.
$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.8	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.8	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv
	ldr	$rounds,[$key2,#240]
	vld1.8	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.8	{$dat1},[$key2],#16

.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl- iv(low), $ivh - iv(high)
	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decryt, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// third block xox with third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec

.align	4
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

	aesd	$dat0,q9
	aesimc	$dat0,$dat
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration.
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec

	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If x2($len) equal to -0x10, the left blocks is 4.
	// After specially processing, utilize the five blocks processing again.
	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	vld1.32	{$dat0},[$inp],#16
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b	.Lxts_done
.align	4
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data

	mov	$key_,$key1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32

.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Processing the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	vld1.32	{$dat0},[$inp],#16

	// Decrypt the last second block to get the last plain text block
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16

	// Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
	// to get the last encrypted block.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the last second plain text block
	ldr	$rounds,[$key_,#240]
	vld1.8	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.8	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
}
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing: translate the accumulated $code (written in a mix of
# 32- and 64-bit mnemonics, see note above) into pure assembly for the
# requested flavour, then print it to the arm-xlate.pl pipe (STDOUT).
if ($flavour =~ /64/) {			######## 64-bit code
    # A64 opcode templates for the AES instructions; Rd goes in bits
    # [4:0], Rn in bits [9:5].
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word (for assemblers
    # without crypto-extension support).  Currently unused - see the
    # commented-out substitution below.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
		$opcode{$mnemonic}|$1|($2<<5),
		$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # ARMv7 NEON opcode templates for the AES instructions.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Encode an AES instruction as raw bytes via the INST() macro
    # defined in the preamble (Thumb2 vs ARM byte order differs there).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Expand a 128-bit vtbl into the two 64-bit vtbl.8 halves ARMv7 needs.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Rewrite a lane dup from q-register syntax to d-register syntax.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Rewrite a lane move from q-register syntax to d-register syntax.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# Conditional mov needs an IT block when assembling for Thumb2.
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";