#! /usr/bin/env perl
# Copyright 2022-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD and AESE on AARCH64
#
# Dec 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4_ex";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my ($vtmp4,$vtmp5)=("v24","v25");
my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");

my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

# byte-swap the words of a vector on little-endian targets
sub rev32() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32 $dst.16b,$src.16b
#else
    mov $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32 $dst.16b,$dst.16b
#endif
___
    }
}

# byte-swap the words of a vector on big-endian targets
sub rev32_armeb() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32 $dst.16b,$src.16b
#else
    mov $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32 $dst.16b,$dst.16b
#endif
___
    }
}

# bit-reverse each byte for the GB/T ("_gb") XTS variant; the plain
# variant only needs a register move (or nothing when src == dst)
sub rbit() {
    my $dst = shift;
    my $src = shift;
    my $std = shift;

    if ($src and ("$src" ne "$dst")) {
        if ($std eq "_gb") {
$code.=<<___;
    rbit $dst.16b,$src.16b
___
        } else {
$code.=<<___;
    mov $dst.16b,$src.16b
___
        }
    } else {
        if ($std eq "_gb") {
$code.=<<___;
    rbit $dst.16b,$dst.16b
___
        }
    }
}

# 4x4 word transposition across four vectors
sub transpose() {
    my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
    zip1 $vt0.4s,$dat0.4s,$dat1.4s
    zip2 $vt1.4s,$dat0.4s,$dat1.4s
    zip1 $vt2.4s,$dat2.4s,$dat3.4s
    zip2 $vt3.4s,$dat2.4s,$dat3.4s
    zip1 $dat0.2d,$vt0.2d,$vt2.2d
    zip2 $dat1.2d,$vt0.2d,$vt2.2d
    zip1 $dat2.2d,$vt1.2d,$vt3.2d
    zip2 $dat3.2d,$vt1.2d,$vt3.2d
___
}

# 8x8 bit-matrix multiplication Mat*x, evaluated per byte as
# (lowerMat * low_nibble(x)) ^ (higherMat * high_nibble(x))
# using two 16-entry tbl lookups
sub mul_matrix() {
    my $x = shift;
    my $higherMat = shift;
    my $lowerMat = shift;
    my $tmp = shift;
$code.=<<___;
    ushr $tmp.16b, $x.16b, 4
    and $x.16b, $x.16b, $ANDMaskV.16b
    tbl $x.16b, {$lowerMat.16b}, $x.16b
    tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
    eor $x.16b, $x.16b, $tmp.16b
___
}

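# The sbox routines below compute the SM4 S-box via the AES S-box:
# both are affine-equivalent to inversion in GF(2^8), so
# SM4_sbox(x) = A2(AES_sbox(A1(x))) for suitable affine maps A1/A2.
# A1 is applied via the TAHMat/TALMat nibble tables and A2 via
# ATAHMat/ATALMat (see mul_matrix above). AESE with an all-zero round
# key reduces to SubBytes(ShiftRows(x)), and the tbl lookup through
# $MaskV pre-applies the inverse ShiftRows permutation so that only
# SubBytes takes effect. The constants live in .Lsbox_magic.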
# sbox operation for 4 lanes of words
sub sbox() {
    my $dat = shift;

$code.=<<___;
    // optimize sbox using AESE instruction
    tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
    aese @vtmp[0].16b,@vtmp[1].16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
$code.=<<___;
    mov $dat.16b,@vtmp[0].16b

    // linear transformation L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
    ushr @vtmp[0].4s,$dat.4s,32-2
    ushr @vtmp[1].4s,$dat.4s,32-10
    ushr @vtmp[2].4s,$dat.4s,32-18
    ushr @vtmp[3].4s,$dat.4s,32-24
    sli @vtmp[0].4s,$dat.4s,2
    sli @vtmp[1].4s,$dat.4s,10
    sli @vtmp[2].4s,$dat.4s,18
    sli @vtmp[3].4s,$dat.4s,24
    eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
    eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
    eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
    eor $dat.16b,$dat.16b,$vtmp4.16b
___
}

# sbox operation for 8 lanes of words
sub sbox_double() {
    my $dat = shift;
    my $datx = shift;

$code.=<<___;
    // optimize sbox using AESE instruction
    tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
    tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
    &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
    eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
    aese @vtmp[0].16b,$vtmp5.16b
    aese @vtmp[1].16b,$vtmp5.16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
    &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
$code.=<<___;
    mov $dat.16b,@vtmp[0].16b
    mov $datx.16b,@vtmp[1].16b

    // linear transformation L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
    ushr @vtmp[0].4s,$dat.4s,32-2
    ushr $vtmp5.4s,$datx.4s,32-2
    ushr @vtmp[1].4s,$dat.4s,32-10
    ushr @vtmp[2].4s,$dat.4s,32-18
    ushr @vtmp[3].4s,$dat.4s,32-24
    sli @vtmp[0].4s,$dat.4s,2
    sli $vtmp5.4s,$datx.4s,2
    sli @vtmp[1].4s,$dat.4s,10
    sli @vtmp[2].4s,$dat.4s,18
    sli @vtmp[3].4s,$dat.4s,24
    eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
    eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
    eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
    eor $dat.16b,$dat.16b,$vtmp4.16b
    ushr @vtmp[1].4s,$datx.4s,32-10
    ushr @vtmp[2].4s,$datx.4s,32-18
    ushr @vtmp[3].4s,$datx.4s,32-24
    sli @vtmp[1].4s,$datx.4s,10
    sli @vtmp[2].4s,$datx.4s,18
    sli @vtmp[3].4s,$datx.4s,24
    eor $vtmp4.16b,$vtmp5.16b,$datx.16b
    eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
    eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
    eor $datx.16b,$datx.16b,$vtmp4.16b
___
}

# sbox operation for a single word
sub sbox_1word () {
    my $word = shift;

$code.=<<___;
    mov @vtmp[3].s[0],$word
    // optimize sbox using AESE instruction
    tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
    aese @vtmp[0].16b,@vtmp[1].16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
    // linear transformation L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)
    mov $wtmp0,@vtmp[0].s[0]
    eor $word,$wtmp0,$wtmp0,ror #32-2
    eor $word,$word,$wtmp0,ror #32-10
    eor $word,$word,$wtmp0,ror #32-18
    eor $word,$word,$wtmp0,ror #32-24
___
}

# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor $tmpw,$word2,$word3
    eor $wtmp2,$wtmp0,$word1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word0,$word0,$tmpw
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor $tmpw,$word2,$word3
    eor $wtmp2,$word0,$wtmp1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor $word1,$word1,$tmpw
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor $tmpw,$word0,$word1
    eor $wtmp2,$wtmp0,$word3
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word2,$word2,$tmpw
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor $tmpw,$word0,$word1
    eor $wtmp2,$word2,$wtmp1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word3,$word3,$tmpw
___
}

# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    dup $rk0.4s,$wtmp0
    dup $rk1.4s,$wtmp1

    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor $rka.16b,@data[2].16b,@data[3].16b
    eor $rk0.16b,@data[1].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor @data[0].16b,@data[0].16b,$rk0.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor $rka.16b,$rka.16b,@data[0].16b
    eor $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor @data[1].16b,@data[1].16b,$rk1.16b

    dup $rk0.4s,$wtmp0
    dup $rk1.4s,$wtmp1

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor $rka.16b,@data[0].16b,@data[1].16b
    eor $rk0.16b,@data[3].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor @data[2].16b,@data[2].16b,$rk0.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor $rka.16b,$rka.16b,@data[2].16b
    eor $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    dup $rk0.4s,$wtmp0
    eor $rka.16b,@data[2].16b,@data[3].16b
    eor $rkb.16b,@datax[2].16b,@datax[3].16b
    eor @vtmp[0].16b,@data[1].16b,$rk0.16b
    eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,@vtmp[0].16b
    eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[0].16b,@data[0].16b,$rk0.16b
    eor @datax[0].16b,@datax[0].16b,$rk1.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    dup $rk1.4s,$wtmp1
    eor $rka.16b,$rka.16b,@data[0].16b
    eor $rkb.16b,$rkb.16b,@datax[0].16b
    eor $rk0.16b,$rka.16b,$rk1.16b
    eor $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor @data[1].16b,@data[1].16b,$rk0.16b
    eor @datax[1].16b,@datax[1].16b,$rk1.16b

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    dup $rk0.4s,$wtmp0
    eor $rka.16b,@data[0].16b,@data[1].16b
    eor $rkb.16b,@datax[0].16b,@datax[1].16b
    eor @vtmp[0].16b,@data[3].16b,$rk0.16b
    eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,@vtmp[0].16b
    eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[2].16b,@data[2].16b,$rk0.16b
    eor @datax[2].16b,@datax[2].16b,$rk1.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    dup $rk1.4s,$wtmp1
    eor $rka.16b,$rka.16b,@data[2].16b
    eor $rkb.16b,$rkb.16b,@datax[2].16b
    eor $rk0.16b,$rka.16b,$rk1.16b
    eor $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,$rk0.16b
    eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
    my $dat = shift;

$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
    mov $word0,$dat.s[0]
    mov $word1,$dat.s[1]
    mov $word2,$dat.s[2]
    mov $word3,$dat.s[3]
10:
___
    &sm4_1blk($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
    mov $dat.s[0],$word3
    mov $dat.s[1],$word2
    mov $dat.s[2],$word1
    mov $dat.s[3],$word0
___
}

sub encrypt_1blk() {
    my $dat = shift;

    &encrypt_1blk_norev($dat);
    &rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
10:
___
    &sm4_4blks($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
10:
___
    &sm4_8blks($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
    &rev32(@data[3],@datax[0]);
    &rev32(@data[2],@datax[1]);
    &rev32(@data[1],@datax[2]);
    &rev32(@data[0],@datax[3]);
}

# load the sbox affine-transform matrices and nibble masks
sub load_sbox () {
    my $data = shift;

$code.=<<___;
    ldr $MaskQ, .Lsbox_magic
    ldr $TAHMatQ, .Lsbox_magic+16
    ldr $TALMatQ, .Lsbox_magic+32
    ldr $ATAHMatQ, .Lsbox_magic+48
    ldr $ATALMatQ, .Lsbox_magic+64
    ldr $ANDMaskQ, .Lsbox_magic+80
___
}

sub mov_reg_to_vec() {
    my $src0 = shift;
    my $src1 = shift;
    my $desv = shift;
$code.=<<___;
    mov $desv.d[0],$src0
    mov $desv.d[1],$src1
___
    &rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
    my $srcv = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov $des0,$srcv.d[0]
    mov $des1,$srcv.d[1]
___
}

# double the tweak in scalar registers: des = src * x in GF(2^128),
# reduced mod x^128 + x^7 + x^2 + x + 1 (0x87)
sub compute_tweak() {
    my $src0 = shift;
    my $src1 = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov $wtmp0,0x87
    extr $xtmp2,$src1,$src1,#32
    extr $des1,$src1,$src0,#63
    and $wtmp1,$wtmp0,$wtmp2,asr#31
    eor $des0,$xtmp1,$src0,lsl#1
___
}

# vectorised tweak doubling: per-byte shift left, carries propagated
# via ext/ushr, with the top bit reduced by 0x87 (.Lxts_magic)
sub compute_tweak_vec() {
    my $src = shift;
    my $des = shift;
    my $std = shift;
    &rbit(@vtmp[2],$src,$std);
$code.=<<___;
    ldr @qtmp[0], .Lxts_magic
    shl $des.16b, @vtmp[2].16b, #1
    ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
    ushr @vtmp[1].16b, @vtmp[1].16b, #7
    mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
    eor $des.16b, $des.16b, @vtmp[1].16b
___
    &rbit($des,$des,$std);
}

$code=<<___;
#include "arm_arch.h"
.arch armv8-a+crypto
.text

.type _${prefix}_consts,%object
.align 7
_${prefix}_consts:
.Lck:
    .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
    .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
    .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
    .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
    .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
    .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
    .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
    .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
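// .Lck above holds the SM4 key-schedule constants CK (byte j of word i
// is (4*i+j)*7 mod 256); .Lfk below is the SM4 system parameter FK
// (A3B1BAC6, 56AA3350, 677D9197, B27022DC), stored as little-endian
// 64-bit pairs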
.Lfk:
    .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
    .quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
    .quad 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
    .quad 0x0b0e0104070a0d00,0x0306090c0f020508
    .quad 0x62185a2042387a00,0x22581a6002783a40
    .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
    .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
    .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
    .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size _${prefix}_consts,.-_${prefix}_consts
___

{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type _${prefix}_set_key,%function
.align 4
_${prefix}_set_key:
    AARCH64_VALID_CALL_TARGET
    ld1 {$vkey.4s},[$key]
___
    &load_sbox();
    &rev32($vkey,$vkey);
$code.=<<___;
    adr $pointer,.Lshuffles
    ld1 {$vmap.2d},[$pointer]
    adr $pointer,.Lfk
    ld1 {$vfk.2d},[$pointer]
    eor $vkey.16b,$vkey.16b,$vfk.16b
    mov $schedules,#32
    adr $pointer,.Lck
    movi @vtmp[0].16b,#64
    cbnz $enc,1f
    add $keys,$keys,124
1:
    mov $wtmp,$vkey.s[1]
    ldr $roundkey,[$pointer],#4
    eor $roundkey,$roundkey,$wtmp
    mov $wtmp,$vkey.s[2]
    eor $roundkey,$roundkey,$wtmp
    mov $wtmp,$vkey.s[3]
    eor $roundkey,$roundkey,$wtmp
    // optimize sbox using AESE instruction
    mov @data[0].s[0],$roundkey
    tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
___
    &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
    aese @vtmp[0].16b,@vtmp[1].16b
___
    &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
    mov $wtmp,@vtmp[0].s[0]
    // key-schedule linear transformation L'(B) = B ^ (B <<< 13) ^ (B <<< 23)
    eor $roundkey,$wtmp,$wtmp,ror #19
    eor $roundkey,$roundkey,$wtmp,ror #9
    mov $wtmp,$vkey.s[0]
    eor $roundkey,$roundkey,$wtmp
    mov $vkey.s[0],$roundkey
    cbz $enc,2f
    str $roundkey,[$keys],#4
    b 3f
2:
    str $roundkey,[$keys],#-4
3:
    tbl $vkey.16b,{$vkey.16b},$vmap.16b
    subs $schedules,$schedules,#1
    b.ne 1b
    ret
.size _${prefix}_set_key,.-_${prefix}_set_key
___
}}}


{{{
$code.=<<___;
.type _${prefix}_enc_4blks,%function
.align 4
_${prefix}_enc_4blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_4blks();
$code.=<<___;
    ret
.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
___
}}}

{{{
$code.=<<___;
.type _${prefix}_enc_8blks,%function
.align 4
_${prefix}_enc_8blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_8blks();
$code.=<<___;
    ret
.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    // w2=1 requests the encryption schedule
    mov w2,1
    bl _${prefix}_set_key
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
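    // w2=0 requests the decryption schedule (round keys stored in reverse order)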
    mov w2,0
    bl _${prefix}_set_key
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
    my $dir = shift;
    my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
    AARCH64_VALID_CALL_TARGET
    ld1 {@data[0].4s},[$inp]
___
    &load_sbox();
    &rev32(@data[0],@data[0]);
$code.=<<___;
    mov $rks,$rk
___
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1 {@data[0].4s},[$outp]
    ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

{{{
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    // convert length into blocks
    lsr x2,x2,4
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
___
    &load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
    cmp $blocks,#8
    b.lt .Lecb_4_blocks_process
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
$code.=<<___;
    bl _${prefix}_enc_8blks
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lecb_8_blocks_process
    b 100f
.Lecb_4_blocks_process:
    cmp $blocks,#4
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub $blocks,$blocks,#4
1:
    // process last block
    cmp $blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1 {@data[0].4s},[$outp]
    b 100f
1:  // process last 2 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
    cmp $blocks,#2
    b.gt 1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
    b 100f
1:  // process last 3 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
    AARCH64_VALID_CALL_TARGET
    lsr $len,$len,4
___
    &load_sbox();
$code.=<<___;
    cbz $enc,.Ldec
    ld1 {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
    cmp $blocks,#4
    b.lt 1f
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    eor @data[0].16b,@data[0].16b,$ivec0.16b
___
    &rev32(@data[1],@data[1]);
    &rev32(@data[0],@data[0]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &encrypt_1blk_norev(@data[0]);
$code.=<<___;
    eor @data[1].16b,@data[1].16b,@data[0].16b
___
    &encrypt_1blk_norev(@data[1]);
    &rev32(@data[0],@data[0]);

$code.=<<___;
    eor @data[2].16b,@data[2].16b,@data[1].16b
___
    &encrypt_1blk_norev(@data[2]);
    &rev32(@data[1],@data[1]);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,@data[2].16b
___
    &encrypt_1blk_norev(@data[3]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    orr $ivec0.16b,@data[3].16b,@data[3].16b
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.ne .Lcbc_4_blocks_enc
    b 2f
1:
    subs $blocks,$blocks,#1
    b.lt 2f
    ld1 {@data[0].4s},[$inp],#16
    eor $ivec0.16b,$ivec0.16b,@data[0].16b
___
    &rev32($ivec0,$ivec0);
    &encrypt_1blk($ivec0);
$code.=<<___;
    st1 {$ivec0.4s},[$outp],#16
    b 1b
2:
    // save back IV
    st1 {$ivec0.4s},[$ivp]
    ret

.Ldec:
    // decryption mode starts
    AARCH64_SIGN_LINK_REGISTER
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
    cmp $blocks,#8
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
    add $ptr,$inp,#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
$code.=<<___;
    bl _${prefix}_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);
$code.=<<___;
    ld1 {$ivec1.4s},[$ivp]
    ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    // note ivec1 and vtmpx[3] are reusing the same register
    // care needs to be taken to avoid conflict
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
    // save back IV
    st1 {@vtmpx[3].4s}, [$ivp]
    eor @data[0].16b,@data[0].16b,@datax[3].16b
    eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
    eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
    eor @data[3].16b,@data[3].16b,@vtmpx[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lcbc_8_blocks_dec
    b.eq 100f
1:
    ld1 {$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
    cmp $blocks,#4
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    orr $ivec1.16b,@data[3].16b,@data[3].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@data[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.gt .Lcbc_4_blocks_dec
    // save back IV
    st1 {@data[3].4s}, [$ivp]
    b 100f
1:  // last block
    subs $blocks,$blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp],#16
    // save back IV
    st1 {@data[0].4s}, [$ivp]
___
    &rev32(@datax[0],@data[0]);
    &encrypt_1blk(@datax[0]);
$code.=<<___;
    eor @datax[0].16b,@datax[0].16b,$ivec1.16b
    st1 {@datax[0].4s},[$outp],#16
    b 100f
1:  // last two blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
    add $ptr,$inp,#16
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
    subs $blocks,$blocks,1
    b.gt 1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save back IV
    st1 {@data[1].4s}, [$ivp]
    b 100f
1:  // last 3 blocks
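    // lanes 0 and 1 of @data were already loaded before the branch;
    // fetch lane 2 of the third block here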
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _${prefix}_enc_4blks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save back IV
    st1 {@data[2].4s}, [$ivp]
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
    AARCH64_VALID_CALL_TARGET
    ld1 {$ivec.4s},[$ivp]
___
    &rev32($ivec,$ivec);
    &load_sbox();
$code.=<<___;
    cmp $blocks,#1
    b.ne 1f
    // fast path for a single block, without the
    // context-saving overhead
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1 {@data[0].4s},[$inp]
    eor @data[0].16b,@data[0].16b,$ivec.16b
    st1 {@data[0].4s},[$outp]
    ret
1:
    AARCH64_SIGN_LINK_REGISTER
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
    mov $word0,$ivec.s[0]
    mov $word1,$ivec.s[1]
    mov $word2,$ivec.s[2]
    mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
    cmp $blocks,#4
    b.lt 1f
    dup @data[0].4s,$word0
    dup @data[1].4s,$word1
    dup @data[2].4s,$word2
    mov @data[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[1],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[2],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[3],$ctr
    add $ctr,$ctr,#1
    cmp $blocks,#8
    b.ge .Lctr32_8_blocks_process
    bl _${prefix}_enc_4blks
    ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.ne .Lctr32_4_blocks_process
    b 100f
.Lctr32_8_blocks_process:
    dup @datax[0].4s,$word0
    dup @datax[1].4s,$word1
    dup @datax[2].4s,$word2
    mov @datax[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[1],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[2],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[3],$ctr
    add $ctr,$ctr,#1
    bl _${prefix}_enc_8blks
    ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    eor @data[0].16b,@data[0].16b,@datax[0].16b
    eor @data[1].16b,@data[1].16b,@datax[1].16b
    eor @data[2].16b,@data[2].16b,@datax[2].16b
    eor @data[3].16b,@data[3].16b,@datax[3].16b
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.ne .Lctr32_4_blocks_process
    b 100f
1:  // last block processing
    subs $blocks,$blocks,#1
    b.lt 100f
    b.gt 1f
    mov $ivec.s[0],$word0
    mov $ivec.s[1],$word1
    mov $ivec.s[2],$word2
    mov $ivec.s[3],$ctr
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1 {@data[0].4s},[$inp]
    eor @data[0].16b,@data[0].16b,$ivec.16b
    st1 {@data[0].4s},[$outp]
    b 100f
1:  // last 2 blocks processing
    dup @data[0].4s,$word0
    dup @data[1].4s,$word1
    dup @data[2].4s,$word2
    mov @data[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[1],$ctr
    subs $blocks,$blocks,#1
    b.ne 1f
    bl _${prefix}_enc_4blks
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    b 100f
1:  // last 3 blocks processing
    add $ctr,$ctr,#1
    mov @data[3].s[2],$ctr
    bl _${prefix}_enc_4blks
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}


{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=map("v$_",(16..23));
my $lastTweak=("v25");

sub gen_xts_cipher() {
    my $std = shift;
$code.=<<___;
.globl ${prefix}_xts_encrypt${std}
.type ${prefix}_xts_encrypt${std},%function
.align 5
${prefix}_xts_encrypt${std}:
    AARCH64_SIGN_LINK_REGISTER
    stp x15, x16, [sp, #-0x10]!
    stp x17, x18, [sp, #-0x10]!
    stp x19, x20, [sp, #-0x10]!
    stp x21, x22, [sp, #-0x10]!
    stp x23, x24, [sp, #-0x10]!
    stp x25, x26, [sp, #-0x10]!
    stp x27, x28, [sp, #-0x10]!
    stp x29, x30, [sp, #-0x10]!
    stp d8, d9, [sp, #-0x10]!
    stp d10, d11, [sp, #-0x10]!
    stp d12, d13, [sp, #-0x10]!
    stp d14, d15, [sp, #-0x10]!
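    // x3: round keys for the data blocks, x4: round keys for the
    // initial tweak; x5: iv, w6: 1 for encryption, 0 for decryption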
    mov $rks1,x3
    mov $rks2,x4
    mov $enc,w6
    ld1 {@tweak[0].4s}, [$ivp]
    mov $rks,$rks2
___
    &load_sbox();
    &rev32(@tweak[0],@tweak[0]);
    &encrypt_1blk(@tweak[0]);
$code.=<<___;
    mov $rks,$rks1
    and $remain,$len,#0x0F
    // convert length into blocks
    lsr $blocks,$len,4
    cmp $blocks,#1
    b.lt .return${std}

    cmp $remain,0
    // If the encryption/decryption length is a multiple of 16,
    // all blocks are handled in .xts_encrypt_blocks${std}
    b.eq .xts_encrypt_blocks${std}

    // If the encryption/decryption length is not a multiple of 16,
    // the last two blocks are handled in .last_2blks_tweak${std} or
    // .only_2blks_tweak${std}; the remaining blocks are handled in
    // .xts_encrypt_blocks${std}
    subs $blocks,$blocks,#1
    b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rev32_armeb(@tweak[0],@tweak[0]);
    &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
    cmp $blocks,#8
___
    &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
    &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
    &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
    b.lt .Lxts_4_blocks_process${std}
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
    &rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
    eor @data[3].16b, @data[3].16b, @tweak[3].16b
    ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rbit(@tweak[4],@tweak[4],$std);
    &rbit(@tweak[5],@tweak[5],$std);
    &rbit(@tweak[6],@tweak[6],$std);
    &rbit(@tweak[7],@tweak[7],$std);
$code.=<<___;
    eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
    eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
    eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
    eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
    &transpose(@data,@vtmp);
    &transpose(@datax,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
    eor @data[0].16b, @data[0].16b, @tweak[4].16b
    eor @data[1].16b, @data[1].16b, @tweak[5].16b
    eor @data[2].16b, @data[2].16b, @tweak[6].16b
    eor @data[3].16b, @data[3].16b, @tweak[7].16b

    // save the last tweak
    mov $lastTweak.16b,@tweak[7].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lxts_8_blocks_process${std}
    b 100f
.Lxts_4_blocks_process${std}:
    cmp $blocks,#4
    b.lt 1f
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
    &rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
    eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub $blocks,$blocks,#4
    mov @tweak[0].16b,@tweak[4].16b
    mov @tweak[1].16b,@tweak[5].16b
    mov @tweak[2].16b,@tweak[6].16b
    // save the last tweak
    mov $lastTweak.16b,@tweak[3].16b
1:
    // process last block
    cmp $blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp],#16
___
    &rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    st1 {@data[0].4s},[$outp],#16
    // save the last tweak
    mov $lastTweak.16b,@tweak[0].16b
    b 100f
1:  // process last 2 blocks
    cmp $blocks,#2
    b.gt 1f
    ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save the last tweak
    mov $lastTweak.16b,@tweak[1].16b
    b 100f
1:  // process last 3 blocks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save the last tweak
    mov $lastTweak.16b,@tweak[2].16b
100:
    cmp $remain,0
    b.eq .return${std}

// This branch calculates the last two tweaks, used when the
// encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
___
    &rev32_armeb($lastTweak,$lastTweak);
    &compute_tweak_vec($lastTweak,@tweak[1],$std);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b .check_dec${std}


// This branch calculates the last two tweaks, used when the
// encryption/decryption length is exactly 32 and only two tweaks
// are needed
.only_2blks_tweak${std}:
    mov @tweak[1].16b,@tweak[0].16b
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
    // encryption:1 decryption:0
    cmp $enc,1
    b.eq .process_last_2blks${std}
    mov @vtmp[0].16b,@tweak[1].16b
    mov @tweak[1].16b,@tweak[2].16b
    mov @tweak[2].16b,@vtmp[0].16b

.process_last_2blks${std}:
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
    ld1 {@data[0].4s},[$inp],#16
    eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[1].16b
    st1 {@data[0].4s},[$outp],#16

    sub $lastBlk,$outp,16
    // ciphertext stealing: swap the $remain tail bytes of the last full
    // output block with the remaining input bytes, then re-encrypt it
.loop${std}:
    subs $remain,$remain,1
    ldrb $wtmp0,[$lastBlk,$remain]
    ldrb $wtmp1,[$inp,$remain]
    strb $wtmp1,[$lastBlk,$remain]
    strb $wtmp0,[$outp,$remain]
    b.gt .loop${std}
    ld1 {@data[0].4s}, [$lastBlk]
    eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[2].16b
    st1 {@data[0].4s}, [$lastBlk]
.return${std}:
    ldp d14, d15, [sp], #0x10
    ldp d12, d13, [sp], #0x10
    ldp d10, d11, [sp], #0x10
    ldp d8, d9, [sp], #0x10
    ldp x29, x30, [sp], #0x10
    ldp x27, x28, [sp], #0x10
    ldp x25, x26, [sp], #0x10
    ldp x23, x24, [sp], #0x10
    ldp x21, x22, [sp], #0x10
    ldp x19, x20, [sp], #0x10
    ldp x17, x18, [sp], #0x10
    ldp x15, x16, [sp], #0x10
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}

########################################
open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";