#! /usr/bin/env perl
# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD on aarch64
#
# Feb 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32 $dst.16b,$src.16b
#else
    mov $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32 $dst.16b,$dst.16b
#endif
___
    }
}

sub rev32_armeb() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32 $dst.16b,$src.16b
#else
    mov $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32 $dst.16b,$dst.16b
#endif
___
    }
}

sub rbit() {
    my $dst = shift;
    my $src = shift;
    my $std = shift;

    if ($src and ("$src" ne "$dst")) {
        if ($std eq "_gb") {
$code.=<<___;
    rbit $dst.16b,$src.16b
___
        } else {
$code.=<<___;
    mov $dst.16b,$src.16b
___
        }
    } else {
        if ($std eq "_gb") {
$code.=<<___;
    rbit $dst.16b,$dst.16b
___
        }
    }
}

sub transpose() {
    my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
    zip1 $vt0.4s,$dat0.4s,$dat1.4s
    zip2 $vt1.4s,$dat0.4s,$dat1.4s
    zip1 $vt2.4s,$dat2.4s,$dat3.4s
    zip2 $vt3.4s,$dat2.4s,$dat3.4s
    zip1 $dat0.2d,$vt0.2d,$vt2.2d
    zip2 $dat1.2d,$vt0.2d,$vt2.2d
    zip1 $dat2.2d,$vt1.2d,$vt3.2d
    zip2 $dat3.2d,$vt1.2d,$vt3.2d
___
}

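# The 256-byte SM4 S-box is preloaded into @sbox[0..15] (see load_sbox).
# A single tbl instruction can only index 64 table bytes, so each lookup
# below is split into four tbl operations over @sbox[0..3], @sbox[4..7],
# @sbox[8..11] and @sbox[12..15], with the index biased by 0/64/128/192.
# Out-of-range lanes return zero, so the four partial results can simply
# be added together. The ushr/sli pairs that follow implement the SM4
# linear transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24),
# each rotate being built from a shift-right plus shift-left-and-insert.
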
# sbox operations for 4-lane of words
sub sbox() {
    my $dat = shift;

$code.=<<___;
    movi @vtmp[0].16b,#64
    movi @vtmp[1].16b,#128
    movi @vtmp[2].16b,#192
    sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
    sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
    sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
    tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
    tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
    tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
    tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
    add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
    add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
    add $dat.2d,@vtmp[0].2d,@vtmp[2].2d

    ushr @vtmp[0].4s,$dat.4s,32-2
    sli @vtmp[0].4s,$dat.4s,2
    ushr @vtmp[2].4s,$dat.4s,32-10
    eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
    sli @vtmp[2].4s,$dat.4s,10
    eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
    ushr @vtmp[0].4s,$dat.4s,32-18
    sli @vtmp[0].4s,$dat.4s,18
    ushr @vtmp[2].4s,$dat.4s,32-24
    eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
    sli @vtmp[2].4s,$dat.4s,24
    eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}

# sbox operation for 8-lane of words
sub sbox_double() {
    my $dat = shift;
    my $datx = shift;

$code.=<<___;
    movi @vtmp[3].16b,#64
    sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
    sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
    sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
    tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
    tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
    tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
    tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
    add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
    add $dat.2d,@vtmp[2].2d,$dat.2d
    add $dat.2d,@vtmp[1].2d,$dat.2d

    sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
    sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
    sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
    tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
    tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
    tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
    tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
    add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
    add $datx.2d,@vtmp[2].2d,$datx.2d
    add $datx.2d,@vtmp[1].2d,$datx.2d

    ushr @vtmp[0].4s,$dat.4s,32-2
    sli @vtmp[0].4s,$dat.4s,2
    ushr @vtmp[2].4s,$datx.4s,32-2
    eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
    sli @vtmp[2].4s,$datx.4s,2

    ushr @vtmp[0].4s,$dat.4s,32-10
    eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
    sli @vtmp[0].4s,$dat.4s,10
    ushr @vtmp[2].4s,$datx.4s,32-10
    eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
    sli @vtmp[2].4s,$datx.4s,10

    ushr @vtmp[0].4s,$dat.4s,32-18
    eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
    sli @vtmp[0].4s,$dat.4s,18
    ushr @vtmp[2].4s,$datx.4s,32-18
    eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
    sli @vtmp[2].4s,$datx.4s,18

    ushr @vtmp[0].4s,$dat.4s,32-24
    eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
    sli @vtmp[0].4s,$dat.4s,24
    ushr @vtmp[2].4s,$datx.4s,32-24
    eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
    sli @vtmp[2].4s,$datx.4s,24
    eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}

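# For a single word the same T transform is done with one set of vector
# lookups and with the linear transform folded into scalar eor/ror:
# ror #(32-n) of a 32-bit register is a left rotation by n, so the chain
# at the end of sbox_1word computes B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24).
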
# sbox operation for one single word
sub sbox_1word () {
    my $word = shift;

$code.=<<___;
    movi @vtmp[1].16b,#64
    movi @vtmp[2].16b,#128
    movi @vtmp[3].16b,#192
    mov @vtmp[0].s[0],$word

    sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
    sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
    sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b

    tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
    tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
    tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
    tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b

    mov $word,@vtmp[0].s[0]
    mov $wtmp0,@vtmp[1].s[0]
    mov $wtmp2,@vtmp[2].s[0]
    add $wtmp0,$word,$wtmp0
    mov $word,@vtmp[3].s[0]
    add $wtmp0,$wtmp0,$wtmp2
    add $wtmp0,$wtmp0,$word

    eor $word,$wtmp0,$wtmp0,ror #32-2
    eor $word,$word,$wtmp0,ror #32-10
    eor $word,$word,$wtmp0,ror #32-18
    eor $word,$word,$wtmp0,ror #32-24
___
}

# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor $tmpw,$word2,$word3
    eor $wtmp2,$wtmp0,$word1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word0,$word0,$tmpw
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor $tmpw,$word2,$word3
    eor $wtmp2,$word0,$wtmp1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor $word1,$word1,$tmpw
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor $tmpw,$word0,$word1
    eor $wtmp2,$wtmp0,$word3
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word2,$word2,$tmpw
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor $tmpw,$word0,$word1
    eor $wtmp2,$word2,$wtmp1
    eor $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor $word3,$word3,$tmpw
___
}

# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    dup $rk0.4s,$wtmp0
    dup $rk1.4s,$wtmp1

    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor $rka.16b,@data[2].16b,@data[3].16b
    eor $rk0.16b,@data[1].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor @data[0].16b,@data[0].16b,$rk0.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor $rka.16b,$rka.16b,@data[0].16b
    eor $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor @data[1].16b,@data[1].16b,$rk1.16b

    dup $rk0.4s,$wtmp0
    dup $rk1.4s,$wtmp1

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor $rka.16b,@data[0].16b,@data[1].16b
    eor $rk0.16b,@data[3].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor @data[2].16b,@data[2].16b,$rk0.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor $rka.16b,$rka.16b,@data[2].16b
    eor $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,$rk1.16b
___
}

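# Each of the sm4_*blks helpers performs four SM4 rounds (one ldp fetches
# two round keys); the callers run them in a loop of eight iterations to
# cover all 32 rounds. The 8-lane variant keeps a second group of four
# blocks in @datax and interleaves the two groups so that their dependency
# chains can overlap.
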
# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
    my $kptr = shift;

$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    dup $rk0.4s,$wtmp0
    eor $rka.16b,@data[2].16b,@data[3].16b
    eor $rkb.16b,@datax[2].16b,@datax[3].16b
    eor @vtmp[0].16b,@data[1].16b,$rk0.16b
    eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,@vtmp[0].16b
    eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[0].16b,@data[0].16b,$rk0.16b
    eor @datax[0].16b,@datax[0].16b,$rk1.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    dup $rk1.4s,$wtmp1
    eor $rka.16b,$rka.16b,@data[0].16b
    eor $rkb.16b,$rkb.16b,@datax[0].16b
    eor $rk0.16b,$rka.16b,$rk1.16b
    eor $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    ldp $wtmp0,$wtmp1,[$kptr],8
    eor @data[1].16b,@data[1].16b,$rk0.16b
    eor @datax[1].16b,@datax[1].16b,$rk1.16b

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    dup $rk0.4s,$wtmp0
    eor $rka.16b,@data[0].16b,@data[1].16b
    eor $rkb.16b,@datax[0].16b,@datax[1].16b
    eor @vtmp[0].16b,@data[3].16b,$rk0.16b
    eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
    eor $rk0.16b,$rka.16b,@vtmp[0].16b
    eor $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[2].16b,@data[2].16b,$rk0.16b
    eor @datax[2].16b,@datax[2].16b,$rk1.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    dup $rk1.4s,$wtmp1
    eor $rka.16b,$rka.16b,@data[2].16b
    eor $rkb.16b,$rkb.16b,@datax[2].16b
    eor $rk0.16b,$rka.16b,$rk1.16b
    eor $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,$rk0.16b
    eor @datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
    my $dat = shift;

$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
    mov $word0,$dat.s[0]
    mov $word1,$dat.s[1]
    mov $word2,$dat.s[2]
    mov $word3,$dat.s[3]
10:
___
    &sm4_1blk($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
    mov $dat.s[0],$word3
    mov $dat.s[1],$word2
    mov $dat.s[2],$word1
    mov $dat.s[3],$word0
___
}

sub encrypt_1blk() {
    my $dat = shift;

    &encrypt_1blk_norev($dat);
    &rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
10:
___
    &sm4_4blks($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
    mov $ptr,$rks
    mov $counter,#8
10:
___
    &sm4_8blks($ptr);
$code.=<<___;
    subs $counter,$counter,#1
    b.ne 10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
    &rev32(@data[3],@datax[0]);
    &rev32(@data[2],@datax[1]);
    &rev32(@data[1],@datax[2]);
    &rev32(@data[0],@datax[3]);
}

sub load_sbox () {
    my $data = shift;

$code.=<<___;
    adrp $ptr,.Lsbox
    add $ptr,$ptr,#:lo12:.Lsbox
    ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
    ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
    ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
    ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
___
}


sub mov_reg_to_vec() {
    my $src0 = shift;
    my $src1 = shift;
    my $desv = shift;
$code.=<<___;
    mov $desv.d[0],$src0
    mov $desv.d[1],$src1
___
    &rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
    my $srcv = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov $des0,$srcv.d[0]
    mov $des1,$srcv.d[1]
___
}

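# XTS tweak update: multiply the 128-bit tweak by x in GF(2^128) with the
# reduction polynomial x^128 + x^7 + x^2 + x + 1 (constant 0x87).
# compute_tweak works on a tweak held in two general-purpose registers;
# compute_tweak_vec is the vector equivalent and uses .Lxts_magic to
# propagate the per-byte carries. For the "_gb" flavour the tweak is
# rbit-reversed before and after the doubling, so the same arithmetic
# serves that variant's reversed bit ordering.
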
sub compute_tweak() {
    my $src0 = shift;
    my $src1 = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov $wtmp0,0x87
    extr $xtmp2,$src1,$src1,#32
    extr $des1,$src1,$src0,#63
    and $wtmp1,$wtmp0,$wtmp2,asr#31
    eor $des0,$xtmp1,$src0,lsl#1
___
}

sub compute_tweak_vec() {
    my $src = shift;
    my $des = shift;
    my $std = shift;
    &rbit(@vtmp[2],$src,$std);
$code.=<<___;
    adrp $ptr,.Lxts_magic
    ldr @qtmp[0], [$ptr, #:lo12:.Lxts_magic]
    shl $des.16b, @vtmp[2].16b, #1
    ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
    ushr @vtmp[1].16b, @vtmp[1].16b, #7
    mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
    eor $des.16b, $des.16b, @vtmp[1].16b
___
    &rbit($des,$des,$std);
}

$code=<<___;
#include "arm_arch.h"
.arch armv8-a
.text

.rodata
.type _${prefix}_consts,%object
.align 7
_${prefix}_consts:
.Lsbox:
    .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
    .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
    .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
    .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
    .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
    .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
    .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
    .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
    .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
    .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
    .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
    .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
    .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
    .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
    .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
    .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
    .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
    .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
    .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
    .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
    .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
    .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
    .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
    .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
    .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
    .quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
    .quad 0x0101010101010187,0x0101010101010101

.size _${prefix}_consts,.-_${prefix}_consts

.previous

___

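# Key schedule: the 128-bit user key is XORed with the FK constants (.Lfk)
# and then expanded over 32 iterations. Each iteration applies the S-box
# to K1^K2^K3^CK[i] (.Lck) and the key-schedule linear transform
# L'(B) = B ^ (B<<<13) ^ (B<<<23) (the ror #19/ror #9 below), XORs the
# result into K0 and rotates the four key words via the .Lshuffles tbl
# mask. Encryption round keys are stored in order; for decryption the
# same keys are written back to front, since SM4 decryption is encryption
# with the round keys reversed.
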
{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type _vpsm4_set_key,%function
.align 4
_vpsm4_set_key:
    AARCH64_VALID_CALL_TARGET
    ld1 {$vkey.4s},[$key]
___
    &load_sbox();
    &rev32($vkey,$vkey);
$code.=<<___;
    adrp $pointer,.Lshuffles
    add $pointer,$pointer,#:lo12:.Lshuffles
    ld1 {$vmap.2d},[$pointer]
    adrp $pointer,.Lfk
    add $pointer,$pointer,#:lo12:.Lfk
    ld1 {$vfk.2d},[$pointer]
    eor $vkey.16b,$vkey.16b,$vfk.16b
    mov $schedules,#32
    adrp $pointer,.Lck
    add $pointer,$pointer,#:lo12:.Lck
    movi @vtmp[0].16b,#64
    cbnz $enc,1f
    add $keys,$keys,124
1:
    mov $wtmp,$vkey.s[1]
    ldr $roundkey,[$pointer],#4
    eor $roundkey,$roundkey,$wtmp
    mov $wtmp,$vkey.s[2]
    eor $roundkey,$roundkey,$wtmp
    mov $wtmp,$vkey.s[3]
    eor $roundkey,$roundkey,$wtmp
    // sbox lookup
    mov @data[0].s[0],$roundkey
    tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
    sub @data[0].16b,@data[0].16b,@vtmp[0].16b
    tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
    sub @data[0].16b,@data[0].16b,@vtmp[0].16b
    tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
    sub @data[0].16b,@data[0].16b,@vtmp[0].16b
    tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
    mov $wtmp,@vtmp[1].s[0]
    eor $roundkey,$wtmp,$wtmp,ror #19
    eor $roundkey,$roundkey,$wtmp,ror #9
    mov $wtmp,$vkey.s[0]
    eor $roundkey,$roundkey,$wtmp
    mov $vkey.s[0],$roundkey
    cbz $enc,2f
    str $roundkey,[$keys],#4
    b 3f
2:
    str $roundkey,[$keys],#-4
3:
    tbl $vkey.16b,{$vkey.16b},$vmap.16b
    subs $schedules,$schedules,#1
    b.ne 1b
    ret
.size _vpsm4_set_key,.-_vpsm4_set_key
___
}}}


{{{
$code.=<<___;
.type _vpsm4_enc_4blks,%function
.align 4
_vpsm4_enc_4blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_4blks();
$code.=<<___;
    ret
.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
___
}}}

{{{
$code.=<<___;
.type _vpsm4_enc_8blks,%function
.align 4
_vpsm4_enc_8blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_8blks();
$code.=<<___;
    ret
.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    mov w2,1
    bl _vpsm4_set_key
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    mov w2,0
    bl _vpsm4_set_key
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
    my $dir = shift;
    my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
    AARCH64_VALID_CALL_TARGET
    ld1 {@data[0].4s},[$inp]
___
    &load_sbox();
    &rev32(@data[0],@data[0]);
$code.=<<___;
    mov $rks,x2
___
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1 {@data[0].4s},[$outp]
    ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

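# ECB: the byte length is converted to a block count, the main loop then
# encrypts eight blocks per iteration with _vpsm4_enc_8blks, falling back
# to a four-block pass and finally to a 1-3 block tail that is gathered
# into vector lanes with ld4/st4 single-structure accesses.
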
{{{
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));

$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    // convert length into blocks
    lsr x2,x2,4
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
___
    &load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
    cmp $blocks,#8
    b.lt .Lecb_4_blocks_process
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
$code.=<<___;
    bl _vpsm4_enc_8blks
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lecb_8_blocks_process
    b 100f
.Lecb_4_blocks_process:
    cmp $blocks,#4
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _vpsm4_enc_4blks
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub $blocks,$blocks,#4
1:
    // process last block
    cmp $blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1 {@data[0].4s},[$outp]
    b 100f
1: // process last 2 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
    cmp $blocks,#2
    b.gt 1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _vpsm4_enc_4blks
    st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
    b 100f
1: // process last 3 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _vpsm4_enc_4blks
    st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
    st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

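# CBC: encryption is inherently serial because each block is chained to
# the previous ciphertext, so it is done one block at a time with the
# scalar-register round function. Decryption has no such dependency and
# is done eight (or four) blocks at a time; the ciphertext blocks needed
# for the final XOR are reloaded from the input stream.
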
{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
    AARCH64_VALID_CALL_TARGET
    lsr $len,$len,4
___
    &load_sbox();
$code.=<<___;
    cbz $enc,.Ldec
    ld1 {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
    cmp $blocks,#4
    b.lt 1f
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    eor @data[0].16b,@data[0].16b,$ivec0.16b
___
    &rev32(@data[1],@data[1]);
    &rev32(@data[0],@data[0]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &encrypt_1blk_norev(@data[0]);
$code.=<<___;
    eor @data[1].16b,@data[1].16b,@data[0].16b
___
    &encrypt_1blk_norev(@data[1]);
    &rev32(@data[0],@data[0]);

$code.=<<___;
    eor @data[2].16b,@data[2].16b,@data[1].16b
___
    &encrypt_1blk_norev(@data[2]);
    &rev32(@data[1],@data[1]);
$code.=<<___;
    eor @data[3].16b,@data[3].16b,@data[2].16b
___
    &encrypt_1blk_norev(@data[3]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    orr $ivec0.16b,@data[3].16b,@data[3].16b
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.ne .Lcbc_4_blocks_enc
    b 2f
1:
    subs $blocks,$blocks,#1
    b.lt 2f
    ld1 {@data[0].4s},[$inp],#16
    eor $ivec0.16b,$ivec0.16b,@data[0].16b
___
    &rev32($ivec0,$ivec0);
    &encrypt_1blk($ivec0);
$code.=<<___;
    st1 {$ivec0.4s},[$outp],#16
    b 1b
2:
    // save back IV
    st1 {$ivec0.4s},[$ivp]
    ret

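    // SM4 decryption uses the same round function with the round keys in
    // reverse order (see _vpsm4_set_key), so the "enc" helpers below
    // decrypt when called with a decryption key schedule.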
.Ldec:
    // decryption mode starts
    AARCH64_SIGN_LINK_REGISTER
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
    cmp $blocks,#8
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
    add $ptr,$inp,#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],$data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],$datax[3]);
$code.=<<___;
    bl _vpsm4_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);
$code.=<<___;
    ld1 {$ivec1.4s},[$ivp]
    ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    // note ivec1 and vtmpx[3] are reusing the same register
    // care needs to be taken to avoid conflict
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
    eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
    // save back IV
    st1 {$vtmpx[3].4s}, [$ivp]
    eor @data[0].16b,@data[0].16b,$datax[3].16b
    eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
    eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
    eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lcbc_8_blocks_dec
    b.eq 100f
1:
    ld1 {$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
    cmp $blocks,#4
    b.lt 1f
    ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],$data[3]);
$code.=<<___;
    bl _vpsm4_enc_4blks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    orr $ivec1.16b,@data[3].16b,@data[3].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.gt .Lcbc_4_blocks_dec
    // save back IV
    st1 {@data[3].4s}, [$ivp]
    b 100f
1: // last block
    subs $blocks,$blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp],#16
    // save back IV
    st1 {$data[0].4s}, [$ivp]
___
    &rev32(@datax[0],@data[0]);
    &encrypt_1blk(@datax[0]);
$code.=<<___;
    eor @datax[0].16b,@datax[0].16b,$ivec1.16b
    st1 {@datax[0].4s},[$outp],#16
    b 100f
1: // last two blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
    add $ptr,$inp,#16
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
    subs $blocks,$blocks,1
    b.gt 1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _vpsm4_enc_4blks
    ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save back IV
    st1 {@data[1].4s}, [$ivp]
    b 100f
1: // last 3 blocks
    ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl _vpsm4_enc_4blks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save back IV
    st1 {@data[2].4s}, [$ivp]
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

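# CTR (32-bit counter): the IV/counter block is kept in scalar registers
# with the counter in the last word. Keystream blocks are built with dup
# plus per-lane counter inserts, encrypted four or eight at a time and
# XORed onto the input; a single-block request takes a fast path with no
# stack frame.
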
{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
    AARCH64_VALID_CALL_TARGET
    ld1 {$ivec.4s},[$ivp]
___
    &rev32($ivec,$ivec);
    &load_sbox();
$code.=<<___;
    cmp $blocks,#1
    b.ne 1f
    // fast processing for one single block without
    // context saving overhead
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1 {@data[0].4s},[$inp]
    eor @data[0].16b,@data[0].16b,$ivec.16b
    st1 {@data[0].4s},[$outp]
    ret
1:
    AARCH64_SIGN_LINK_REGISTER
    stp d8,d9,[sp,#-80]!
    stp d10,d11,[sp,#16]
    stp d12,d13,[sp,#32]
    stp d14,d15,[sp,#48]
    stp x29,x30,[sp,#64]
    mov $word0,$ivec.s[0]
    mov $word1,$ivec.s[1]
    mov $word2,$ivec.s[2]
    mov $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
    cmp $blocks,#4
    b.lt 1f
    dup @data[0].4s,$word0
    dup @data[1].4s,$word1
    dup @data[2].4s,$word2
    mov @data[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov $data[3].s[1],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[2],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[3],$ctr
    add $ctr,$ctr,#1
    cmp $blocks,#8
    b.ge .Lctr32_8_blocks_process
    bl _vpsm4_enc_4blks
    ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs $blocks,$blocks,#4
    b.ne .Lctr32_4_blocks_process
    b 100f
.Lctr32_8_blocks_process:
    dup @datax[0].4s,$word0
    dup @datax[1].4s,$word1
    dup @datax[2].4s,$word2
    mov @datax[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov $datax[3].s[1],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[2],$ctr
    add $ctr,$ctr,#1
    mov @datax[3].s[3],$ctr
    add $ctr,$ctr,#1
    bl _vpsm4_enc_8blks
    ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    eor @data[0].16b,@data[0].16b,@datax[0].16b
    eor @data[1].16b,@data[1].16b,@datax[1].16b
    eor @data[2].16b,@data[2].16b,@datax[2].16b
    eor @data[3].16b,@data[3].16b,@datax[3].16b
    st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.ne .Lctr32_4_blocks_process
    b 100f
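    // 1-3 remaining blocks: the counter blocks are still built one block
    // per lane in the four data registers and encrypted with the
    // four-block primitive; only the valid lanes are loaded and stored.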
1: // last block processing
    subs $blocks,$blocks,#1
    b.lt 100f
    b.gt 1f
    mov $ivec.s[0],$word0
    mov $ivec.s[1],$word1
    mov $ivec.s[2],$word2
    mov $ivec.s[3],$ctr
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1 {@data[0].4s},[$inp]
    eor @data[0].16b,@data[0].16b,$ivec.16b
    st1 {@data[0].4s},[$outp]
    b 100f
1: // last 2 blocks processing
    dup @data[0].4s,$word0
    dup @data[1].4s,$word1
    dup @data[2].4s,$word2
    mov @data[3].s[0],$ctr
    add $ctr,$ctr,#1
    mov @data[3].s[1],$ctr
    subs $blocks,$blocks,#1
    b.ne 1f
    bl _vpsm4_enc_4blks
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    b 100f
1: // last 3 blocks processing
    add $ctr,$ctr,#1
    mov @data[3].s[2],$ctr
    bl _vpsm4_enc_4blks
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
    eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
    ldp d10,d11,[sp,#16]
    ldp d12,d13,[sp,#32]
    ldp d14,d15,[sp,#48]
    ldp x29,x30,[sp,#64]
    ldp d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}

{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=@datax;

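# XTS: x3 holds the key schedule used for the data blocks and x4 the one
# used to encrypt the IV into the initial tweak. Subsequent tweaks are
# derived by GF(2^128) doubling (compute_tweak/compute_tweak_vec above);
# when the length is not a multiple of 16 the tail is handled with
# ciphertext stealing. gen_xts_cipher is instantiated twice: once with
# the "_gb" suffix, which runs the tweak arithmetic on bit-reversed
# (rbit) values for the GB/T variant of XTS, and once for the
# conventional ordering.
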
sub gen_xts_cipher() {
    my $std = shift;
$code.=<<___;
.globl ${prefix}_xts_encrypt${std}
.type ${prefix}_xts_encrypt${std},%function
.align 5
${prefix}_xts_encrypt${std}:
    AARCH64_SIGN_LINK_REGISTER
    stp x15, x16, [sp, #-0x10]!
    stp x17, x18, [sp, #-0x10]!
    stp x19, x20, [sp, #-0x10]!
    stp x21, x22, [sp, #-0x10]!
    stp x23, x24, [sp, #-0x10]!
    stp x25, x26, [sp, #-0x10]!
    stp x27, x28, [sp, #-0x10]!
    stp x29, x30, [sp, #-0x10]!
    stp d8, d9, [sp, #-0x10]!
    stp d10, d11, [sp, #-0x10]!
    stp d12, d13, [sp, #-0x10]!
    stp d14, d15, [sp, #-0x10]!
    mov $rks1,x3
    mov $rks2,x4
    mov $enc,w6
    ld1 {@tweak[0].4s}, [$ivp]
    mov $rks,$rks2
___
    &load_sbox();
    &rev32(@tweak[0],@tweak[0]);
    &encrypt_1blk(@tweak[0]);
$code.=<<___;
    mov $rks,$rks1
    and $remain,$len,#0x0F
    // convert length into blocks
    lsr $blocks,$len,4
    cmp $blocks,#1
    b.lt .return${std}

    cmp $remain,0
    // If the encryption/decryption length is a multiple of 16,
    // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
    b.eq .xts_encrypt_blocks${std}

    // If the encryption/decryption length is not a multiple of 16,
    // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
    // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
    subs $blocks,$blocks,#1
    b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rev32_armeb(@tweak[0],@tweak[0]);
    &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
    cmp $blocks,#8
    b.lt .Lxts_4_blocks_process${std}
___
    &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
    &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
    &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
    &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
    &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
    &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
    &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
    &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
$code.=<<___;
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@vtmp[0],@vtmp[0],$std);
    &rbit(@vtmp[1],@vtmp[1],$std);
    &rbit(@vtmp[2],@vtmp[2],$std);
    &rbit(@vtmp[3],@vtmp[3],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @vtmp[0].16b
    eor @data[1].16b, @data[1].16b, @vtmp[1].16b
    eor @data[2].16b, @data[2].16b, @vtmp[2].16b
    eor @data[3].16b, @data[3].16b, @vtmp[3].16b
    ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rbit(@vtmpx[0],@vtmpx[0],$std);
    &rbit(@vtmpx[1],@vtmpx[1],$std);
    &rbit(@vtmpx[2],@vtmpx[2],$std);
    &rbit(@vtmpx[3],@vtmpx[3],$std);
$code.=<<___;
    eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
    eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
    eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
    eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
    &transpose(@data,@vtmp);
    &transpose(@datax,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);

    &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
    &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
    &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
    eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
    eor @data[3].16b, @data[3].16b, @tweak[3].16b

    // save the last tweak
    st1 {@tweak[3].4s},[$ivp]
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs $blocks,$blocks,#8
    b.gt .Lxts_8_blocks_process${std}
    b 100f
.Lxts_4_blocks_process${std}:
___
    &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
    &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
    &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
    &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
$code.=<<___;
    cmp $blocks,#4
    b.lt 1f
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
    &rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
    eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub $blocks,$blocks,#4
___
    &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
    &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
    &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
$code.=<<___;
    // save the last tweak
    st1 {@tweak[3].4s},[$ivp]
1:
    // process last block
    cmp $blocks,#1
    b.lt 100f
    b.gt 1f
    ld1 {@data[0].4s},[$inp],#16
___
    &rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    st1 {@data[0].4s},[$outp],#16
    // save the last tweak
    st1 {@tweak[0].4s},[$ivp]
    b 100f
1: // process last 2 blocks
    cmp $blocks,#2
    b.gt 1f
    ld1 {@data[0].4s,@data[1].4s},[$inp],#32
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save the last tweak
    st1 {@tweak[1].4s},[$ivp]
    b 100f
1: // process last 3 blocks
    ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[0].16b
    eor @data[1].16b, @data[1].16b, @tweak[1].16b
    eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save the last tweak
    st1 {@tweak[2].4s},[$ivp]
100:
    cmp $remain,0
    b.eq .return${std}

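    // Ciphertext stealing for a trailing partial block: the last full
    // block is processed with the second-to-last tweak, the partial tail
    // is then swapped with the head of that ciphertext, and the rebuilt
    // block is processed with the last tweak.  For decryption the two
    // tweaks are used in the opposite order (see .check_dec below).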
// This branch calculates the last two tweaks,
// when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
    ld1 {@tweak[0].4s},[$ivp]
___
    &rev32_armeb(@tweak[0],@tweak[0]);
    &compute_tweak_vec(@tweak[0],@tweak[1],$std);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b .check_dec${std}


// This branch calculates the last two tweaks,
// when the encryption/decryption length is equal to 32, which only needs two tweaks
.only_2blks_tweak${std}:
    mov @tweak[1].16b,@tweak[0].16b
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
    // encryption:1 decryption:0
    cmp $enc,1
    b.eq .process_last_2blks${std}
    mov @vtmp[0].16B,@tweak[1].16b
    mov @tweak[1].16B,@tweak[2].16b
    mov @tweak[2].16B,@vtmp[0].16b

.process_last_2blks${std}:
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
    ld1 {@data[0].4s},[$inp],#16
    eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[1].16b
    st1 {@data[0].4s},[$outp],#16

    sub $lastBlk,$outp,16
.loop${std}:
    subs $remain,$remain,1
    ldrb $wtmp0,[$lastBlk,$remain]
    ldrb $wtmp1,[$inp,$remain]
    strb $wtmp1,[$lastBlk,$remain]
    strb $wtmp0,[$outp,$remain]
    b.gt .loop${std}
    ld1 {@data[0].4s}, [$lastBlk]
    eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor @data[0].16b, @data[0].16b, @tweak[2].16b
    st1 {@data[0].4s}, [$lastBlk]
.return${std}:
    ldp d14, d15, [sp], #0x10
    ldp d12, d13, [sp], #0x10
    ldp d10, d11, [sp], #0x10
    ldp d8, d9, [sp], #0x10
    ldp x29, x30, [sp], #0x10
    ldp x27, x28, [sp], #0x10
    ldp x25, x26, [sp], #0x10
    ldp x23, x24, [sp], #0x10
    ldp x21, x22, [sp], #0x10
    ldp x19, x20, [sp], #0x10
    ldp x17, x18, [sp], #0x10
    ldp x15, x16, [sp], #0x10
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}
########################################
open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";