#! /usr/bin/env perl
# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD on aarch64
#
# Feb 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32    $dst.16b,$src.16b
#else
    mov    $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifndef __AARCH64EB__
    rev32    $dst.16b,$dst.16b
#endif
___
    }
}

sub rev32_armeb() {
    my $dst = shift;
    my $src = shift;

    if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32    $dst.16b,$src.16b
#else
    mov    $dst.16b,$src.16b
#endif
___
    } else {
$code.=<<___;
#ifdef __AARCH64EB__
    rev32    $dst.16b,$dst.16b
#endif
___
    }
}

sub rbit() {
    my $dst = shift;
    my $src = shift;
    my $std = shift;

    if ($src and ("$src" ne "$dst")) {
        if ($std eq "_gb") {
$code.=<<___;
    rbit    $dst.16b,$src.16b
___
        } else {
$code.=<<___;
    mov    $dst.16b,$src.16b
___
        }
    } else {
        if ($std eq "_gb") {
$code.=<<___;
    rbit    $dst.16b,$dst.16b
___
        }
    }
}

sub transpose() {
    my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
    zip1    $vt0.4s,$dat0.4s,$dat1.4s
    zip2    $vt1.4s,$dat0.4s,$dat1.4s
    zip1    $vt2.4s,$dat2.4s,$dat3.4s
    zip2    $vt3.4s,$dat2.4s,$dat3.4s
    zip1    $dat0.2d,$vt0.2d,$vt2.2d
    zip2    $dat1.2d,$vt0.2d,$vt2.2d
    zip1    $dat2.2d,$vt1.2d,$vt3.2d
    zip2    $dat3.2d,$vt1.2d,$vt3.2d
___
}

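# The helpers below implement the SM4 S-box and linear transform with plain
# ASIMD (no SM4 extension instructions).  The 256-byte S-box lives in sixteen
# vector registers (@sbox[0..15]); a byte-wise lookup is done as four tbl
# passes over 64-byte quarters, with the index biased by 64/128/192 so that
# each pass only matches its own quarter (out-of-range tbl lanes yield zero,
# so the four partial results can simply be added).  The ushr/sli pairs that
# follow build the 32-bit rotates of the SM4 linear transform
# L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24).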
# sbox operations for 4 lanes of words
sub sbox() {
    my $dat = shift;

$code.=<<___;
    movi    @vtmp[0].16b,#64
    movi    @vtmp[1].16b,#128
    movi    @vtmp[2].16b,#192
    sub    @vtmp[0].16b,$dat.16b,@vtmp[0].16b
    sub    @vtmp[1].16b,$dat.16b,@vtmp[1].16b
    sub    @vtmp[2].16b,$dat.16b,@vtmp[2].16b
    tbl    $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
    tbl    @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
    tbl    @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
    tbl    @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
    add    @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
    add    @vtmp[2].2d,@vtmp[2].2d,$dat.2d
    add    $dat.2d,@vtmp[0].2d,@vtmp[2].2d

    ushr    @vtmp[0].4s,$dat.4s,32-2
    sli    @vtmp[0].4s,$dat.4s,2
    ushr    @vtmp[2].4s,$dat.4s,32-10
    eor    @vtmp[1].16b,@vtmp[0].16b,$dat.16b
    sli    @vtmp[2].4s,$dat.4s,10
    eor    @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
    ushr    @vtmp[0].4s,$dat.4s,32-18
    sli    @vtmp[0].4s,$dat.4s,18
    ushr    @vtmp[2].4s,$dat.4s,32-24
    eor    @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
    sli    @vtmp[2].4s,$dat.4s,24
    eor    $dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}

# sbox operation for 8 lanes of words
sub sbox_double() {
    my $dat = shift;
    my $datx = shift;

$code.=<<___;
    movi    @vtmp[3].16b,#64
    sub    @vtmp[0].16b,$dat.16b,@vtmp[3].16b
    sub    @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
    sub    @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
    tbl    $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
    tbl    @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
    tbl    @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
    tbl    @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
    add    @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
    add    $dat.2d,@vtmp[2].2d,$dat.2d
    add    $dat.2d,@vtmp[1].2d,$dat.2d

    sub    @vtmp[0].16b,$datx.16b,@vtmp[3].16b
    sub    @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
    sub    @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
    tbl    $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
    tbl    @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
    tbl    @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
    tbl    @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
    add    @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
    add    $datx.2d,@vtmp[2].2d,$datx.2d
    add    $datx.2d,@vtmp[1].2d,$datx.2d

    ushr    @vtmp[0].4s,$dat.4s,32-2
    sli    @vtmp[0].4s,$dat.4s,2
    ushr    @vtmp[2].4s,$datx.4s,32-2
    eor    @vtmp[1].16b,@vtmp[0].16b,$dat.16b
    sli    @vtmp[2].4s,$datx.4s,2

    ushr    @vtmp[0].4s,$dat.4s,32-10
    eor    @vtmp[3].16b,@vtmp[2].16b,$datx.16b
    sli    @vtmp[0].4s,$dat.4s,10
    ushr    @vtmp[2].4s,$datx.4s,32-10
    eor    @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
    sli    @vtmp[2].4s,$datx.4s,10

    ushr    @vtmp[0].4s,$dat.4s,32-18
    eor    @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
    sli    @vtmp[0].4s,$dat.4s,18
    ushr    @vtmp[2].4s,$datx.4s,32-18
    eor    @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
    sli    @vtmp[2].4s,$datx.4s,18

    ushr    @vtmp[0].4s,$dat.4s,32-24
    eor    @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
    sli    @vtmp[0].4s,$dat.4s,24
    ushr    @vtmp[2].4s,$datx.4s,32-24
    eor    $dat.16b,@vtmp[0].16b,@vtmp[1].16b
    sli    @vtmp[2].4s,$datx.4s,24
    eor    $datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}

# sbox operation for one single word
sub sbox_1word () {
    my $word = shift;

$code.=<<___;
    movi    @vtmp[1].16b,#64
    movi    @vtmp[2].16b,#128
    movi    @vtmp[3].16b,#192
    mov    @vtmp[0].s[0],$word

    sub    @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
    sub    @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
    sub    @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b

    tbl    @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
    tbl    @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
    tbl    @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
    tbl    @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b

    mov    $word,@vtmp[0].s[0]
    mov    $wtmp0,@vtmp[1].s[0]
    mov    $wtmp2,@vtmp[2].s[0]
    add    $wtmp0,$word,$wtmp0
    mov    $word,@vtmp[3].s[0]
    add    $wtmp0,$wtmp0,$wtmp2
    add    $wtmp0,$wtmp0,$word

    eor    $word,$wtmp0,$wtmp0,ror #32-2
    eor    $word,$word,$wtmp0,ror #32-10
    eor    $word,$word,$wtmp0,ror #32-18
    eor    $word,$word,$wtmp0,ror #32-24
___
}

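# Each of the sm4_1blk/sm4_4blks/sm4_8blks helpers below performs four SM4
# rounds, consuming four 32-bit round keys from $kptr; their callers loop
# eight times for the full 32 rounds.  A single round is
# B0 ^= L(SBOX(B1 ^ B2 ^ B3 ^ rk)), with the roles of B0..B3 rotating from
# round to round.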
# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
    my $kptr = shift;

$code.=<<___;
    ldp    $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor    $tmpw,$word2,$word3
    eor    $wtmp2,$wtmp0,$word1
    eor    $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor    $word0,$word0,$tmpw
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor    $tmpw,$word2,$word3
    eor    $wtmp2,$word0,$wtmp1
    eor    $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    ldp    $wtmp0,$wtmp1,[$kptr],8
    eor    $word1,$word1,$tmpw
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor    $tmpw,$word0,$word1
    eor    $wtmp2,$wtmp0,$word3
    eor    $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor    $word2,$word2,$tmpw
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor    $tmpw,$word0,$word1
    eor    $wtmp2,$word2,$wtmp1
    eor    $tmpw,$tmpw,$wtmp2
___
    &sbox_1word($tmpw);
$code.=<<___;
    eor    $word3,$word3,$tmpw
___
}

# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
    my $kptr = shift;

$code.=<<___;
    ldp    $wtmp0,$wtmp1,[$kptr],8
    dup    $rk0.4s,$wtmp0
    dup    $rk1.4s,$wtmp1

    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor    $rka.16b,@data[2].16b,@data[3].16b
    eor    $rk0.16b,@data[1].16b,$rk0.16b
    eor    $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor    @data[0].16b,@data[0].16b,$rk0.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor    $rka.16b,$rka.16b,@data[0].16b
    eor    $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    ldp    $wtmp0,$wtmp1,[$kptr],8
    eor    @data[1].16b,@data[1].16b,$rk1.16b

    dup    $rk0.4s,$wtmp0
    dup    $rk1.4s,$wtmp1

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor    $rka.16b,@data[0].16b,@data[1].16b
    eor    $rk0.16b,@data[3].16b,$rk0.16b
    eor    $rk0.16b,$rka.16b,$rk0.16b
___
    &sbox($rk0);
$code.=<<___;
    eor    @data[2].16b,@data[2].16b,$rk0.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor    $rka.16b,$rka.16b,@data[2].16b
    eor    $rk1.16b,$rka.16b,$rk1.16b
___
    &sbox($rk1);
$code.=<<___;
    eor    @data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
    my $kptr = shift;

$code.=<<___;
    ldp    $wtmp0,$wtmp1,[$kptr],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    dup    $rk0.4s,$wtmp0
    eor    $rka.16b,@data[2].16b,@data[3].16b
    eor    $rkb.16b,@datax[2].16b,@datax[3].16b
    eor    @vtmp[0].16b,@data[1].16b,$rk0.16b
    eor    @vtmp[1].16b,@datax[1].16b,$rk0.16b
    eor    $rk0.16b,$rka.16b,@vtmp[0].16b
    eor    $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor    @data[0].16b,@data[0].16b,$rk0.16b
    eor    @datax[0].16b,@datax[0].16b,$rk1.16b

    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    dup    $rk1.4s,$wtmp1
    eor    $rka.16b,$rka.16b,@data[0].16b
    eor    $rkb.16b,$rkb.16b,@datax[0].16b
    eor    $rk0.16b,$rka.16b,$rk1.16b
    eor    $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    ldp    $wtmp0,$wtmp1,[$kptr],8
    eor    @data[1].16b,@data[1].16b,$rk0.16b
    eor    @datax[1].16b,@datax[1].16b,$rk1.16b

    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    dup    $rk0.4s,$wtmp0
    eor    $rka.16b,@data[0].16b,@data[1].16b
    eor    $rkb.16b,@datax[0].16b,@datax[1].16b
    eor    @vtmp[0].16b,@data[3].16b,$rk0.16b
    eor    @vtmp[1].16b,@datax[3].16b,$rk0.16b
    eor    $rk0.16b,$rka.16b,@vtmp[0].16b
    eor    $rk1.16b,$rkb.16b,@vtmp[1].16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor    @data[2].16b,@data[2].16b,$rk0.16b
    eor    @datax[2].16b,@datax[2].16b,$rk1.16b

    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    dup    $rk1.4s,$wtmp1
    eor    $rka.16b,$rka.16b,@data[2].16b
    eor    $rkb.16b,$rkb.16b,@datax[2].16b
    eor    $rk0.16b,$rka.16b,$rk1.16b
    eor    $rk1.16b,$rkb.16b,$rk1.16b
___
    &sbox_double($rk0,$rk1);
$code.=<<___;
    eor    @data[3].16b,@data[3].16b,$rk0.16b
    eor    @datax[3].16b,@datax[3].16b,$rk1.16b
___
}

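# After the 32 rounds SM4 applies the reverse transform R, i.e. the output
# block is (B3,B2,B1,B0).  encrypt_1blk_norev therefore writes the state
# words back in reversed order and leaves the per-word byte order alone;
# callers apply rev32 themselves when they need the bytes swapped back.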
sub encrypt_1blk_norev() {
    my $dat = shift;

$code.=<<___;
    mov    $ptr,$rks
    mov    $counter,#8
    mov    $word0,$dat.s[0]
    mov    $word1,$dat.s[1]
    mov    $word2,$dat.s[2]
    mov    $word3,$dat.s[3]
10:
___
    &sm4_1blk($ptr);
$code.=<<___;
    subs    $counter,$counter,#1
    b.ne    10b
    mov    $dat.s[0],$word3
    mov    $dat.s[1],$word2
    mov    $dat.s[2],$word1
    mov    $dat.s[3],$word0
___
}

sub encrypt_1blk() {
    my $dat = shift;

    &encrypt_1blk_norev($dat);
    &rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
    mov    $ptr,$rks
    mov    $counter,#8
10:
___
    &sm4_4blks($ptr);
$code.=<<___;
    subs    $counter,$counter,#1
    b.ne    10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
    mov    $ptr,$rks
    mov    $counter,#8
10:
___
    &sm4_8blks($ptr);
$code.=<<___;
    subs    $counter,$counter,#1
    b.ne    10b
___
    &rev32(@vtmp[3],@data[0]);
    &rev32(@vtmp[2],@data[1]);
    &rev32(@vtmp[1],@data[2]);
    &rev32(@vtmp[0],@data[3]);
    &rev32(@data[3],@datax[0]);
    &rev32(@data[2],@datax[1]);
    &rev32(@data[1],@datax[2]);
    &rev32(@data[0],@datax[3]);
}

sub load_sbox () {
    my $data = shift;

$code.=<<___;
    adr    $ptr,.Lsbox
    ld1    {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
    ld1    {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
    ld1    {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
    ld1    {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
___
}


sub mov_reg_to_vec() {
    my $src0 = shift;
    my $src1 = shift;
    my $desv = shift;
$code.=<<___;
    mov    $desv.d[0],$src0
    mov    $desv.d[1],$src1
___
    &rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
    my $srcv = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov    $des0,$srcv.d[0]
    mov    $des1,$srcv.d[1]
___
}

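# compute_tweak advances an XTS tweak held in a pair of general-purpose
# registers: multiply by x in GF(2^128), i.e. shift the 128-bit value left by
# one bit and, if a carry falls out of the top, fold it back in with the
# reduction constant 0x87 (x^128 = x^7 + x^2 + x + 1).  compute_tweak_vec
# does the same byte-wise on a vector register using .Lxts_magic; for the
# "_gb" variant the value is bit-reversed (rbit) around the multiplication.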
sub compute_tweak() {
    my $src0 = shift;
    my $src1 = shift;
    my $des0 = shift;
    my $des1 = shift;
$code.=<<___;
    mov    $wtmp0,0x87
    extr    $xtmp2,$src1,$src1,#32
    extr    $des1,$src1,$src0,#63
    and    $wtmp1,$wtmp0,$wtmp2,asr#31
    eor    $des0,$xtmp1,$src0,lsl#1
___
}

sub compute_tweak_vec() {
    my $src = shift;
    my $des = shift;
    my $std = shift;
    &rbit(@vtmp[2],$src,$std);
$code.=<<___;
    ldr    @qtmp[0], .Lxts_magic
    shl    $des.16b, @vtmp[2].16b, #1
    ext    @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
    ushr    @vtmp[1].16b, @vtmp[1].16b, #7
    mul    @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
    eor    $des.16b, $des.16b, @vtmp[1].16b
___
    &rbit($des,$des,$std);
}

$code=<<___;
#include "arm_arch.h"
.arch    armv8-a
.text

.type    _vpsm4_consts,%object
.align    7
_vpsm4_consts:
.Lsbox:
    .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
    .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
    .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
    .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
    .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
    .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
    .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
    .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
    .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
    .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
    .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
    .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
    .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
    .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
    .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
    .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
    .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
    .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
    .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
    .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
    .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
    .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
    .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
    .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
    .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
    .quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
    .quad 0x0101010101010187,0x0101010101010101

.size    _vpsm4_consts,.-_vpsm4_consts
___

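# _vpsm4_set_key expands the 128-bit user key into the 32 round keys:
# rk[i] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]), where T' uses the same
# S-box as encryption but the key-schedule linear transform
# L'(B) = B ^ (B<<<13) ^ (B<<<23) (the ror #19/#9 below).  When generating a
# decryption schedule (w2 == 0) the keys are stored from the end of the
# buffer backwards, so the encryption round loop can be reused unchanged for
# decryption.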
{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type    _vpsm4_set_key,%function
.align    4
_vpsm4_set_key:
    AARCH64_VALID_CALL_TARGET
    ld1    {$vkey.4s},[$key]
___
    &load_sbox();
    &rev32($vkey,$vkey);
$code.=<<___;
    adr    $pointer,.Lshuffles
    ld1    {$vmap.2d},[$pointer]
    adr    $pointer,.Lfk
    ld1    {$vfk.2d},[$pointer]
    eor    $vkey.16b,$vkey.16b,$vfk.16b
    mov    $schedules,#32
    adr    $pointer,.Lck
    movi    @vtmp[0].16b,#64
    cbnz    $enc,1f
    add    $keys,$keys,124
1:
    mov    $wtmp,$vkey.s[1]
    ldr    $roundkey,[$pointer],#4
    eor    $roundkey,$roundkey,$wtmp
    mov    $wtmp,$vkey.s[2]
    eor    $roundkey,$roundkey,$wtmp
    mov    $wtmp,$vkey.s[3]
    eor    $roundkey,$roundkey,$wtmp
    // sbox lookup
    mov    @data[0].s[0],$roundkey
    tbl    @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
    sub    @data[0].16b,@data[0].16b,@vtmp[0].16b
    tbx    @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
    sub    @data[0].16b,@data[0].16b,@vtmp[0].16b
    tbx    @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
    sub    @data[0].16b,@data[0].16b,@vtmp[0].16b
    tbx    @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
    mov    $wtmp,@vtmp[1].s[0]
    eor    $roundkey,$wtmp,$wtmp,ror #19
    eor    $roundkey,$roundkey,$wtmp,ror #9
    mov    $wtmp,$vkey.s[0]
    eor    $roundkey,$roundkey,$wtmp
    mov    $vkey.s[0],$roundkey
    cbz    $enc,2f
    str    $roundkey,[$keys],#4
    b    3f
2:
    str    $roundkey,[$keys],#-4
3:
    tbl    $vkey.16b,{$vkey.16b},$vmap.16b
    subs    $schedules,$schedules,#1
    b.ne    1b
    ret
.size    _vpsm4_set_key,.-_vpsm4_set_key
___
}}}


{{{
$code.=<<___;
.type    _vpsm4_enc_4blks,%function
.align    4
_vpsm4_enc_4blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_4blks();
$code.=<<___;
    ret
.size    _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
___
}}}

{{{
$code.=<<___;
.type    _vpsm4_enc_8blks,%function
.align    4
_vpsm4_enc_8blks:
    AARCH64_VALID_CALL_TARGET
___
    &encrypt_8blks();
$code.=<<___;
    ret
.size    _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl    ${prefix}_set_encrypt_key
.type    ${prefix}_set_encrypt_key,%function
.align    5
${prefix}_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp    x29,x30,[sp,#-16]!
    mov    w2,1
    bl    _vpsm4_set_key
    ldp    x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size    ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl    ${prefix}_set_decrypt_key
.type    ${prefix}_set_decrypt_key,%function
.align    5
${prefix}_set_decrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp    x29,x30,[sp,#-16]!
    mov    w2,0
    bl    _vpsm4_set_key
    ldp    x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size    ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
    my $dir = shift;
    my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl    ${prefix}_${dir}crypt
.type    ${prefix}_${dir}crypt,%function
.align    5
${prefix}_${dir}crypt:
    AARCH64_VALID_CALL_TARGET
    ld1    {@data[0].4s},[$inp]
___
    &load_sbox();
    &rev32(@data[0],@data[0]);
$code.=<<___;
    mov    $rks,x2
___
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1    {@data[0].4s},[$outp]
    ret
.size    ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

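# ECB: blocks are handled eight at a time where possible.  ld4 de-interleaves
# the input so that each of @data[0..3] (and @datax[0..3]) holds the same
# word position of four different blocks, which is the layout the
# _vpsm4_enc_4blks/_vpsm4_enc_8blks cores expect; a tail of 1-3 blocks is
# loaded word by word into single lanes.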
{{{
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));

$code.=<<___;
.globl    ${prefix}_ecb_encrypt
.type    ${prefix}_ecb_encrypt,%function
.align    5
${prefix}_ecb_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    // convert length into blocks
    lsr    x2,x2,4
    stp    d8,d9,[sp,#-80]!
    stp    d10,d11,[sp,#16]
    stp    d12,d13,[sp,#32]
    stp    d14,d15,[sp,#48]
    stp    x29,x30,[sp,#64]
___
    &load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
    cmp    $blocks,#8
    b.lt    .Lecb_4_blocks_process
    ld4    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    ld4    {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
$code.=<<___;
    bl    _vpsm4_enc_8blks
    st4    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs    $blocks,$blocks,#8
    b.gt    .Lecb_8_blocks_process
    b    100f
.Lecb_4_blocks_process:
    cmp    $blocks,#4
    b.lt    1f
    ld4    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl    _vpsm4_enc_4blks
    st4    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub    $blocks,$blocks,#4
1:
    // process last block
    cmp    $blocks,#1
    b.lt    100f
    b.gt    1f
    ld1    {@data[0].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    st1    {@data[0].4s},[$outp]
    b    100f
1:  // process last 2 blocks
    ld4    {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
    ld4    {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
    cmp    $blocks,#2
    b.gt    1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl    _vpsm4_enc_4blks
    st4    {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4    {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
    b    100f
1:  // process last 3 blocks
    ld4    {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl    _vpsm4_enc_4blks
    st4    {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
    st4    {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
    st4    {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
    ldp    d10,d11,[sp,#16]
    ldp    d12,d13,[sp,#32]
    ldp    d14,d15,[sp,#48]
    ldp    x29,x30,[sp,#64]
    ldp    d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size    ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

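# CBC: encryption is inherently serial (each ciphertext block feeds the next
# input), so it is done one block at a time with the chaining value kept in a
# vector register.  Decryption has no such dependency and is done eight (or
# four) blocks at a time, xoring the saved ciphertext back in after the block
# transform.  The _vpsm4_enc_* cores are plain 32-round transforms, so they
# also decrypt when the caller supplies the reversed key schedule produced by
# the set_decrypt_key entry point.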
{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl    ${prefix}_cbc_encrypt
.type    ${prefix}_cbc_encrypt,%function
.align    5
${prefix}_cbc_encrypt:
    AARCH64_VALID_CALL_TARGET
    lsr    $len,$len,4
___
    &load_sbox();
$code.=<<___;
    cbz    $enc,.Ldec
    ld1    {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
    cmp    $blocks,#4
    b.lt    1f
    ld1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
    eor    @data[0].16b,@data[0].16b,$ivec0.16b
___
    &rev32(@data[1],@data[1]);
    &rev32(@data[0],@data[0]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &encrypt_1blk_norev(@data[0]);
$code.=<<___;
    eor    @data[1].16b,@data[1].16b,@data[0].16b
___
    &encrypt_1blk_norev(@data[1]);
    &rev32(@data[0],@data[0]);

$code.=<<___;
    eor    @data[2].16b,@data[2].16b,@data[1].16b
___
    &encrypt_1blk_norev(@data[2]);
    &rev32(@data[1],@data[1]);
$code.=<<___;
    eor    @data[3].16b,@data[3].16b,@data[2].16b
___
    &encrypt_1blk_norev(@data[3]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    orr    $ivec0.16b,@data[3].16b,@data[3].16b
    st1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs    $blocks,$blocks,#4
    b.ne    .Lcbc_4_blocks_enc
    b    2f
1:
    subs    $blocks,$blocks,#1
    b.lt    2f
    ld1    {@data[0].4s},[$inp],#16
    eor    $ivec0.16b,$ivec0.16b,@data[0].16b
___
    &rev32($ivec0,$ivec0);
    &encrypt_1blk($ivec0);
$code.=<<___;
    st1    {$ivec0.4s},[$outp],#16
    b    1b
2:
    // save back IV
    st1    {$ivec0.4s},[$ivp]
    ret

.Ldec:
    // decryption mode starts
    AARCH64_SIGN_LINK_REGISTER
    stp    d8,d9,[sp,#-80]!
    stp    d10,d11,[sp,#16]
    stp    d12,d13,[sp,#32]
    stp    d14,d15,[sp,#48]
    stp    x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
    cmp    $blocks,#8
    b.lt    1f
    ld4    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
    add    $ptr,$inp,#64
    ld4    {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],$data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],$datax[3]);
$code.=<<___;
    bl    _vpsm4_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);
$code.=<<___;
    ld1    {$ivec1.4s},[$ivp]
    ld1    {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    // note ivec1 and vtmpx[3] are reusing the same register
    // care needs to be taken to avoid conflict
    eor    @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    ld1    {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor    @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
    eor    @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
    // save back IV
    st1    {$vtmpx[3].4s}, [$ivp]
    eor    @data[0].16b,@data[0].16b,$datax[3].16b
    eor    @data[1].16b,@data[1].16b,@vtmpx[0].16b
    eor    @data[2].16b,@data[2].16b,@vtmpx[1].16b
    eor    @data[3].16b,$data[3].16b,@vtmpx[2].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs    $blocks,$blocks,#8
    b.gt    .Lcbc_8_blocks_dec
    b.eq    100f
1:
    ld1    {$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
    cmp    $blocks,#4
    b.lt    1f
    ld4    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],$data[3]);
$code.=<<___;
    bl    _vpsm4_enc_4blks
    ld1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor    @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    orr    $ivec1.16b,@data[3].16b,@data[3].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    eor    @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs    $blocks,$blocks,#4
    b.gt    .Lcbc_4_blocks_dec
    // save back IV
    st1    {@data[3].4s}, [$ivp]
    b    100f
1:  // last block
    subs    $blocks,$blocks,#1
    b.lt    100f
    b.gt    1f
    ld1    {@data[0].4s},[$inp],#16
    // save back IV
    st1    {$data[0].4s}, [$ivp]
___
    &rev32(@datax[0],@data[0]);
    &encrypt_1blk(@datax[0]);
$code.=<<___;
    eor    @datax[0].16b,@datax[0].16b,$ivec1.16b
    st1    {@datax[0].4s},[$outp],#16
    b    100f
1:  // last two blocks
    ld4    {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
    add    $ptr,$inp,#16
    ld4    {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
    subs    $blocks,$blocks,1
    b.gt    1f
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl    _vpsm4_enc_4blks
    ld1    {@data[0].4s,@data[1].4s},[$inp],#32
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor    @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save back IV
    st1    {@data[1].4s}, [$ivp]
    b    100f
1:  // last 3 blocks
    ld4    {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
$code.=<<___;
    bl    _vpsm4_enc_4blks
    ld1    {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &transpose(@vtmp,@datax);
$code.=<<___;
    eor    @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save back IV
    st1    {@data[2].4s}, [$ivp]
100:
    ldp    d10,d11,[sp,#16]
    ldp    d12,d13,[sp,#32]
    ldp    d14,d15,[sp,#48]
    ldp    x29,x30,[sp,#64]
    ldp    d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size    ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

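# CTR32: only the last 32-bit word of the counter block is incremented
# between blocks.  The three fixed words are broadcast with dup and the
# counter word is written lane by lane, so each iteration of the main loop
# produces four or eight keystream blocks in one go.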
{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl    ${prefix}_ctr32_encrypt_blocks
.type    ${prefix}_ctr32_encrypt_blocks,%function
.align    5
${prefix}_ctr32_encrypt_blocks:
    AARCH64_VALID_CALL_TARGET
    ld1    {$ivec.4s},[$ivp]
___
    &rev32($ivec,$ivec);
    &load_sbox();
$code.=<<___;
    cmp    $blocks,#1
    b.ne    1f
    // fast processing for one single block without
    // context saving overhead
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1    {@data[0].4s},[$inp]
    eor    @data[0].16b,@data[0].16b,$ivec.16b
    st1    {@data[0].4s},[$outp]
    ret
1:
    AARCH64_SIGN_LINK_REGISTER
    stp    d8,d9,[sp,#-80]!
    stp    d10,d11,[sp,#16]
    stp    d12,d13,[sp,#32]
    stp    d14,d15,[sp,#48]
    stp    x29,x30,[sp,#64]
    mov    $word0,$ivec.s[0]
    mov    $word1,$ivec.s[1]
    mov    $word2,$ivec.s[2]
    mov    $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
    cmp    $blocks,#4
    b.lt    1f
    dup    @data[0].4s,$word0
    dup    @data[1].4s,$word1
    dup    @data[2].4s,$word2
    mov    @data[3].s[0],$ctr
    add    $ctr,$ctr,#1
    mov    $data[3].s[1],$ctr
    add    $ctr,$ctr,#1
    mov    @data[3].s[2],$ctr
    add    $ctr,$ctr,#1
    mov    @data[3].s[3],$ctr
    add    $ctr,$ctr,#1
    cmp    $blocks,#8
    b.ge    .Lctr32_8_blocks_process
    bl    _vpsm4_enc_4blks
    ld4    {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    eor    @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor    @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    subs    $blocks,$blocks,#4
    b.ne    .Lctr32_4_blocks_process
    b    100f
.Lctr32_8_blocks_process:
    dup    @datax[0].4s,$word0
    dup    @datax[1].4s,$word1
    dup    @datax[2].4s,$word2
    mov    @datax[3].s[0],$ctr
    add    $ctr,$ctr,#1
    mov    $datax[3].s[1],$ctr
    add    $ctr,$ctr,#1
    mov    @datax[3].s[2],$ctr
    add    $ctr,$ctr,#1
    mov    @datax[3].s[3],$ctr
    add    $ctr,$ctr,#1
    bl    _vpsm4_enc_8blks
    ld4    {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
    ld4    {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
    eor    @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor    @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    eor    @data[0].16b,@data[0].16b,@datax[0].16b
    eor    @data[1].16b,@data[1].16b,@datax[1].16b
    eor    @data[2].16b,@data[2].16b,@datax[2].16b
    eor    @data[3].16b,@data[3].16b,@datax[3].16b
    st4    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st4    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs    $blocks,$blocks,#8
    b.ne    .Lctr32_4_blocks_process
    b    100f
1:  // last block processing
    subs    $blocks,$blocks,#1
    b.lt    100f
    b.gt    1f
    mov    $ivec.s[0],$word0
    mov    $ivec.s[1],$word1
    mov    $ivec.s[2],$word2
    mov    $ivec.s[3],$ctr
___
    &encrypt_1blk($ivec);
$code.=<<___;
    ld1    {@data[0].4s},[$inp]
    eor    @data[0].16b,@data[0].16b,$ivec.16b
    st1    {@data[0].4s},[$outp]
    b    100f
1:  // last 2 blocks processing
    dup    @data[0].4s,$word0
    dup    @data[1].4s,$word1
    dup    @data[2].4s,$word2
    mov    @data[3].s[0],$ctr
    add    $ctr,$ctr,#1
    mov    @data[3].s[1],$ctr
    subs    $blocks,$blocks,#1
    b.ne    1f
    bl    _vpsm4_enc_4blks
    ld4    {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4    {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    eor    @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor    @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4    {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4    {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    b    100f
1:  // last 3 blocks processing
    add    $ctr,$ctr,#1
    mov    @data[3].s[2],$ctr
    bl    _vpsm4_enc_4blks
    ld4    {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
    ld4    {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
    ld4    {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
    eor    @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
    eor    @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
    eor    @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
    eor    @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
    st4    {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
    st4    {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
    st4    {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
    ldp    d10,d11,[sp,#16]
    ldp    d12,d13,[sp,#32]
    ldp    d14,d15,[sp,#48]
    ldp    x29,x30,[sp,#64]
    ldp    d8,d9,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size    ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}

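# XTS: x3 and x4 carry two round-key schedules (block keys and tweak keys).
# The first tweak is obtained by encrypting the IV with the tweak schedule,
# after which tweaks are advanced with compute_tweak/compute_tweak_vec.  The
# "_gb" flavour generated below differs only in that the tweak is processed
# bit-reversed (rbit) around the GF(2^128) multiplication, which corresponds
# to the tweak bit ordering of the GB/T 17964 XTS variant.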
{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=@datax;

sub gen_xts_cipher() {
    my $std = shift;
$code.=<<___;
.globl    ${prefix}_xts_encrypt${std}
.type    ${prefix}_xts_encrypt${std},%function
.align    5
${prefix}_xts_encrypt${std}:
    AARCH64_SIGN_LINK_REGISTER
    stp    x15, x16, [sp, #-0x10]!
    stp    x17, x18, [sp, #-0x10]!
    stp    x19, x20, [sp, #-0x10]!
    stp    x21, x22, [sp, #-0x10]!
    stp    x23, x24, [sp, #-0x10]!
    stp    x25, x26, [sp, #-0x10]!
    stp    x27, x28, [sp, #-0x10]!
    stp    x29, x30, [sp, #-0x10]!
    stp    d8, d9, [sp, #-0x10]!
    stp    d10, d11, [sp, #-0x10]!
    stp    d12, d13, [sp, #-0x10]!
    stp    d14, d15, [sp, #-0x10]!
    mov    $rks1,x3
    mov    $rks2,x4
    mov    $enc,w6
    ld1    {@tweak[0].4s}, [$ivp]
    mov    $rks,$rks2
___
    &load_sbox();
    &rev32(@tweak[0],@tweak[0]);
    &encrypt_1blk(@tweak[0]);
$code.=<<___;
    mov    $rks,$rks1
    and    $remain,$len,#0x0F
    // convert length into blocks
    lsr    $blocks,$len,4
    cmp    $blocks,#1
    b.lt    .return${std}

    cmp    $remain,0
    // If the encryption/decryption length is a multiple of 16,
    // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
    b.eq    .xts_encrypt_blocks${std}

    // If the encryption/decryption length is not a multiple of 16,
    // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
    // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
    subs    $blocks,$blocks,#1
    b.eq    .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
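# Expand the initial tweak into eight consecutive tweaks.  They are kept and
# updated in general-purpose register pairs (@twx) so that the GF(2^128)
# doublings run on the integer pipeline alongside the NEON block transforms.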
    &rbit(@tweak[0],@tweak[0],$std);
    &rev32_armeb(@tweak[0],@tweak[0]);
    &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
    cmp    $blocks,#8
    b.lt    .Lxts_4_blocks_process${std}
___
    &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
    &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
    &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
    &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
    &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
    &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
    &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
    &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
$code.=<<___;
    ld1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@vtmp[0],@vtmp[0],$std);
    &rbit(@vtmp[1],@vtmp[1],$std);
    &rbit(@vtmp[2],@vtmp[2],$std);
    &rbit(@vtmp[3],@vtmp[3],$std);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @vtmp[0].16b
    eor    @data[1].16b, @data[1].16b, @vtmp[1].16b
    eor    @data[2].16b, @data[2].16b, @vtmp[2].16b
    eor    @data[3].16b, @data[3].16b, @vtmp[3].16b
    ld1    {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
    &rbit(@vtmpx[0],@vtmpx[0],$std);
    &rbit(@vtmpx[1],@vtmpx[1],$std);
    &rbit(@vtmpx[2],@vtmpx[2],$std);
    &rbit(@vtmpx[3],@vtmpx[3],$std);
$code.=<<___;
    eor    @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
    eor    @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
    eor    @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
    eor    @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &rev32(@datax[0],@datax[0]);
    &rev32(@datax[1],@datax[1]);
    &rev32(@datax[2],@datax[2]);
    &rev32(@datax[3],@datax[3]);
    &transpose(@data,@vtmp);
    &transpose(@datax,@vtmp);
$code.=<<___;
    bl    _${prefix}_enc_8blks
___
    &transpose(@vtmp,@datax);
    &transpose(@data,@datax);

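# Stage the eight tweaks that were just used into vector registers for the
# final xor, and at the same time advance @twx by eight positions for the
# next iteration of the loop.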
    &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
    &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
    &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
    &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
    &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
    &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
    &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
    &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
    &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
    &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
    &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
    &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
    &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
    &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
    &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
    &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
    eor    @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
    eor    @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
    eor    @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
    eor    @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
    eor    @data[0].16b, @data[0].16b, @tweak[0].16b
    eor    @data[1].16b, @data[1].16b, @tweak[1].16b
    eor    @data[2].16b, @data[2].16b, @tweak[2].16b
    eor    @data[3].16b, @data[3].16b, @tweak[3].16b

    // save the last tweak
    st1    {@tweak[3].4s},[$ivp]
    st1    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    st1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
    subs    $blocks,$blocks,#8
    b.gt    .Lxts_8_blocks_process${std}
    b    100f
.Lxts_4_blocks_process${std}:
___
    &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
    &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
    &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
    &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
$code.=<<___;
    cmp    $blocks,#4
    b.lt    1f
    ld1    {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
    &rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[0].16b
    eor    @data[1].16b, @data[1].16b, @tweak[1].16b
    eor    @data[2].16b, @data[2].16b, @tweak[2].16b
    eor    @data[3].16b, @data[3].16b, @tweak[3].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &rev32(@data[3],@data[3]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl    _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor    @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor    @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor    @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    eor    @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
    sub    $blocks,$blocks,#4
___
    &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
    &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
    &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
$code.=<<___;
    // save the last tweak
    st1    {@tweak[3].4s},[$ivp]
1:
    // process last block
    cmp    $blocks,#1
    b.lt    100f
    b.gt    1f
    ld1    {@data[0].4s},[$inp],#16
___
    &rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[0].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[0].16b
    st1    {@data[0].4s},[$outp],#16
    // save the last tweak
    st1    {@tweak[0].4s},[$ivp]
    b    100f
1:  // process last 2 blocks
    cmp    $blocks,#2
    b.gt    1f
    ld1    {@data[0].4s,@data[1].4s},[$inp],#32
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[0].16b
    eor    @data[1].16b, @data[1].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl    _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor    @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor    @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
    // save the last tweak
    st1    {@tweak[1].4s},[$ivp]
    b    100f
1:  // process last 3 blocks
    ld1    {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
    &rbit(@tweak[0],@tweak[0],$std);
    &rbit(@tweak[1],@tweak[1],$std);
    &rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[0].16b
    eor    @data[1].16b, @data[1].16b, @tweak[1].16b
    eor    @data[2].16b, @data[2].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &rev32(@data[1],@data[1]);
    &rev32(@data[2],@data[2]);
    &transpose(@data,@vtmp);
$code.=<<___;
    bl    _${prefix}_enc_4blks
___
    &transpose(@vtmp,@data);
$code.=<<___;
    eor    @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
    eor    @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
    eor    @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
    st1    {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
    // save the last tweak
    st1    {@tweak[2].4s},[$ivp]
100:
    cmp    $remain,0
    b.eq    .return${std}

// This branch calculates the last two tweaks,
// when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
    ld1    {@tweak[0].4s},[$ivp]
___
    &rev32_armeb(@tweak[0],@tweak[0]);
    &compute_tweak_vec(@tweak[0],@tweak[1],$std);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b    .check_dec${std}


// This branch calculates the last two tweaks,
// when the encryption/decryption length is exactly 32, which only needs two tweaks
.only_2blks_tweak${std}:
    mov    @tweak[1].16b,@tweak[0].16b
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
    b    .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
    // encryption:1 decryption:0
    cmp    $enc,1
    b.eq    .process_last_2blks${std}
    mov    @vtmp[0].16B,@tweak[1].16b
    mov    @tweak[1].16B,@tweak[2].16b
    mov    @tweak[2].16B,@vtmp[0].16b

.process_last_2blks${std}:
___
    &rev32_armeb(@tweak[1],@tweak[1]);
    &rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
    ld1    {@data[0].4s},[$inp],#16
    eor    @data[0].16b, @data[0].16b, @tweak[1].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[1].16b
    st1    {@data[0].4s},[$outp],#16

    sub    $lastBlk,$outp,16
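    // ciphertext stealing: swap the tail bytes of the block just written
    // with the remaining input bytes, then re-encrypt that last full block
    // with the final tweak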
.loop${std}:
    subs    $remain,$remain,1
    ldrb    $wtmp0,[$lastBlk,$remain]
    ldrb    $wtmp1,[$inp,$remain]
    strb    $wtmp1,[$lastBlk,$remain]
    strb    $wtmp0,[$outp,$remain]
    b.gt    .loop${std}
    ld1    {@data[0].4s}, [$lastBlk]
    eor    @data[0].16b, @data[0].16b, @tweak[2].16b
___
    &rev32(@data[0],@data[0]);
    &encrypt_1blk(@data[0]);
$code.=<<___;
    eor    @data[0].16b, @data[0].16b, @tweak[2].16b
    st1    {@data[0].4s}, [$lastBlk]
.return${std}:
    ldp    d14, d15, [sp], #0x10
    ldp    d12, d13, [sp], #0x10
    ldp    d10, d11, [sp], #0x10
    ldp    d8, d9, [sp], #0x10
    ldp    x29, x30, [sp], #0x10
    ldp    x27, x28, [sp], #0x10
    ldp    x25, x26, [sp], #0x10
    ldp    x23, x24, [sp], #0x10
    ldp    x21, x22, [sp], #0x10
    ldp    x19, x20, [sp], #0x10
    ldp    x17, x18, [sp], #0x10
    ldp    x15, x16, [sp], #0x10
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size    ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}
########################################
open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";