#! /usr/bin/env perl
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen) = ("x5");
my ($counter) = ("x6");
my ($counter_w) = ("w6");
my @xx=(7..22);
my @sxx=map("x$_",@xx);
my @sx=map("w$_",@xx);
my @K=map("x$_",(23..30));
my @elem=(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
my @KL=map("w$_",(23..30));
my @mx=map("z$_",@elem);
my @vx=map("v$_",@elem);
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");
my @tt=(17..24);
my @xt=map("z$_",@tt);
my @vt=map("v$_",@tt);
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my @bak=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],@xt[4],@xt[5],@xt[6],@xt[7],@xt[0],@xt[1],$zctr,@xt[2],@xt[3],$rot8);
my $debug_encoder=0;

sub SVE_ADD() {
    my $x = shift;
    my $y = shift;

$code.=<<___;
    add	@mx[$x].s,@mx[$x].s,@mx[$y].s
    .if mixin == 1
        add	@sx[$x],@sx[$x],@sx[$y]
    .endif
___
    if (@_) {
        &SVE_ADD(@_);
    }
}

sub SVE_EOR() {
    my $x = shift;
    my $y = shift;

$code.=<<___;
    eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
    .if mixin == 1
        eor	@sx[$x],@sx[$x],@sx[$y]
    .endif
___
    if (@_) {
        &SVE_EOR(@_);
    }
}

sub SVE_LSL() {
    my $bits = shift;
    my $x = shift;
    my $y = shift;
    my $next = $x + 1;

$code.=<<___;
    lsl	@xt[$x].s,@mx[$y].s,$bits
___
    if (@_) {
        &SVE_LSL($bits,$next,@_);
    }
}

sub SVE_LSR() {
    my $bits = shift;
    my $x = shift;

$code.=<<___;
    lsr	@mx[$x].s,@mx[$x].s,$bits
    .if mixin == 1
        ror	@sx[$x],@sx[$x],$bits
    .endif
___
    if (@_) {
        &SVE_LSR($bits,@_);
    }
}

sub SVE_ORR() {
    my $x = shift;
    my $y = shift;
    my $next = $x + 1;

$code.=<<___;
    orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
    if (@_) {
        &SVE_ORR($next,@_);
    }
}

sub SVE_REV16() {
    my $x = shift;

$code.=<<___;
    revh	@mx[$x].s,p0/m,@mx[$x].s
    .if mixin == 1
        ror	@sx[$x],@sx[$x],#16
    .endif
___
    if (@_) {
        &SVE_REV16(@_);
    }
}

sub SVE_ROT8() {
    my $x = shift;

$code.=<<___;
    tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
    .if mixin == 1
        ror	@sx[$x],@sx[$x],#24
    .endif
___
    if (@_) {
        &SVE_ROT8(@_);
    }
}
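# SVE2 provides XAR (exclusive-OR and rotate), which fuses the eor/rotate
# pair of the quarter round into one instruction.  XAR rotates right, so a
# left-rotate by $bits becomes a right-rotate by 32-$bits; the scalar
# "mixin" lane mirrors it with a separate eor + ror.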
sub SVE2_XAR() {
    my $bits = shift;
    my $x = shift;
    my $y = shift;
    my $rbits = 32-$bits;

$code.=<<___;
    .if mixin == 1
        eor	@sx[$x],@sx[$x],@sx[$y]
    .endif
    xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
    .if mixin == 1
        ror	@sx[$x],@sx[$x],$rbits
    .endif
___
    if (@_) {
        &SVE2_XAR($bits,@_);
    }
}

sub SVE2_QR_GROUP() {
    my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    &SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    &SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    &SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    &SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
}

sub SVE_QR_GROUP() {
    my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
    &SVE_REV16($d0,$d1,$d2,$d3);

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
    &SVE_LSL(12,0,$b0,$b1,$b2,$b3);
    &SVE_LSR(20,$b0,$b1,$b2,$b3);
    &SVE_ORR(0,$b0,$b1,$b2,$b3);

    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
    &SVE_ROT8($d0,$d1,$d2,$d3);

    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
    &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
    &SVE_LSL(7,0,$b0,$b1,$b2,$b3);
    &SVE_LSR(25,$b0,$b1,$b2,$b3);
    &SVE_ORR(0,$b0,$b1,$b2,$b3);
}

sub SVE_INNER_BLOCK() {
$code.=<<___;
    mov	$counter,#10
10:
.align 5
___
    &SVE_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
    &SVE_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
    sub	$counter,$counter,1
    cbnz	$counter,10b
___
}

sub SVE2_INNER_BLOCK() {
$code.=<<___;
    mov	$counter,#10
10:
.align 5
___
    &SVE2_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
    &SVE2_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
    sub	$counter,$counter,1
    cbnz	$counter,10b
___
}

sub load_regs() {
    my $offset = shift;
    my $reg = shift;
    my $next_offset = $offset + 1;
$code.=<<___;
    ld1w	{$reg.s},p0/z,[$inp,#$offset,MUL VL]
#ifdef __AARCH64EB__
    revb	$reg.s,p0/m,$reg.s
#endif
___
    if (@_) {
        &load_regs($next_offset, @_);
    } else {
$code.=<<___;
    addvl	$inp,$inp,$next_offset
___
    }
}

sub load() {
    if (@_) {
        &load_regs(0, @_);
    }
}

sub store_regs() {
    my $offset = shift;
    my $reg = shift;
    my $next_offset = $offset + 1;
$code.=<<___;
#ifdef __AARCH64EB__
    revb	$reg.s,p0/m,$reg.s
#endif
    st1w	{$reg.s},p0,[$outp,#$offset,MUL VL]
___
    if (@_) {
        &store_regs($next_offset, @_);
    } else {
$code.=<<___;
    addvl	$outp,$outp,$next_offset
___
    }
}

sub store() {
    if (@_) {
        &store_regs(0, @_);
    }
}
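# Transpose a 4x4 matrix of 32-bit elements (two such matrices per call):
# the .s zip1/zip2 pairs interleave 32-bit lanes, then the .d pairs
# interleave 64-bit lanes, which completes the transpose.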
sub transpose() {
    my $xa = shift;
    my $xb = shift;
    my $xc = shift;
    my $xd = shift;
    my $xa1 = shift;
    my $xb1 = shift;
    my $xc1 = shift;
    my $xd1 = shift;
$code.=<<___;
    zip1	@xt[0].s,$xa.s,$xb.s
    zip2	@xt[1].s,$xa.s,$xb.s
    zip1	@xt[2].s,$xc.s,$xd.s
    zip2	@xt[3].s,$xc.s,$xd.s

    zip1	@xt[4].s,$xa1.s,$xb1.s
    zip2	@xt[5].s,$xa1.s,$xb1.s
    zip1	@xt[6].s,$xc1.s,$xd1.s
    zip2	@xt[7].s,$xc1.s,$xd1.s

    zip1	$xa.d,@xt[0].d,@xt[2].d
    zip2	$xb.d,@xt[0].d,@xt[2].d
    zip1	$xc.d,@xt[1].d,@xt[3].d
    zip2	$xd.d,@xt[1].d,@xt[3].d

    zip1	$xa1.d,@xt[4].d,@xt[6].d
    zip2	$xb1.d,@xt[4].d,@xt[6].d
    zip1	$xc1.d,@xt[5].d,@xt[7].d
    zip2	$xd1.d,@xt[5].d,@xt[7].d
___
}

sub ACCUM() {
    my $idx0 = shift;
    my $idx1 = $idx0 + 1;
    my $x0 = @sx[$idx0];
    my $xx0 = @sxx[$idx0];
    my $x1 = @sx[$idx1];
    my $xx1 = @sxx[$idx1];
    my $d = $idx0/2;
    my ($tmp,$tmpw) = ($counter,$counter_w);
    my $bk0 = @_ ? shift : @bak[$idx0];
    my $bk1 = @_ ? shift : @bak[$idx1];

$code.=<<___;
    .if mixin == 1
        add	@sx[$idx0],@sx[$idx0],@KL[$d]
    .endif
    add	@mx[$idx0].s,@mx[$idx0].s,$bk0.s
    .if mixin == 1
        add	@sxx[$idx1],@sxx[$idx1],@K[$d],lsr #32
    .endif
    add	@mx[$idx1].s,@mx[$idx1].s,$bk1.s
    .if mixin == 1
        add	@sxx[$idx0],@sxx[$idx0],$sxx[$idx1],lsl #32	// pack
    .endif
___
}

sub SCA_INP() {
    my $idx0 = shift;
    my $idx1 = $idx0 + 2;
$code.=<<___;
    .if mixin == 1
        ldp	@sxx[$idx0],@sxx[$idx1],[$inp],#16
    .endif
___
}

sub SVE_ACCUM_STATES() {
    my ($tmp,$tmpw) = ($counter,$counter_w);

$code.=<<___;
    lsr	$tmp,@K[5],#32
    dup	@bak[10].s,@KL[5]
    dup	@bak[11].s,$tmpw
    lsr	$tmp,@K[6],#32
    dup	@bak[13].s,$tmpw
    lsr	$tmp,@K[7],#32
___
    &ACCUM(0);
    &ACCUM(2);
    &SCA_INP(1);
    &ACCUM(4);
    &ACCUM(6);
    &SCA_INP(5);
    &ACCUM(8);
    &ACCUM(10);
    &SCA_INP(9);
$code.=<<___;
    dup	@bak[14].s,@KL[7]
    dup	@bak[0].s,$tmpw	// bak[15] not available for SVE
___
    &ACCUM(12);
    &ACCUM(14, @bak[14],@bak[0]);
    &SCA_INP(13);
}

sub SVE2_ACCUM_STATES() {
    &ACCUM(0);
    &ACCUM(2);
    &SCA_INP(1);
    &ACCUM(4);
    &ACCUM(6);
    &SCA_INP(5);
    &ACCUM(8);
    &ACCUM(10);
    &SCA_INP(9);
    &ACCUM(12);
    &ACCUM(14);
    &SCA_INP(13);
}

sub SCA_EOR() {
    my $idx0 = shift;
    my $idx1 = $idx0 + 1;
$code.=<<___;
    .if mixin == 1
        eor	@sxx[$idx0],@sxx[$idx0],@sxx[$idx1]
    .endif
___
}

sub SCA_SAVE() {
    my $idx0 = shift;
    my $idx1 = shift;
$code.=<<___;
    .if mixin == 1
        stp	@sxx[$idx0],@sxx[$idx1],[$outp],#16
    .endif
___
}

sub SVE_VL128_TRANSFORMS() {
    &SCA_EOR(0);
    &SCA_EOR(2);
    &SCA_EOR(4);
    &transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
    &SCA_EOR(6);
    &SCA_EOR(8);
    &SCA_EOR(10);
    &transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
    &SCA_EOR(12);
    &SCA_EOR(14);
$code.=<<___;
    ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
    ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
    eor	$xa0.d,$xa0.d,@xt[0].d
    eor	$xb0.d,$xb0.d,@xt[1].d
    eor	$xc0.d,$xc0.d,@xt[2].d
    eor	$xd0.d,$xd0.d,@xt[3].d
    eor	$xa1.d,$xa1.d,@xt[4].d
    eor	$xb1.d,$xb1.d,@xt[5].d
    eor	$xc1.d,$xc1.d,@xt[6].d
    eor	$xd1.d,$xd1.d,@xt[7].d
    ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
    ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
___
    &SCA_SAVE(0,2);
$code.=<<___;
    eor	$xa2.d,$xa2.d,@xt[0].d
    eor	$xb2.d,$xb2.d,@xt[1].d
___
    &SCA_SAVE(4,6);
$code.=<<___;
    eor	$xc2.d,$xc2.d,@xt[2].d
    eor	$xd2.d,$xd2.d,@xt[3].d
___
    &SCA_SAVE(8,10);
$code.=<<___;
    eor	$xa3.d,$xa3.d,@xt[4].d
    eor	$xb3.d,$xb3.d,@xt[5].d
___
    &SCA_SAVE(12,14);
$code.=<<___;
    eor	$xc3.d,$xc3.d,@xt[6].d
    eor	$xd3.d,$xd3.d,@xt[7].d
    st1	{@vx[0].4s-@vx[12].4s},[$outp],#64
    st1	{@vx[1].4s-@vx[13].4s},[$outp],#64
    st1	{@vx[2].4s-@vx[14].4s},[$outp],#64
    st1	{@vx[3].4s-@vx[15].4s},[$outp],#64
___
}
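# SVE_TRANSFORMS: turn the keystream from lane order back into block byte
# order, XOR it with the input and store.  A 128-bit vector length takes the
# specialised path above (NEON ld1/st1); wider vectors go through four
# transposes and predicated ld1w/st1w.  In both cases the block counter
# @K[6] is advanced at label 210 by one per 32-bit lane (incw), plus one
# more up front for the scalar block when mixin == 1.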
sub SVE_TRANSFORMS() {
$code.=<<___;
#ifdef __AARCH64EB__
    rev	@sxx[0],@sxx[0]
    rev	@sxx[2],@sxx[2]
    rev	@sxx[4],@sxx[4]
    rev	@sxx[6],@sxx[6]
    rev	@sxx[8],@sxx[8]
    rev	@sxx[10],@sxx[10]
    rev	@sxx[12],@sxx[12]
    rev	@sxx[14],@sxx[14]
#endif
    .if mixin == 1
        add	@K[6],@K[6],#1
    .endif
    cmp	$veclen,4
    b.ne	200f
___
    &SVE_VL128_TRANSFORMS();
$code.=<<___;
    b	210f
200:
___
    &transpose($xa0,$xb0,$xc0,$xd0,$xa1,$xb1,$xc1,$xd1);
    &SCA_EOR(0);
    &SCA_EOR(2);
    &transpose($xa2,$xb2,$xc2,$xd2,$xa3,$xb3,$xc3,$xd3);
    &SCA_EOR(4);
    &SCA_EOR(6);
    &transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
    &SCA_EOR(8);
    &SCA_EOR(10);
    &transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
    &SCA_EOR(12);
    &SCA_EOR(14);
    &load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
$code.=<<___;
    eor	$xa0.d,$xa0.d,@xt[0].d
    eor	$xa1.d,$xa1.d,@xt[1].d
    eor	$xa2.d,$xa2.d,@xt[2].d
    eor	$xa3.d,$xa3.d,@xt[3].d
    eor	$xb0.d,$xb0.d,@xt[4].d
    eor	$xb1.d,$xb1.d,@xt[5].d
    eor	$xb2.d,$xb2.d,@xt[6].d
    eor	$xb3.d,$xb3.d,@xt[7].d
___
    &load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
    &SCA_SAVE(0,2);
$code.=<<___;
    eor	$xc0.d,$xc0.d,@xt[0].d
    eor	$xc1.d,$xc1.d,@xt[1].d
___
    &SCA_SAVE(4,6);
$code.=<<___;
    eor	$xc2.d,$xc2.d,@xt[2].d
    eor	$xc3.d,$xc3.d,@xt[3].d
___
    &SCA_SAVE(8,10);
$code.=<<___;
    eor	$xd0.d,$xd0.d,@xt[4].d
    eor	$xd1.d,$xd1.d,@xt[5].d
___
    &SCA_SAVE(12,14);
$code.=<<___;
    eor	$xd2.d,$xd2.d,@xt[6].d
    eor	$xd3.d,$xd3.d,@xt[7].d
___
    &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
    &store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
210:
    incw	@K[6], ALL, MUL #1
___
}

sub SET_STATE_BAK() {
    my $idx0 = shift;
    my $idx1 = $idx0 + 1;
    my $x0 = @sx[$idx0];
    my $xx0 = @sxx[$idx0];
    my $x1 = @sx[$idx1];
    my $xx1 = @sxx[$idx1];
    my $d = $idx0/2;

$code.=<<___;
    lsr	$xx1,@K[$d],#32
    dup	@mx[$idx0].s,@KL[$d]
    dup	@bak[$idx0].s,@KL[$d]
    .if mixin == 1
        mov	$x0,@KL[$d]
    .endif
    dup	@mx[$idx1].s,$x1
    dup	@bak[$idx1].s,$x1
___
}

sub SET_STATE() {
    my $idx0 = shift;
    my $idx1 = $idx0 + 1;
    my $x0 = @sx[$idx0];
    my $xx0 = @sxx[$idx0];
    my $x1 = @sx[$idx1];
    my $xx1 = @sxx[$idx1];
    my $d = $idx0/2;

$code.=<<___;
    lsr	$xx1,@K[$d],#32
    dup	@mx[$idx0].s,@KL[$d]
    .if mixin == 1
        mov	$x0,@KL[$d]
    .endif
    dup	@mx[$idx1].s,$x1
___
}

sub SVE_LOAD_STATES() {
    &SET_STATE_BAK(0);
    &SET_STATE_BAK(2);
    &SET_STATE_BAK(4);
    &SET_STATE_BAK(6);
    &SET_STATE_BAK(8);
    &SET_STATE(10);
    &SET_STATE(14);
$code.=<<___;
    .if mixin == 1
        add	@sx[13],@KL[6],#1
        mov	@sx[12],@KL[6]
        index	$zctr.s,@sx[13],1
        index	@mx[12].s,@sx[13],1
    .else
        index	$zctr.s,@KL[6],1
        index	@mx[12].s,@KL[6],1
    .endif
    lsr	@sxx[13],@K[6],#32
    dup	@mx[13].s,@sx[13]
___
}
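# SVE2_LOAD_STATES: like SVE_LOAD_STATES, but it backs up states 10-15 as
# well.  The SVE2 path can afford this because XAR needs neither the @xt
# scratch registers nor the .Lrot8 table, and the upper @bak slots alias
# exactly those registers; the plain-SVE path instead re-broadcasts them in
# SVE_ACCUM_STATES.  Throughout, the ".if mixin == 1" blocks interleave one
# extra 64-byte block computed in the scalar x-registers; the drivers below
# set mixin=1 whenever at least one full block remains beyond the
# vector-width batch.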
sub SVE2_LOAD_STATES() {
    &SET_STATE_BAK(0);
    &SET_STATE_BAK(2);
    &SET_STATE_BAK(4);
    &SET_STATE_BAK(6);
    &SET_STATE_BAK(8);
    &SET_STATE_BAK(10);
    &SET_STATE_BAK(14);

$code.=<<___;
    .if mixin == 1
        add	@sx[13],@KL[6],#1
        mov	@sx[12],@KL[6]
        index	$zctr.s,@sx[13],1
        index	@mx[12].s,@sx[13],1
    .else
        index	$zctr.s,@KL[6],1
        index	@mx[12].s,@KL[6],1
    .endif
    lsr	@sxx[13],@K[6],#32
    dup	@mx[13].s,@sx[13]
    dup	@bak[13].s,@sx[13]
___
}

sub chacha20_sve() {
    my ($tmp) = (@sxx[0]);

$code.=<<___;
.align 5
100:
    subs	$tmp,$len,$veclen,lsl #6
    b.lt	110f
    mov	$len,$tmp
    b.eq	101f
    cmp	$len,64
    b.lt	101f
    mixin=1
___
    &SVE_LOAD_STATES();
    &SVE_INNER_BLOCK();
    &SVE_ACCUM_STATES();
    &SVE_TRANSFORMS();
$code.=<<___;
    subs	$len,$len,64
    b.gt	100b
    b	110f
101:
    mixin=0
___
    &SVE_LOAD_STATES();
    &SVE_INNER_BLOCK();
    &SVE_ACCUM_STATES();
    &SVE_TRANSFORMS();
$code.=<<___;
110:
___
}

sub chacha20_sve2() {
    my ($tmp) = (@sxx[0]);

$code.=<<___;
.align 5
100:
    subs	$tmp,$len,$veclen,lsl #6
    b.lt	110f
    mov	$len,$tmp
    b.eq	101f
    cmp	$len,64
    b.lt	101f
    mixin=1
___
    &SVE2_LOAD_STATES();
    &SVE2_INNER_BLOCK();
    &SVE2_ACCUM_STATES();
    &SVE_TRANSFORMS();
$code.=<<___;
    subs	$len,$len,64
    b.gt	100b
    b	110f
101:
    mixin=0
___
    &SVE2_LOAD_STATES();
    &SVE2_INNER_BLOCK();
    &SVE2_ACCUM_STATES();
    &SVE_TRANSFORMS();
$code.=<<___;
110:
___
}


{{{
    my ($tmp,$tmpw) = ("x6", "w6");
    my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
    my ($sve2flag) = ("x7");

$code.=<<___;
#include "arm_arch.h"

.arch armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text
.align 5
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
	.word	0x02010003,0x04040404,0x02010003,0x04040404
.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align 5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	cmp	$len,$veclen,lsl #6
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adr	$tmp,.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	AARCH64_SIGN_LINK_REGISTER
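	// Save d8-d15 plus x16-x30 in a 192-byte frame: the scalar mixin
	// state lives in w7-w22 (@sx) and the key/counter words in
	// x23-x30 (@K), so nearly every preserved register is clobbered.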
	stp	d8,d9,[sp,-192]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x16,x17,[sp,64]
	stp	x18,x19,[sp,80]
	stp	x20,x21,[sp,96]
	stp	x22,x23,[sp,112]
	stp	x24,x25,[sp,128]
	stp	x26,x27,[sp,144]
	stp	x28,x29,[sp,160]
	str	x30,[sp,176]

	adr	$tmp,.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ptrues	p0.s,ALL
#ifdef __AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
	cbz	$sve2flag, 1f
___
    &chacha20_sve2();
$code.=<<___;
	b	2f
1:
___
    &chacha20_sve();
$code.=<<___;
2:
	str	@KL[6],[$ctr]
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x16,x17,[sp,64]
	ldp	x18,x19,[sp,80]
	ldp	x20,x21,[sp,96]
	ldp	x22,x23,[sp,112]
	ldp	x24,x25,[sp,128]
	ldp	x26,x27,[sp,144]
	ldp	x28,x29,[sp,160]
	ldr	x30,[sp,176]
	ldp	d8,d9,[sp],192
	AARCH64_VALIDATE_LINK_REGISTER
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}

########################################
{
my %opcode_unpred = (
	"movprfx"	=> 0x0420BC00,
	"eor"		=> 0x04a03000,
	"add"		=> 0x04200000,
	"orr"		=> 0x04603000,
	"lsl"		=> 0x04209C00,
	"lsr"		=> 0x04209400,
	"incw"		=> 0x04B00000,
	"xar"		=> 0x04203400,
	"zip1"		=> 0x05206000,
	"zip2"		=> 0x05206400,
	"uzp1"		=> 0x05206800,
	"uzp2"		=> 0x05206C00,
	"index"		=> 0x04204C00,
	"mov"		=> 0x05203800,
	"dup"		=> 0x05203800,
	"cntw"		=> 0x04A0E000,
	"tbl"		=> 0x05203000);

my %opcode_imm_unpred = (
	"dup"		=> 0x2538C000,
	"index"		=> 0x04204400);

my %opcode_scalar_pred = (
	"mov"		=> 0x0528A000,
	"cpy"		=> 0x0528A000,
	"st4w"		=> 0xE5606000,
	"st1w"		=> 0xE5004000,
	"ld1w"		=> 0xA5404000);

my %opcode_gather_pred = (
	"ld1w"		=> 0x85204000);

my %opcode_pred = (
	"eor"		=> 0x04190000,
	"add"		=> 0x04000000,
	"orr"		=> 0x04180000,
	"whilelo"	=> 0x25200C00,
	"whilelt"	=> 0x25200400,
	"cntp"		=> 0x25208000,
	"addvl"		=> 0x04205000,
	"lsl"		=> 0x04038000,
	"lsr"		=> 0x04018000,
	"sel"		=> 0x0520C000,
	"mov"		=> 0x0520C000,
	"ptrue"		=> 0x2518E000,
	"pfalse"	=> 0x2518E400,
	"ptrues"	=> 0x2519E000,
	"pnext"		=> 0x2519C400,
	"ld4w"		=> 0xA560E000,
	"st4w"		=> 0xE570E000,
	"st1w"		=> 0xE500E000,
	"ld1w"		=> 0xA540A000,
	"ld1rw"		=> 0x8540C000,
	"lasta"		=> 0x0520A000,
	"revh"		=> 0x05258000,
	"revb"		=> 0x05248000);

my %tsize = (
	'b'		=> 0,
	'h'		=> 1,
	's'		=> 2,
	'd'		=> 3);

my %sf = (
	"w"		=> 0,
	"x"		=> 1);

my %pattern = (
	"POW2"		=> 0,
	"VL1"		=> 1,
	"VL2"		=> 2,
	"VL3"		=> 3,
	"VL4"		=> 4,
	"VL5"		=> 5,
	"VL6"		=> 6,
	"VL7"		=> 7,
	"VL8"		=> 8,
	"VL16"		=> 9,
	"VL32"		=> 10,
	"VL64"		=> 11,
	"VL128"		=> 12,
	"VL256"		=> 13,
	"MUL4"		=> 29,
	"MUL3"		=> 30,
	"ALL"		=> 31);
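# Debug facility: with $debug_encoder set, every hand-assembled instruction
# is cross-checked against a real toolchain.  create_verifier() writes a
# helper script that feeds a single instruction through an SVE2-capable gcc
# (gcc-10 or later) and extracts its encoding from the objdump output;
# verify_inst() then compares that with the locally computed opcode and
# flags any mismatch inline in the generated file.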
sub create_verifier {
    my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
    open(FH, '>', $filename) or die $!;
    print FH $scripts;
    close(FH);
    system("chmod a+x ./compile_sve.sh");
}

sub compile_sve {
    return `./compile_sve.sh '@_'`
}

sub verify_inst {
    my ($code,$inst)=@_;
    my $hexcode = (sprintf "%08x", $code);

    if ($debug_encoder == 1) {
        my $expect=&compile_sve($inst);
        if ($expect ne $hexcode) {
            return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
        }
    }
    return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

sub reg_code {
    my $code = shift;

    # "eq", not "==": a numeric compare would also match register 0,
    # since "zr" numifies to 0.
    if ($code eq "zr") {
        return "31";
    }
    return $code;
}

sub encode_size_imm() {
    my ($mnemonic, $isize, $const)=@_;
    my $esize = (8<<$tsize{$isize});
    my $tsize_imm = $esize + $const;

    if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
        $tsize_imm = 2*$esize - $const;
    }
    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

sub encode_shift_pred() {
    my ($mnemonic, $isize, $const)=@_;
    my $esize = (8<<$tsize{$isize});
    my $tsize_imm = $esize + $const;

    if ($mnemonic eq "lsr") {
        $tsize_imm = 2*$esize - $const;
    }
    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}
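# Translators from SVE assembly text to raw opcodes.  Every SVE/SVE2
# instruction is emitted as a ".inst" word so the generated file assembles
# even with toolchains that do not understand SVE mnemonics.  sve_unpred
# handles the unpredicated forms, sve_pred the predicated ones and
# sve_other the leftovers.  For example, "add z0.s,z0.s,z4.s" takes the
# unpredicated z,z,z path and encodes as
#   0x04200000 | (size 2)<<22 | (Zm 4)<<16  =  .inst 0x04a40000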
sub sve_unpred {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
        return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
                            $inst)
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
        my $regd = $1;
        my $isize = $2;
        my $regs=$3;

        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
                && ((8<<$tsize{$isize}) > $2)) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
                                    $inst);
            }
        } elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
        } elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
        } elsif ($regs =~ m/[wx]([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
        } else {
            my $encoded_size = 0;
            if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
                $encoded_size = ($tsize{$isize}<<22);
            }
            if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
                $1 == $regd) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
            } elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
                return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
            }
        }
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
        return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
                            $inst)
    }
    sprintf "%s // fail to parse", $inst;
}

sub sve_pred {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
        my $zt = $1;
        my $size = $tsize{$2};
        my $pg = $3;
        my $addr = $5;
        my $xn = 31;

        if ($addr =~ m/x([0-9]+)\s*/o) {
            $xn = $1;
        }

        if ($mnemonic =~m/ld1r[bhwd]/o) {
            $size = 0;
        }
        if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
            return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
            my $xs = ($2 eq "SXTW") ? 1 : 0;
            return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
        } else {
            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
        }
    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
        my $regd = $1;
        my $isize = $2;
        my $pg = $3;
        my $mod = $4;
        my $regs = $5;

        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
                && $regd == $1
                && $mod eq 'm'
                && ((8<<$tsize{$isize}) > $2)) {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
            }
        } elsif($regs =~ m/[wx]([0-9]+)/o) {
            return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
        } elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
            if ($mnemonic eq "sel") {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
            } elsif ($mnemonic eq "mov") {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
            } elsif (length $2 > 0) {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
            } else {
                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
            }
        }
    } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
        my $pg = $1;
        my $isize = $2;
        my $regs = $3;

        if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
        } elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
        } else {
            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
        }
    } elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
    }

    sprintf "%s // fail to parse", $inst;
}
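# sve_other picks up what the two parsers above do not: predicate counting
# (cntp), vector-length arithmetic (addvl, cnt[bhdw]/inc[bhdw]) and
# movprfx/lasta, matching the dispatch table in the output loop at the
# bottom of the file.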
sub sve_other {
    my ($mnemonic,$arg)=@_;
    my $inst = (sprintf "%s %s", $mnemonic,$arg);

    if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
    } elsif ($arg =~ m/(x|w)([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*z([0-9]+)\.([bhsd])/o) {
        return &verify_inst($opcode_pred{$mnemonic}|($tsize{$5}<<22)|$1|($3<<10)|($4<<5)|$2, $inst);
    } elsif ($mnemonic =~ /inc[bhdw]/) {
        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16)|0xE000, $inst);
        } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16)|0xC000, $inst);
        } elsif ($arg =~ m/x([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16)|0xE000, $inst);
        }
    } elsif ($mnemonic =~ /cnt[bhdw]/) {
        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
        }
    } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
        return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
    } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
        return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
    }
    sprintf "%s // fail to parse", $inst;
}
}

open SELF,$0;
while(<SELF>) {
    next if (/^#!/);
    last if (!s/^#/\/\// and !/^$/);
    print;
}
close SELF;

if ($debug_encoder == 1) {
    &create_verifier();
}

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
    s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
    s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
    s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
    s/\b(movprfx|lasta|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z|w).*)/sve_other($1,$2)/ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";