#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the perlasm translator next to this script (or in the shared
# perlasm directory) and pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Any undefined &name(...) call becomes an assembler line appended to $code:
# '_' in the sub name turns into '.', and a purely numeric last argument is
# given a '#' immediate prefix.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Scalar register map.  x0..x4 are the function arguments of
# ChaCha20_ctr32_sve(out, in, len, key, counter).
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
my ($sve2flag) = ("x7");
my ($wctr, $xctr) = ("w8", "x8");
my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
my ($tmp,$tmpw) = ("x10", "w10");       # NOTE: aliases $tmpw1/$tmp1 (x10/w10)
my ($counter) = ("x11");
# @K holds the 4 constant + key + counter/nonce words as 8 64-bit pairs;
# @KL are the corresponding low 32-bit views.
my @K=map("x$_",(12..15,19..22));
my @KL=map("w$_",(12..15,19..22));
# z0-z15 carry the 16 ChaCha state words, one state lane per vector element.
my @mx=map("z$_",(0..15));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");                   # per-lane block counter vector
my @xt=map("z$_",(17..24));             # scratch vectors
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");                   # tbl byte-permutation for rotate-by-8
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
# in SVE mode we can only use bak0 ~ bak9 (the rest used as scratch register)
# in SVE2 we use all 15 backup register
my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
my $debug_encoder=0;

# Each SVE_* helper below emits one instruction per (x,y) index pair taken
# from @_ and recurses until the argument list is exhausted, interleaving
# four quarter-rounds for instruction-level parallelism.

# a += b  (32-bit lanes) for each (x,y) pair.
sub SVE_ADD() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
___
	if (@_) {
		&SVE_ADD(@_);
	}
}

# x ^= y for each (x,y) pair.
sub SVE_EOR() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
___
	if (@_) {
		&SVE_EOR(@_);
	}
}

# xt[x] = mx[y] << bits; $x indexes the scratch bank and advances with
# each recursion so SVE_ORR can pick the partial results back up.
sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
	if (@_) {
		&SVE_LSL($bits,$next,@_);
	}
}

# mx[x] >>= bits (logical).  Combined with SVE_LSL/SVE_ORR this emulates a
# rotate on plain SVE, which has no vector rotate instruction.
sub SVE_LSR() {
	my $bits = shift;
	my $x = shift;

$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
___
	if (@_) {
		&SVE_LSR($bits,@_);
	}
}

# mx[y] |= xt[x] — merges the left-shifted half produced by SVE_LSL.
sub SVE_ORR() {
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
	if (@_) {
		&SVE_ORR($next,@_);
	}
}

# Rotate each 32-bit lane by 16: revh swaps the two 16-bit halves.
sub SVE_REV16() {
	my $x = shift;

$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
___
	if (@_) {
		&SVE_REV16(@_);
	}
}

# Rotate each 32-bit lane by 8 via a tbl byte shuffle ($rot8 holds the
# permutation loaded from .Lrot8).
sub SVE_ROT8() {
	my $x = shift;

$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
___
	if (@_) {
		&SVE_ROT8(@_);
	}
}

# SVE2 fused XOR-and-rotate: x = (x ^ y) rotl bits, expressed as xar with
# a right-rotate amount of 32-bits.
sub SVE2_XAR() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $rbits = 32-$bits;

$code.=<<___;
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
___
	if (@_) {
		&SVE2_XAR($bits,@_);
	}
}

# Four interleaved ChaCha quarter-rounds over the (a,b,c,d) index groups.
# With SVE2 each "xor; rotate" pair collapses into a single xar; plain SVE
# emulates the rotates (16 via revh, 12/7 via shift+or, 8 via tbl).
sub SVE_QR_GROUP() {
	my $have_sve2 = shift;
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	if ($have_sve2 == 0) {
		&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
		&SVE_REV16($d0,$d1,$d2,$d3);
	} else {
		&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	}

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	if ($have_sve2 == 0) {
		&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
		&SVE_LSL(12,0,$b0,$b1,$b2,$b3);
		&SVE_LSR(20,$b0,$b1,$b2,$b3);
		&SVE_ORR(0,$b0,$b1,$b2,$b3,);
	} else {
		&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	}

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	if ($have_sve2 == 0) {
		&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
		&SVE_ROT8($d0,$d1,$d2,$d3);
	} else {
		&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	}

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	if ($have_sve2 == 0) {
		&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
		&SVE_LSL(7,0,$b0,$b1,$b2,$b3);
		&SVE_LSR(25,$b0,$b1,$b2,$b3);
		&SVE_ORR(0,$b0,$b1,$b2,$b3);
	} else {
		&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	}
}

# 20 ChaCha rounds = 10 iterations of (column round, diagonal round),
# plain-SVE flavour.
sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
1:
.align	5
___
	&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	subs	$counter,$counter,1
	b.ne	1b
___
}

# Same double-round loop, SVE2 flavour (uses xar).
sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
1:
.align	5
___
	&SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	subs	$counter,$counter,1
	b.ne	1b
___
}

# Load 8 consecutive vector-length chunks of plaintext and advance $inp.
sub load() {
	my $x0 = shift;
	my $x1 = shift;
	my $x2 = shift;
	my $x3 = shift;
	my $x4 = shift;
	my $x5 = shift;
	my $x6 = shift;
	my $x7 = shift;

$code.=<<___;
	ld1w	{$x0.s},p0/z,[$inp]
	ld1w	{$x1.s},p0/z,[$inp, #1, MUL VL]
	ld1w	{$x2.s},p0/z,[$inp, #2, MUL VL]
	ld1w	{$x3.s},p0/z,[$inp, #3, MUL VL]
	ld1w	{$x4.s},p0/z,[$inp, #4, MUL VL]
	ld1w	{$x5.s},p0/z,[$inp, #5, MUL VL]
	ld1w	{$x6.s},p0/z,[$inp, #6, MUL VL]
	ld1w	{$x7.s},p0/z,[$inp, #7, MUL VL]
	addvl	$inp,$inp,#8
___
}
# Store 8 consecutive vector-length chunks of ciphertext and advance $outp
# (mirror of load()).
sub store() {
	my $x0 = shift;
	my $x1 = shift;
	my $x2 = shift;
	my $x3 = shift;
	my $x4 = shift;
	my $x5 = shift;
	my $x6 = shift;
	my $x7 = shift;

$code.=<<___;
	st1w	{$x0.s},p0,[$outp]
	st1w	{$x1.s},p0,[$outp, #1, MUL VL]
	st1w	{$x2.s},p0,[$outp, #2, MUL VL]
	st1w	{$x3.s},p0,[$outp, #3, MUL VL]
	st1w	{$x4.s},p0,[$outp, #4, MUL VL]
	st1w	{$x5.s},p0,[$outp, #5, MUL VL]
	st1w	{$x6.s},p0,[$outp, #6, MUL VL]
	st1w	{$x7.s},p0,[$outp, #7, MUL VL]
	addvl	$outp,$outp,#8
___
}

# 4x4 transpose of 32-bit lanes across four vectors, via zip1/zip2 on
# .s then .d granularity ($xt0-$xt3 are scratch).
sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;

$code.=<<___;
	zip1	$xt0.s,$xa.s,$xb.s
	zip2	$xt1.s,$xa.s,$xb.s
	zip1	$xt2.s,$xc.s,$xd.s
	zip2	$xt3.s,$xc.s,$xd.s
	zip1	$xa.d,$xt0.d,$xt2.d
	zip2	$xb.d,$xt0.d,$xt2.d
	zip1	$xc.d,$xt1.d,$xt3.d
	zip2	$xd.d,$xt1.d,$xt3.d
___
}

# Add the initial state back into the working state (plain-SVE flavour).
# Only lanes 0-9 were kept in backup registers, so lanes 10/11/13/14/15
# are re-broadcast from the @K scalars here; lane 12 adds the counter
# vector $zctr.
sub SVE_ADD_STATES() {
$code.=<<___;
	lsr	$tmp1,@K[5],#32
	dup	$xt0.s,@KL[5]
	dup	$xt1.s,$tmpw1
	add	@mx[0].s,@mx[0].s,$bak0.s
	add	@mx[1].s,@mx[1].s,$bak1.s
	add	@mx[2].s,@mx[2].s,$bak2.s
	add	@mx[3].s,@mx[3].s,$bak3.s
	add	@mx[4].s,@mx[4].s,$bak4.s
	add	@mx[5].s,@mx[5].s,$bak5.s
	add	@mx[6].s,@mx[6].s,$bak6.s
	add	@mx[7].s,@mx[7].s,$bak7.s
	add	@mx[8].s,@mx[8].s,$bak8.s
	add	@mx[9].s,@mx[9].s,$bak9.s
	lsr	$tmp0,@K[6],#32
	dup	$xt4.s,$tmpw0
	lsr	$tmp1,@K[7],#32
	dup	$xt5.s,@KL[7]
	dup	$xt6.s,$tmpw1
	add	@mx[10].s,@mx[10].s,$xt0.s
	add	@mx[11].s,@mx[11].s,$xt1.s
	add	@mx[12].s,@mx[12].s,$zctr.s
	add	@mx[13].s,@mx[13].s,$xt4.s
	add	@mx[14].s,@mx[14].s,$xt5.s
	add	@mx[15].s,@mx[15].s,$xt6.s
___
}

# Add the initial state back (SVE2 flavour): all lanes except the counter
# were preserved in backup registers.
sub SVE2_ADD_STATES() {
$code.=<<___;
	add	@mx[0].s,@mx[0].s,$bak0.s
	add	@mx[1].s,@mx[1].s,$bak1.s
	add	@mx[2].s,@mx[2].s,$bak2.s
	add	@mx[3].s,@mx[3].s,$bak3.s
	add	@mx[4].s,@mx[4].s,$bak4.s
	add	@mx[5].s,@mx[5].s,$bak5.s
	add	@mx[6].s,@mx[6].s,$bak6.s
	add	@mx[7].s,@mx[7].s,$bak7.s
	add	@mx[8].s,@mx[8].s,$bak8.s
	add	@mx[9].s,@mx[9].s,$bak9.s
	add	@mx[10].s,@mx[10].s,$bak10.s
	add	@mx[11].s,@mx[11].s,$bak11.s
	add	@mx[12].s,@mx[12].s,$zctr.s
	add	@mx[13].s,@mx[13].s,$bak13.s
	add	@mx[14].s,@mx[14].s,$bak14.s
	add	@mx[15].s,@mx[15].s,$bak15.s
___
}

# Rearrange the keystream from lane-sliced to block order, XOR it with the
# input, store the result, and step the scalar/vector block counters by
# the vector length (number of 32-bit lanes).
sub SVE_TRANSFORMS() {
	&transpose($xa0,$xb0,$xc0,$xd0);
	&transpose($xa1,$xb1,$xc1,$xd1);
	&transpose($xa2,$xb2,$xc2,$xd2);
	&transpose($xa3,$xb3,$xc3,$xd3);
	&transpose($xa0,$xa1,$xa2,$xa3);
	&transpose($xb0,$xb1,$xb2,$xb3);
	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
	eor	$xa0.d,$xa0.d,$xt0.d
	eor	$xa1.d,$xa1.d,$xt1.d
	eor	$xa2.d,$xa2.d,$xt2.d
	eor	$xa3.d,$xa3.d,$xt3.d
	eor	$xb0.d,$xb0.d,$xt4.d
	eor	$xb1.d,$xb1.d,$xt5.d
	eor	$xb2.d,$xb2.d,$xt6.d
	eor	$xb3.d,$xb3.d,$xt7.d
___
	&transpose($xc0,$xc1,$xc2,$xc3);
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&transpose($xd0,$xd1,$xd2,$xd3);
	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
	eor	$xc0.d,$xc0.d,$xt0.d
	eor	$xc1.d,$xc1.d,$xt1.d
	eor	$xc2.d,$xc2.d,$xt2.d
	eor	$xc3.d,$xc3.d,$xt3.d
	eor	$xd0.d,$xd0.d,$xt4.d
	eor	$xd1.d,$xd1.d,$xt5.d
	eor	$xd2.d,$xd2.d,$xt6.d
	eor	$xd3.d,$xd3.d,$xt7.d
___
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
	incw	$xctr, ALL, MUL #1
	incw	$zctr.s, ALL, MUL #1
___
}

# Broadcast the constants/key/nonce words from @K into the 16 state
# vectors, backing up lanes 0-9 only (plain-SVE register budget); the
# per-lane counter vector becomes lane 12.
sub SVE_LOAD_STATES() {
$code.=<<___;
	lsr	$tmp0,@K[0],#32
	dup	@mx[0].s,@KL[0]
	dup	$bak0.s,@KL[0]
	dup	@mx[1].s,$tmpw0
	dup	$bak1.s,$tmpw0
	lsr	$tmp1,@K[1],#32
	dup	@mx[2].s,@KL[1]
	dup	$bak2.s,@KL[1]
	dup	@mx[3].s,$tmpw1
	dup	$bak3.s,$tmpw1
	lsr	$tmp0,@K[2],#32
	dup	@mx[4].s,@KL[2]
	dup	$bak4.s,@KL[2]
	dup	@mx[5].s,$tmpw0
	dup	$bak5.s,$tmpw0
	lsr	$tmp1,@K[3],#32
	dup	@mx[6].s,@KL[3]
	dup	$bak6.s,@KL[3]
	dup	@mx[7].s,$tmpw1
	dup	$bak7.s,$tmpw1
	lsr	$tmp0,@K[4],#32
	dup	@mx[8].s,@KL[4]
	dup	$bak8.s,@KL[4]
	dup	@mx[9].s,$tmpw0
	dup	$bak9.s,$tmpw0
	lsr	$tmp1,@K[5],#32
	dup	@mx[10].s,@KL[5]
	dup	@mx[11].s,$tmpw1
	orr	@mx[12].d,$zctr.d,$zctr.d
	lsr	$tmp0,@K[6],#32
	dup	@mx[13].s,$tmpw0
	lsr	$tmp1,@K[7],#32
	dup	@mx[14].s,@KL[7]
	dup	@mx[15].s,$tmpw1
___
}

# Same as SVE_LOAD_STATES but with the full SVE2 backup-register set, so
# every non-counter lane is also duplicated into its $bakN register.
sub SVE2_LOAD_STATES() {
$code.=<<___;
	lsr	$tmp0,@K[0],#32
	dup	@mx[0].s,@KL[0]
	dup	$bak0.s,@KL[0]
	dup	@mx[1].s,$tmpw0
	dup	$bak1.s,$tmpw0
	lsr	$tmp1,@K[1],#32
	dup	@mx[2].s,@KL[1]
	dup	$bak2.s,@KL[1]
	dup	@mx[3].s,$tmpw1
	dup	$bak3.s,$tmpw1
	lsr	$tmp0,@K[2],#32
	dup	@mx[4].s,@KL[2]
	dup	$bak4.s,@KL[2]
	dup	@mx[5].s,$tmpw0
	dup	$bak5.s,$tmpw0
	lsr	$tmp1,@K[3],#32
	dup	@mx[6].s,@KL[3]
	dup	$bak6.s,@KL[3]
	dup	@mx[7].s,$tmpw1
	dup	$bak7.s,$tmpw1
	lsr	$tmp0,@K[4],#32
	dup	@mx[8].s,@KL[4]
	dup	$bak8.s,@KL[4]
	dup	@mx[9].s,$tmpw0
	dup	$bak9.s,$tmpw0
	lsr	$tmp1,@K[5],#32
	dup	@mx[10].s,@KL[5]
	dup	$bak10.s,@KL[5]
	dup	@mx[11].s,$tmpw1
	dup	$bak11.s,$tmpw1
	orr	@mx[12].d,$zctr.d,$zctr.d
	lsr	$tmp0,@K[6],#32
	dup	@mx[13].s,$tmpw0
	dup	$bak13.s,$tmpw0
	lsr	$tmp1,@K[7],#32
	dup	@mx[14].s,@KL[7]
	dup	$bak14.s,@KL[7]
	dup	@mx[15].s,$tmpw1
	dup	$bak15.s,$tmpw1
___
}

# One batch of blocks: runtime-dispatch on $sve2flag between the SVE2 and
# plain-SVE load/rounds/add sequences, then the common transform/store.
sub sve_handle_blocks() {
$code.=<<___;
	cbz	$sve2flag,.sve_inner
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ADD_STATES();
$code.=<<___;
	b	.fini_inner
.sve_inner:
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ADD_STATES();
$code.=<<___;
.fini_inner:
___
	&SVE_TRANSFORMS();
}

# Main processing loop: handle $veclen blocks per iteration, leave any
# remainder (< vector length) to the caller's fallback path.
sub chacha20_process() {
$code.=<<___;
.align	5
.Loop:
	cmp	$blocks,$veclen
	b.lt	.Lexit
___
	&sve_handle_blocks();
$code.=<<___;
	subs	$blocks,$blocks,$veclen
	b.gt	.Loop
.Lexit:
___
}

{{{
$code.=<<___;
#include "arm_arch.h"

.arch   armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text
.align	5
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
	.word	0x02010003,0x04040404,0x02010003,0x04040404
.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	lsr	$blocks,$len,#6
	cmp	$blocks,$veclen
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adr	$tmp,.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	stp	d8,d9,[sp,-96]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x19,x20,[sp,64]
	stp	x21,x22,[sp,80]
	adr	$tmp,.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ldr	$wctr,[$ctr]
	index	$zctr.s,$wctr,1
	ptrues	p0.s,ALL
#ifdef	__AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
___
	&chacha20_process();
$code.=<<___;
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x19,x20,[sp,64]
	ldp	x21,x22,[sp,80]
	ldp	d8,d9,[sp],96
	str	$wctr,[$ctr]
	and	$len,$len,#63
	add	$len,$len,$blocks,lsl #6
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}

########################################
# Hand-rolled SVE/SVE2 instruction encoders.  Older assemblers do not know
# SVE, so each mnemonic is pattern-matched and emitted as a raw .inst word
# built from the opcode tables below.
{
my %opcode_unpred = (
	"movprfx" => 0x0420BC00,
	"eor" => 0x04a03000,
	"add" => 0x04200000,
	"orr" => 0x04603000,
	"lsl" => 0x04209C00,
	"lsr" => 0x04209400,
	"incw" => 0x04B0C000,
	"xar" => 0x04203400,
	"zip1" => 0x05206000,
	"zip2" => 0x05206400,
	"uzp1" => 0x05206800,
	"uzp2" => 0x05206C00,
	"index" => 0x04204C00,
	"mov" => 0x05203800,
	"dup" => 0x05203800,
	"cntw" => 0x04A0E000,
	"tbl" => 0x05203000);

my %opcode_imm_unpred = (
	"dup" => 0x2538C000,
	"index" => 0x04204400);

my %opcode_scalar_pred = (
	"mov" => 0x0528A000,
	"cpy" => 0x0528A000,
	"st4w" => 0xE5606000,
	"st1w" => 0xE5004000,
	"ld1w" => 0xA5404000);

my %opcode_gather_pred = (
	"ld1w" => 0x85204000);

my %opcode_pred = (
	"eor" => 0x04190000,
	"add" => 0x04000000,
	"orr" => 0x04180000,
	"whilelo" => 0x25200C00,
	"whilelt" => 0x25200400,
	"cntp" => 0x25208000,
	"addvl" => 0x04205000,
	"lsl" => 0x04038000,
	"lsr" => 0x04018000,
	"sel" => 0x0520C000,
	"mov" => 0x0520C000,
	"ptrue" => 0x2518E000,
	"pfalse" => 0x2518E400,
	"ptrues" => 0x2519E000,
	"pnext" => 0x2519C400,
	"ld4w" => 0xA560E000,
	"st4w" => 0xE570E000,
	"st1w" => 0xE500E000,
	"ld1w" => 0xA540A000,
	"ld1rw" => 0x8540C000,
	"revh" => 0x05258000);

# Element-size field values for b/h/s/d suffixes.
my %tsize = (
	'b' => 0,
	'h' => 1,
	's' => 2,
	'd' => 3);

my %sf = (
	"w" => 0,
	"x" => 1);

# SVE predicate-pattern immediate encodings.
my %pattern = (
	"POW2" => 0,
	"VL1" => 1,
	"VL2" => 2,
	"VL3" => 3,
	"VL4" => 4,
	"VL5" => 5,
	"VL6" => 6,
	"VL7" => 7,
	"VL8" => 8,
	"VL16" => 9,
	"VL32" => 10,
	"VL64" => 11,
	"VL128" => 12,
	"VL256" => 13,
	"MUL4" => 29,
	"MUL3" => 30,
	"ALL" => 31);

# Debug aid: write a helper shell script that assembles a single
# instruction with a real toolchain and disassembles it, so verify_inst()
# can cross-check our hand encoding.  Only used when $debug_encoder == 1.
sub create_verifier {
	my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	open(FH, '>', $filename) or die $!;
	print FH $scripts;
	close(FH);
	system("chmod a+x ./compile_sve.sh");
}

# Assemble one instruction with the reference toolchain; returns its hex
# encoding as printed by objdump.
sub compile_sve {
	return `./compile_sve.sh '@_'`
}

# Emit a .inst word; in debug mode first compare our encoding against the
# reference toolchain's.
sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

# Map a scalar register name fragment to its 5-bit encoding ("zr" -> 31).
sub reg_code {
	my $code = shift;

	# BUGFIX: was "$code == 'zr'" — numeric comparison numifies both
	# sides to 0, so register "0" also matched and encoded as 31.
	if ($code eq "zr") {
		return "31";
	}
	return $code;
}

# Build the tszh:tszl/imm3 field for unpredicated shift-by-immediate and
# xar; lsr/xar encode "2*esize - shift".
sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

# Same field layout for predicated shift-by-immediate (immediate lands in
# bits [9:5] instead of [20:16]).
sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}

# Encode an unpredicated SVE instruction from its textual operands.
sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		# tbl-style: Zd.T, { Zn.T }, Zm.T
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
					$inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# shift by immediate; immediate must be < element size
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
						$inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			my $encoded_size = 0;
			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
				$1 == $regd) {
				# destructive three-operand form with immediate (xar)
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
					$inst)
	}
	sprintf "%s // fail to parse", $inst;
}

# Encode a predicated SVE instruction (loads/stores, p-register ops,
# predicated ALU forms) from its textual operands.
sub sve_pred {
	# BUGFIX: signature had a stray double comma: my ($mnemonic,,$arg)=@_;
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			# scalar-plus-scalar addressing
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			# gather: scalar-plus-vector with SXTW/UXTW extend
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			# scalar-plus-immediate (MUL VL)
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			# BUGFIX: condition read "$mode == 'm'" — $mode was never
			# declared and '==' compares strings numerically (both sides
			# numify to 0), so the guard was vacuously true.  The merging
			# qualifier captured above is $mod and must match 'm'.
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& $regd == $1
				&& $mod eq 'm'
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				# mov Zd,Pg/M,Zn is sel Zd,Pg,Zn,Zd
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			# whilelo/whilelt Pd.T, Rn, Rm — fixed mojibake: the two
			# operand fields read "(®_code(...)" instead of "&reg_code".
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			# ptrue/ptrues with a %pattern keyword
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}

	sprintf "%s // fail to parse", $inst;
}

# Encode the remaining forms: cntp, addvl, incw/cntw with patterns, and
# movprfx-style z,z moves.
sub sve_other {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16), $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
}

# Copy this script's leading '#' comment banner to the output as '//'
# assembler comments.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

if ($debug_encoder == 1) {
	&create_verifier();
}

# Post-process the accumulated $code: evaluate `...` escapes, then replace
# every SVE mnemonic with its .inst encoding and print the result.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";