#! /usr/bin/env perl
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));

sub bn_mod_add() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Addition
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adcs	$s3,$s3,$s7
	adc	$t4,xzr,xzr

	// Load polynomial
	adrp	x2,$mod
	add	x2,x2,:lo12:$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Backup Addition
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Sub polynomial
	subs	$t0,$t0,$s4
	sbcs	$t1,$t1,$s5
	sbcs	$t2,$t2,$s6
	sbcs	$t3,$t3,$s7
	sbcs	$t4,$t4,xzr

	// Select based on carry
	csel	$s0,$s0,$t0,cc
	csel	$s1,$s1,$t1,cc
	csel	$s2,$s2,$t2,cc
	csel	$s3,$s3,$t3,cc

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}

sub bn_mod_sub() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Subtraction
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbc	$t4,xzr,xzr

	// Load polynomial
	adrp	x2,$mod
	add	x2,x2,:lo12:$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Backup subtraction
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Add polynomial
	adds	$t0,$t0,$s4
	adcs	$t1,$t1,$s5
	adcs	$t2,$t2,$s6
	adcs	$t3,$t3,$s7
	tst	$t4,$t4

	// Select based on carry
	csel	$s0,$s0,$t0,eq
	csel	$s1,$s1,$t1,eq
	csel	$s2,$s2,$t2,eq
	csel	$s3,$s3,$t3,eq

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}

sub bn_mod_div_by_2() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]

	// Save the least significant bit
	mov	$t0,$s0

	// Right shift 1
	extr	$s0,$s1,$s0,#1
	extr	$s1,$s2,$s1,#1
	extr	$s2,$s3,$s2,#1
	lsr	$s3,$s3,#1

	// Load mod
	adrp	x2,$mod
	add	x2,x2,:lo12:$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Parity check
	tst	$t0,#1
	csel	$s4,xzr,$s4,eq
	csel	$s5,xzr,$s5,eq
	csel	$s6,xzr,$s6,eq
	csel	$s7,xzr,$s7,eq

	// Add
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adc	$s3,$s3,$s7

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
{
$code.=<<___;
#include "arm_arch.h"
.arch	armv8-a
.rodata

.align	5
// The SM2 field modulus p
.Lpoly:
.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The SM2 group order n
.Lord:
.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
// (p + 1) / 2
.Lpoly_div_2:
.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
// (n + 1) / 2
.Lord_div_2:
.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff

.text

// void bn_rshift1(BN_ULONG *a);
.globl	bn_rshift1
.type	bn_rshift1,%function
.align	5
bn_rshift1:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x0]
	ldp	$s2,$s3,[x0,#16]

	// Right shift
	extr	$s0,$s1,$s0,#1
	extr	$s1,$s2,$s1,#1
	extr	$s2,$s3,$s2,#1
	lsr	$s3,$s3,#1

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	ret
.size	bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	bn_sub
.type	bn_sub,%function
.align	5
bn_sub:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

	// Subtraction
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbc	$s3,$s3,$s7

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	ret
.size	bn_sub,.-bn_sub
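// Note on the two halving routines below: for an odd modulus m and an input
// a < m, a/2 mod m equals (a >> 1) plus (m + 1)/2 when a is odd, so
// bn_mod_div_by_2() is given the precomputed constants
// .Lpoly_div_2 = (p + 1)/2 and .Lord_div_2 = (n + 1)/2 rather than p and n.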
// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2
.type	ecp_sm2p256_div_by_2,%function
.align	5
ecp_sm2p256_div_by_2:
	AARCH64_VALID_CALL_TARGET
___
	&bn_mod_div_by_2(".Lpoly_div_2");
$code.=<<___;
	ret
.size	ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2_mod_ord
.type	ecp_sm2p256_div_by_2_mod_ord,%function
.align	5
ecp_sm2p256_div_by_2_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
	&bn_mod_div_by_2(".Lord_div_2");
$code.=<<___;
	ret
.size	ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_mul_by_3
.type	ecp_sm2p256_mul_by_3,%function
.align	5
ecp_sm2p256_mul_by_3:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]

	// 2*a
	adds	$s0,$s0,$s0
	adcs	$s1,$s1,$s1
	adcs	$s2,$s2,$s2
	adcs	$s3,$s3,$s3
	adcs	$t4,xzr,xzr

	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Sub polynomial
	adrp	x2,.Lpoly
	add	x2,x2,:lo12:.Lpoly
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbcs	$t4,$t4,xzr

	csel	$s0,$s0,$t0,cs
	csel	$s1,$s1,$t1,cs
	csel	$s2,$s2,$t2,cs
	csel	$s3,$s3,$t3,cs
	eor	$t4,$t4,$t4

	// 3*a
	ldp	$s4,$s5,[x1]
	ldp	$s6,$s7,[x1,#16]
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adcs	$s3,$s3,$s7
	adcs	$t4,xzr,xzr

	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3

	// Sub polynomial
	adrp	x2,.Lpoly
	add	x2,x2,:lo12:.Lpoly
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbcs	$t4,$t4,xzr

	csel	$s0,$s0,$t0,cs
	csel	$s1,$s1,$t1,cs
	csel	$s2,$s2,$t2,cs
	csel	$s3,$s3,$t3,cs

	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	ret
.size	ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_add
.type	ecp_sm2p256_add,%function
.align	5
ecp_sm2p256_add:
	AARCH64_VALID_CALL_TARGET
___
	&bn_mod_add(".Lpoly");
$code.=<<___;
	ret
.size	ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub
.type	ecp_sm2p256_sub,%function
.align	5
ecp_sm2p256_sub:
	AARCH64_VALID_CALL_TARGET
___
	&bn_mod_sub(".Lpoly");
$code.=<<___;
	ret
.size	ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub_mod_ord
.type	ecp_sm2p256_sub_mod_ord,%function
.align	5
ecp_sm2p256_sub_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
	&bn_mod_sub(".Lord");
$code.=<<___;
	ret
.size	ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

.macro RDC
	// a = | s7 | ... | s0 |, where si are 64-bit quantities
	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
	//                  |    s7     |    s6     |    s5     |    s4     |
	//                  | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
	//                  |    s3     |    s2     |    s1     |    s0     |
	//                  | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
	// =================================================
	// | a8  | a11 | a10 | a9  | a8  |  0  |    s4     | (+)
	// | a9  | a15 |    s6     | a11 |  0  | a10 | a9  | (+)
	// | a10 |  0  | a14 | a13 | a12 |  0  |    s5     | (+)
	// | a11 |  0  |    s7     | a13 |  0  | a12 | a11 | (+)
	// | a12 |  0  |    s7     | a13 |  0  |    s6     | (+)
	// | a12 |  0  |  0  | a15 | a14 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  | a15 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |    s7     |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |  0  |  0  |  0  |  0  |  0  | a8  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a9  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a13 |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a14 |  0  |  0  | (-)
	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |

	// 1. 64-bit addition
	// t2=s6+s7+s7
	adds	$t2,$s6,$s7
	adcs	$t1,xzr,xzr
	adds	$t2,$t2,$s7
	adcs	$t1,$t1,xzr
	// t3=s4+s5+t2
	adds	$t3,$s4,$t2
	adcs	$t4,$t1,xzr
	adds	$t3,$t3,$s5
	adcs	$t4,$t4,xzr
	// sum
	adds	$s0,$s0,$t3
	adcs	$s1,$s1,$t4
	adcs	$s2,$s2,$t2
	adcs	$s3,$s3,$s7
	adcs	$t0,xzr,xzr
	adds	$s3,$s3,$t1
	adcs	$t0,$t0,xzr

	stp	$s0,$s1,[sp,#32]
	stp	$s2,$s3,[sp,#48]

	// 2. 64-bit to 32-bit spread
	mov	$t1,#0xffffffff
	mov	$s0,$s4
	mov	$s1,$s5
	mov	$s2,$s6
	mov	$s3,$s7
	and	$s0,$s0,$t1	// a8
	and	$s1,$s1,$t1	// a10
	and	$s2,$s2,$t1	// a12
	and	$s3,$s3,$t1	// a14
	lsr	$s4,$s4,#32	// a9
	lsr	$s5,$s5,#32	// a11
	lsr	$s6,$s6,#32	// a13
	lsr	$s7,$s7,#32	// a15

	// 3. 32-bit addition
	add	$t1,$a14,$a12	// t1 <- a12 + a14
	add	$t2,$a15,$a13	// t2 <- a13 + a15
	add	$t3,$a8,$a9	// t3 <- a8 + a9
	add	$t4,$a14,$a10	// t4 <- a10 + a14
	add	$a15,$a15,$a11	// a15 <- a11 + a15
	add	$a12,$t2,$t1	// a12 <- a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$t3	// a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$a11	// a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	add	$a12,$a12,$a13	// a12 <- a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a11	// a12 <- a11 + a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a8	// a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
	add	$t3,$t3,$a14	// t3 <- a8 + a9 + a14
	add	$t3,$t3,$a13	// t3 <- a8 + a9 + a13 + a14
	add	$a9,$a9,$t2	// a9 <- a9 + a13 + a15
	add	$a11,$a11,$a9	// a11 <- a9 + a11 + a13 + a15
	add	$a11,$a11,$t2	// a11 <- a9 + a11 + 2*(a13 + a15)
	add	$t1,$t1,$t4	// t1 <- a10 + a12 + 2*a14

	// U[0]  s5   a9 + a11 + 2*(a13 + a15)
	// U[1]  t1   a10 + a12 + 2*a14
	// U[2] -t3   a8 + a9 + a13 + a14
	// U[3]  s2   a8 + a11 + a12 + 2*a13 + a14 + a15
	// U[4]  s4   a9 + a13 + a15
	// U[5]  t4   a10 + a14
	// U[6]  s7   a11 + a15
	// U[7]  s1   a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

	// 4. 32-bit to 64-bit
	lsl	$s0,$t1,#32
	extr	$t1,$s2,$t1,#32
	extr	$s2,$t4,$s2,#32
	extr	$t4,$s1,$t4,#32
	lsr	$s1,$s1,#32

	// 5. 64-bit addition
	adds	$s5,$s5,$s0
	adcs	$t1,$t1,xzr
	adcs	$s4,$s4,$s2
	adcs	$s7,$s7,$t4
	adcs	$t0,$t0,$s1

	// V[0] s5
	// V[1] t1
	// V[2] s4
	// V[3] s7
	// carry t0
	// sub t3

	// 6. Process s0-s3
	ldp	$s0,$s1,[sp,#32]
	ldp	$s2,$s3,[sp,#48]
	// add with V0-V3
	adds	$s0,$s0,$s5
	adcs	$s1,$s1,$t1
	adcs	$s2,$s2,$s4
	adcs	$s3,$s3,$s7
	adcs	$t0,$t0,xzr
	// sub with t3
	subs	$s1,$s1,$t3
	sbcs	$s2,$s2,xzr
	sbcs	$s3,$s3,xzr
	sbcs	$t0,$t0,xzr

	// 7. MOD
	// First Mod
	lsl	$t1,$t0,#32
	subs	$t2,$t1,$t0

	adds	$s0,$s0,$t0
	adcs	$s1,$s1,$t2
	adcs	$s2,$s2,xzr
	adcs	$s3,$s3,$t1

	// Last Mod
	// return y - p if y >= p else y
	mov	$s4,$s0
	mov	$s5,$s1
	mov	$s6,$s2
	mov	$s7,$s3

	adrp	$t0,.Lpoly
	add	$t0,$t0,:lo12:.Lpoly
	ldp	$t1,$t2,[$t0]
	ldp	$t3,$t4,[$t0,#16]

	adcs	$t5,xzr,xzr

	subs	$s0,$s0,$t1
	sbcs	$s1,$s1,$t2
	sbcs	$s2,$s2,$t3
	sbcs	$s3,$s3,$t4
	sbcs	$t5,$t5,xzr

	csel	$s0,$s0,$s4,cs
	csel	$s1,$s1,$s5,cs
	csel	$s2,$s2,$s6,cs
	csel	$s3,$s3,$s7,cs

.endm
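// Notes on the RDC macro above, as used by ecp_sm2p256_mul and
// ecp_sm2p256_sqr below:
// - It folds the high half of the 512-bit product back into the low half
//   using the SM2-specific identity 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p),
//   which is what the rows of the table at the top of the macro encode.
// - It expects the product in s0-s7 (s0 least significant), returns the
//   reduced value in s0-s3, clobbers t0-t5, and spills four 64-bit words at
//   [sp,#32] and [sp,#48], so callers reserve an 80-byte stack frame first.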
// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	ecp_sm2p256_mul
.type	ecp_sm2p256_mul,%function
.align	5
ecp_sm2p256_mul:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x16,x17,[sp,#16]
	stp	x19,x20,[sp,#64]

	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]

// ### multiplication ###
	// ========================
	//             s3 s2 s1 s0
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s0 s0 s0 s0
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s1 s1 s1 s1
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s2 s2 s2 s2
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s3 s3 s3 s3
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================

// ### s0*s4 ###
	mul	$t5,$s0,$s4
	umulh	$t2,$s0,$s4

// ### s1*s4 + s0*s5 ###
	mul	$t0,$s1,$s4
	umulh	$t1,$s1,$s4
	adds	$t2,$t2,$t0
	adcs	$t3,$t1,xzr

	mul	$t0,$s0,$s5
	umulh	$t1,$s0,$s5
	adds	$t2,$t2,$t0
	adcs	$t3,$t3,$t1
	adcs	$t4,xzr,xzr

// ### s2*s4 + s1*s5 + s0*s6 ###
	mul	$t0,$s2,$s4
	umulh	$t1,$s2,$s4
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1

	mul	$t0,$s1,$s5
	umulh	$t1,$s1,$s5
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	adcs	$t6,xzr,xzr

	mul	$t0,$s0,$s6
	umulh	$t1,$s0,$s6
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	adcs	$t6,$t6,xzr

// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
	mul	$t0,$s3,$s4
	umulh	$t1,$s3,$s4
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,xzr,xzr

	mul	$t0,$s2,$s5
	umulh	$t1,$s2,$s5
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr

	mul	$t0,$s1,$s6
	umulh	$t1,$s1,$s6
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr

	mul	$t0,$s0,$s7
	umulh	$t1,$s0,$s7
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr

// ### s3*s5 + s2*s6 + s1*s7 ###
	mul	$t0,$s3,$s5
	umulh	$t1,$s3,$s5
	adds	$t6,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,xzr,xzr

	mul	$t0,$s2,$s6
	umulh	$t1,$s2,$s6
	adds	$t6,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,$t8,xzr

	mul	$t0,$s1,$s7
	umulh	$t1,$s1,$s7
	adds	$s4,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,$t8,xzr

// ### s3*s6 + s2*s7 ###
	mul	$t0,$s3,$s6
	umulh	$t1,$s3,$s6
	adds	$t7,$t7,$t0
	adcs	$t8,$t8,$t1
	adcs	$t6,xzr,xzr

	mul	$t0,$s2,$s7
	umulh	$t1,$s2,$s7
	adds	$s5,$t7,$t0
	adcs	$t8,$t8,$t1
	adcs	$t6,$t6,xzr

// ### s3*s7 ###
	mul	$t0,$s3,$s7
	umulh	$t1,$s3,$s7
	adds	$s6,$t8,$t0
	adcs	$s7,$t6,$t1

	mov	$s0,$t5
	mov	$s1,$t2
	mov	$s2,$t3
	mov	$s3,$t4

	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
	RDC

	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp	x16,x17,[sp,#16]
	ldp	x19,x20,[sp,#64]
	ldp	x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_sm2p256_mul,.-ecp_sm2p256_mul
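// ecp_sm2p256_sqr below follows the same product-then-RDC pattern as
// ecp_sm2p256_mul, but computes each off-diagonal limb product only once,
// doubles that partial sum, and then adds in the diagonal squares
// s4*s4 .. s7*s7.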
// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
.globl	ecp_sm2p256_sqr
.type	ecp_sm2p256_sqr,%function
.align	5
ecp_sm2p256_sqr:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x16,x17,[sp,#16]
	stp	x19,x20,[sp,#64]

	// Load inputs
	ldp	$s4,$s5,[x1]
	ldp	$s6,$s7,[x1,#16]

// ### square ###
	// ========================
	//             s7 s6 s5 s4
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s4 s4 s4 s4
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s5 s5 s5 s5
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s6 s6 s6 s6
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s7 s7 s7 s7
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================

// ### s4*s5 ###
	mul	$s1,$s4,$s5
	umulh	$s2,$s4,$s5

// ### s4*s6 ###
	mul	$t0,$s6,$s4
	umulh	$s3,$s6,$s4
	adds	$s2,$s2,$t0
	adcs	$s3,$s3,xzr

// ### s4*s7 + s5*s6 ###
	mul	$t0,$s7,$s4
	umulh	$t1,$s7,$s4
	adds	$s3,$s3,$t0
	adcs	$s0,$t1,xzr

	mul	$t0,$s6,$s5
	umulh	$t1,$s6,$s5
	adds	$s3,$s3,$t0
	adcs	$s0,$s0,$t1
	adcs	$t2,xzr,xzr

// ### s5*s7 ###
	mul	$t0,$s7,$s5
	umulh	$t1,$s7,$s5
	adds	$s0,$s0,$t0
	adcs	$t2,$t2,$t1

// ### s6*s7 ###
	mul	$t0,$s7,$s6
	umulh	$t1,$s7,$s6
	adds	$t2,$t2,$t0
	adcs	$t3,$t1,xzr

// ### 2*(t3,t2,s0,s3,s2,s1) ###
	adds	$s1,$s1,$s1
	adcs	$s2,$s2,$s2
	adcs	$s3,$s3,$s3
	adcs	$s0,$s0,$s0
	adcs	$t2,$t2,$t2
	adcs	$t3,$t3,$t3
	adcs	$t4,xzr,xzr

// ### s4*s4 ###
	mul	$t5,$s4,$s4
	umulh	$t6,$s4,$s4

// ### s5*s5 ###
	mul	$s4,$s5,$s5
	umulh	$s5,$s5,$s5

// ### s6*s6 ###
	mul	$t0,$s6,$s6
	umulh	$t1,$s6,$s6

// ### s7*s7 ###
	mul	$t7,$s7,$s7
	umulh	$t8,$s7,$s7

	adds	$s1,$s1,$t6
	adcs	$s2,$s2,$s4
	adcs	$s3,$s3,$s5
	adcs	$s0,$s0,$t0
	adcs	$t2,$t2,$t1
	adcs	$t3,$t3,$t7
	adcs	$t4,$t4,$t8

	mov	$s4,$s0
	mov	$s0,$t5
	mov	$s5,$t2
	mov	$s6,$t3
	mov	$s7,$t4

	// result of sqr: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
	RDC

	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp	x16,x17,[sp,#16]
	ldp	x19,x20,[sp,#64]
	ldp	x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
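# Build note (usual perlasm flow, shown here only as an example): the flavour
# is passed first and the output file last, matching the argument handling at
# the top of this file, e.g.
#     perl ecp_sm2p256-armv8.pl linux64 ecp_sm2p256-armv8.S
# where the file names above are illustrative.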