#! /usr/bin/env perl
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# perlasm generator for AArch64 SM2 P-256 field arithmetic.
#
# The assembly text is accumulated in $code and finally printed to a pipe
# that feeds arm-xlate.pl, which is invoked with the requested flavour and
# output file.

use strict;
use warnings;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Directory part of the script path (with trailing separator); empty when
# the script was started without any directory component.
my ($dir) = $0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = "" unless defined $dir;

my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Missing arguments are passed on as empty strings, exactly as an undefined
# value would interpolate.
my $flavour_arg = defined $flavour ? $flavour : "";
my $output_arg = defined $output ? $output : "";

# The translator path is quoted so that a source tree located under a
# directory containing spaces still yields a valid shell command.
open OUT,"| \"$^X\" \"$xlate\" $flavour_arg \"$output_arg\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register allocation.  The $s* and $a* lists name the SAME registers
# (x7..x14): $s* is used while the registers hold 64-bit limbs, $a* while
# they hold the 32-bit halves produced by the "spread" step of RDC.
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));

# Generated assembly is appended here by the helpers and the main block.
my $code = "";

# bn_mod_add($mod)
#
# Appends the body of a 256-bit modular addition to $code: loads four limbs
# from [x1] and four from [x2], adds them, subtracts the four-limb constant
# stored at label $mod, keeps the un-subtracted sum if that subtraction
# borrowed, and stores the result to [x0].  No "ret" is emitted.
sub bn_mod_add {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Addition
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adcs $s3,$s3,$s7
	adc $t4,xzr,xzr

	// Load polynomial
	adr x2,$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Backup Addition
	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	subs $t0,$t0,$s4
	sbcs $t1,$t1,$s5
	sbcs $t2,$t2,$s6
	sbcs $t3,$t3,$s7
	sbcs $t4,$t4,xzr

	// Select based on carry
	csel $s0,$s0,$t0,cc
	csel $s1,$s1,$t1,cc
	csel $s2,$s2,$t2,cc
	csel $s3,$s3,$t3,cc

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}

# bn_mod_sub($mod)
#
# Appends the body of a 256-bit modular subtraction to $code: computes
# [x1] - [x2], records the borrow as an all-ones/zero mask, adds the
# four-limb constant at label $mod to a copy, keeps the plain difference
# when there was no borrow, and stores the result to [x0].  No "ret" is
# emitted.
sub bn_mod_sub {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Subtraction
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbc $t4,xzr,xzr

	// Load polynomial
	adr x2,$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Backup subtraction
	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Add polynomial
	adds $t0,$t0,$s4
	adcs $t1,$t1,$s5
	adcs $t2,$t2,$s6
	adcs $t3,$t3,$s7
	tst $t4,$t4

	// Select based on carry
	csel $s0,$s0,$t0,eq
	csel $s1,$s1,$t1,eq
	csel $s2,$s2,$t2,eq
	csel $s3,$s3,$t3,eq

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}

# bn_mod_div_by_2($mod)
#
# Appends the body of a modular halving to $code: shifts the four limbs at
# [x1] right by one bit and, when the original value was odd, adds the
# four-limb constant at label $mod (the callers below pass the labels of the
# precomputed (m + 1) / 2 tables).  The result is stored to [x0].  No "ret"
# is emitted.
sub bn_mod_div_by_2 {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]

	// Save the least significant bit
	mov $t0,$s0

	// Right shift 1
	extr $s0,$s1,$s0,#1
	extr $s1,$s2,$s1,#1
	extr $s2,$s3,$s2,#1
	lsr $s3,$s3,#1

	// Load mod
	adr x2,$mod
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Parity check
	tst $t0,#1
	csel $s4,xzr,$s4,eq
	csel $s5,xzr,$s5,eq
	csel $s6,xzr,$s6,eq
	csel $s7,xzr,$s7,eq

	// Add
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adc $s3,$s3,$s7

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]
___
}

{
$code.=<<___;
#include "arm_arch.h"
.arch  armv8-a
.text

.align	5
// The polynomial p
.Lpoly:
.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The order of polynomial n
.Lord:
.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
// (p + 1) / 2
.Lpoly_div_2:
.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
// (n + 1) / 2
.Lord_div_2:
.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff

// void bn_rshift1(BN_ULONG *a);
.globl	bn_rshift1
.type	bn_rshift1,%function
.align	5
bn_rshift1:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp $s0,$s1,[x0]
	ldp $s2,$s3,[x0,#16]

	// Right shift
	extr $s0,$s1,$s0,#1
	extr $s1,$s2,$s1,#1
	extr $s2,$s3,$s2,#1
	lsr $s3,$s3,#1

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	ret
.size bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	bn_sub
.type	bn_sub,%function
.align	5
bn_sub:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

	// Subtraction
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbc $s3,$s3,$s7

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	ret
.size bn_sub,.-bn_sub

// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2
.type	ecp_sm2p256_div_by_2,%function
.align	5
ecp_sm2p256_div_by_2:
	AARCH64_VALID_CALL_TARGET
___
	bn_mod_div_by_2(".Lpoly_div_2");
$code.=<<___;
	ret
.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2_mod_ord
.type	ecp_sm2p256_div_by_2_mod_ord,%function
.align	5
ecp_sm2p256_div_by_2_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
	bn_mod_div_by_2(".Lord_div_2");
$code.=<<___;
	ret
.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_mul_by_3
.type	ecp_sm2p256_mul_by_3,%function
.align	5
ecp_sm2p256_mul_by_3:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]

	// 2*a
	adds $s0,$s0,$s0
	adcs $s1,$s1,$s1
	adcs $s2,$s2,$s2
	adcs $s3,$s3,$s3
	adcs $t4,xzr,xzr

	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	adr x2,.Lpoly
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbcs $t4,$t4,xzr

	csel $s0,$s0,$t0,cs
	csel $s1,$s1,$t1,cs
	csel $s2,$s2,$t2,cs
	csel $s3,$s3,$t3,cs
	eor $t4,$t4,$t4

	// 3*a
	ldp $s4,$s5,[x1]
	ldp $s6,$s7,[x1,#16]
	adds $s0,$s0,$s4
	adcs $s1,$s1,$s5
	adcs $s2,$s2,$s6
	adcs $s3,$s3,$s7
	adcs $t4,xzr,xzr

	mov $t0,$s0
	mov $t1,$s1
	mov $t2,$s2
	mov $t3,$s3

	// Sub polynomial
	adr x2,.Lpoly
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]
	subs $s0,$s0,$s4
	sbcs $s1,$s1,$s5
	sbcs $s2,$s2,$s6
	sbcs $s3,$s3,$s7
	sbcs $t4,$t4,xzr

	csel $s0,$s0,$t0,cs
	csel $s1,$s1,$t1,cs
	csel $s2,$s2,$t2,cs
	csel $s3,$s3,$t3,cs

	// Store results
	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	ret
.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_add
.type	ecp_sm2p256_add,%function
.align	5
ecp_sm2p256_add:
	AARCH64_VALID_CALL_TARGET
___
	bn_mod_add(".Lpoly");
$code.=<<___;
	ret
.size ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub
.type	ecp_sm2p256_sub,%function
.align	5
ecp_sm2p256_sub:
	AARCH64_VALID_CALL_TARGET
___
	bn_mod_sub(".Lpoly");
$code.=<<___;
	ret
.size ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub_mod_ord
.type	ecp_sm2p256_sub_mod_ord,%function
.align	5
ecp_sm2p256_sub_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
	bn_mod_sub(".Lord");
$code.=<<___;
	ret
.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

// RDC reduces the 512-bit value held in s7..s0 to four limbs in s3..s0.
// It spills intermediate limbs to [sp,#32]..[sp,#63], so the caller must
// have reserved that part of its stack frame.
.macro RDC
	// a = |  s7   | ... | s0  |, where si are 64-bit quantities
	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
	// |    s7     |    s6     |    s5     |    s4     |
	// | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
	// |    s3     |    s2     |    s1     |    s0     |
	// | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
	// =================================================
	// | a8  | a11 | a10 | a9  | a8  |  0  |    s4     | (+)
	// | a9  | a15 |    s6     | a11 |  0  | a10 | a9  | (+)
	// | a10 |  0  | a14 | a13 | a12 |  0  |    s5     | (+)
	// | a11 |  0  |    s7     | a13 |  0  | a12 | a11 | (+)
	// | a12 |  0  |    s7     | a13 |  0  |    s6     | (+)
	// | a12 |  0  |  0  | a15 | a14 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  | a15 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |    s7     |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |  0  |  0  |  0  |  0  |  0  | a8  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a9  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a13 |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a14 |  0  |  0  | (-)
	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |

	// 1. 64-bit addition
	// t2=s6+s7+s7
	adds $t2,$s6,$s7
	adcs $t1,xzr,xzr
	adds $t2,$t2,$s7
	adcs $t1,$t1,xzr
	// t3=s4+s5+t2
	adds $t3,$s4,$t2
	adcs $t4,$t1,xzr
	adds $t3,$t3,$s5
	adcs $t4,$t4,xzr
	// sum
	adds $s0,$s0,$t3
	adcs $s1,$s1,$t4
	adcs $s2,$s2,$t2
	adcs $s3,$s3,$s7
	adcs $t0,xzr,xzr
	adds $s3,$s3,$t1
	adcs $t0,$t0,xzr

	stp $s0,$s1,[sp,#32]
	stp $s2,$s3,[sp,#48]

	// 2. 64-bit to 32-bit spread
	mov $t1,#0xffffffff
	mov $s0,$s4
	mov $s1,$s5
	mov $s2,$s6
	mov $s3,$s7
	and $s0,$s0,$t1 // a8
	and $s1,$s1,$t1 // a10
	and $s2,$s2,$t1 // a12
	and $s3,$s3,$t1 // a14
	lsr $s4,$s4,#32 // a9
	lsr $s5,$s5,#32 // a11
	lsr $s6,$s6,#32 // a13
	lsr $s7,$s7,#32 // a15

	// 3. 32-bit addition
	add $t1,$a14,$a12  // t1 <- a12 + a14
	add $t2,$a15,$a13  // t2 <- a13 + a15
	add $t3,$a8,$a9    // t3 <- a8 + a9
	add $t4,$a14,$a10  // t4 <- a10 + a14
	add $a15,$a15,$a11 // a15 <- a11 + a15
	add $a12,$t2,$t1   // a12 <- a12 + a13 + a14 + a15
	add $a10,$a10,$a12 // a10 <- a10 + a12 + a13 + a14 + a15
	add $a10,$a10,$a12 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
	add $a10,$a10,$t3  // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
	add $a10,$a10,$a11 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	add $a12,$a12,$a13 // a12 <- a12 + 2*a13 + a14 + a15
	add $a12,$a12,$a11 // a12 <- a11 + a12 + 2*a13 + a14 + a15
	add $a12,$a12,$a8  // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
	add $t3,$t3,$a14   // t3 <- a8 + a9 + a14
	add $t3,$t3,$a13   // t3 <- a8 + a9 + a13 + a14
	add $a9,$a9,$t2    // a9 <- a9 + a13 + a15
	add $a11,$a11,$a9  // a11 <- a9 + a11 + a13 + a15
	add $a11,$a11,$t2  // a11 <- a9 + a11 + 2*(a13 + a15)
	add $t1,$t1,$t4    // t1 <- a10 + a12 + 2*a14

	// U[0]  s5	a9 + a11 + 2*(a13 + a15)
	// U[1]  t1	a10 + a12 + 2*a14
	// U[2] -t3	a8 + a9 + a13 + a14
	// U[3]  s2	a8 + a11 + a12 + 2*a13 + a14 + a15
	// U[4]  s4	a9 + a13 + a15
	// U[5]  t4	a10 + a14
	// U[6]  s7	a11 + a15
	// U[7]  s1	a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

	// 4. 32-bit to 64-bit
	lsl $s0,$t1,#32
	extr $t1,$s2,$t1,#32
	extr $s2,$t4,$s2,#32
	extr $t4,$s1,$t4,#32
	lsr $s1,$s1,#32

	// 5. 64-bit addition
	adds $s5,$s5,$s0
	adcs $t1,$t1,xzr
	adcs $s4,$s4,$s2
	adcs $s7,$s7,$t4
	adcs $t0,$t0,$s1

	// V[0]	s5
	// V[1]	t1
	// V[2]	s4
	// V[3]	s7
	// carry	t0
	// sub	t3

	// 6. Process s0-s3
	ldp $s0,$s1,[sp,#32]
	ldp $s2,$s3,[sp,#48]
	// add with V0-V3
	adds $s0,$s0,$s5
	adcs $s1,$s1,$t1
	adcs $s2,$s2,$s4
	adcs $s3,$s3,$s7
	adcs $t0,$t0,xzr
	// sub with t3
	subs $s1,$s1,$t3
	sbcs $s2,$s2,xzr
	sbcs $s3,$s3,xzr
	sbcs $t0,$t0,xzr

	// 7. MOD
	// First Mod
	lsl $t1,$t0,#32
	subs $t2,$t1,$t0

	adds $s0,$s0,$t0
	adcs $s1,$s1,$t2
	adcs $s2,$s2,xzr
	adcs $s3,$s3,$t1

	// Last Mod
	// return y - p if y >= p else y
	mov $s4,$s0
	mov $s5,$s1
	mov $s6,$s2
	mov $s7,$s3

	adr $t0,.Lpoly
	ldp $t1,$t2,[$t0]
	ldp $t3,$t4,[$t0,#16]

	// mov/adr/ldp leave the flags alone, so this captures the carry
	// out of the "First Mod" addition above
	adcs $t5,xzr,xzr

	subs $s0,$s0,$t1
	sbcs $s1,$s1,$t2
	sbcs $s2,$s2,$t3
	sbcs $s3,$s3,$t4
	sbcs $t5,$t5,xzr

	csel $s0,$s0,$s4,cs
	csel $s1,$s1,$s5,cs
	csel $s2,$s2,$s6,cs
	csel $s3,$s3,$s7,cs

.endm

// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	ecp_sm2p256_mul
.type	ecp_sm2p256_mul,%function
.align	5
ecp_sm2p256_mul:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	// (80-byte frame; [sp,#32]..[sp,#63] is scratch space for RDC)
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x16,x17,[sp,#16]
	stp x19,x20,[sp,#64]

	// Load inputs
	ldp $s0,$s1,[x1]
	ldp $s2,$s3,[x1,#16]
	ldp $s4,$s5,[x2]
	ldp $s6,$s7,[x2,#16]

// ### multiplication ###
	// ========================
	//             s3 s2 s1 s0
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s0 s0 s0 s0
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s1 s1 s1 s1
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s2 s2 s2 s2
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s3 s3 s3 s3
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================

// ### s0*s4 ###
	mul $t5,$s0,$s4
	umulh $t2,$s0,$s4

// ### s1*s4 + s0*s5 ###
	mul $t0,$s1,$s4
	umulh $t1,$s1,$s4
	adds $t2,$t2,$t0
	adcs $t3,$t1,xzr

	mul $t0,$s0,$s5
	umulh $t1,$s0,$s5
	adds $t2,$t2,$t0
	adcs $t3,$t3,$t1
	adcs $t4,xzr,xzr

// ### s2*s4 + s1*s5 + s0*s6 ###
	mul $t0,$s2,$s4
	umulh $t1,$s2,$s4
	adds $t3,$t3,$t0
	adcs $t4,$t4,$t1

	mul $t0,$s1,$s5
	umulh $t1,$s1,$s5
	adds $t3,$t3,$t0
	adcs $t4,$t4,$t1
	adcs $t6,xzr,xzr

	mul $t0,$s0,$s6
	umulh $t1,$s0,$s6
	adds $t3,$t3,$t0
	adcs $t4,$t4,$t1
	adcs $t6,$t6,xzr

// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
	mul $t0,$s3,$s4
	umulh $t1,$s3,$s4
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,xzr,xzr

	mul $t0,$s2,$s5
	umulh $t1,$s2,$s5
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,$t7,xzr

	mul $t0,$s1,$s6
	umulh $t1,$s1,$s6
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,$t7,xzr

	mul $t0,$s0,$s7
	umulh $t1,$s0,$s7
	adds $t4,$t4,$t0
	adcs $t6,$t6,$t1
	adcs $t7,$t7,xzr

// ### s3*s5 + s2*s6 + s1*s7 ###
	mul $t0,$s3,$s5
	umulh $t1,$s3,$s5
	adds $t6,$t6,$t0
	adcs $t7,$t7,$t1
	adcs $t8,xzr,xzr

	mul $t0,$s2,$s6
	umulh $t1,$s2,$s6
	adds $t6,$t6,$t0
	adcs $t7,$t7,$t1
	adcs $t8,$t8,xzr

	mul $t0,$s1,$s7
	umulh $t1,$s1,$s7
	adds $s4,$t6,$t0
	adcs $t7,$t7,$t1
	adcs $t8,$t8,xzr

// ### s3*s6 + s2*s7 ###
	mul $t0,$s3,$s6
	umulh $t1,$s3,$s6
	adds $t7,$t7,$t0
	adcs $t8,$t8,$t1
	adcs $t6,xzr,xzr

	mul $t0,$s2,$s7
	umulh $t1,$s2,$s7
	adds $s5,$t7,$t0
	adcs $t8,$t8,$t1
	adcs $t6,$t6,xzr

// ### s3*s7 ###
	mul $t0,$s3,$s7
	umulh $t1,$s3,$s7
	adds $s6,$t8,$t0
	adcs $s7,$t6,$t1

	mov $s0,$t5
	mov $s1,$t2
	mov $s2,$t3
	mov $s3,$t4

	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
	RDC

	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp x16,x17,[sp,#16]
	ldp x19,x20,[sp,#64]
	ldp x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_sm2p256_mul,.-ecp_sm2p256_mul

// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
.globl	ecp_sm2p256_sqr
.type	ecp_sm2p256_sqr,%function
.align	5

ecp_sm2p256_sqr:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	// (80-byte frame; [sp,#32]..[sp,#63] is scratch space for RDC)
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x16,x17,[sp,#16]
	stp x19,x20,[sp,#64]

	// Load inputs
	ldp $s4,$s5,[x1]
	ldp $s6,$s7,[x1,#16]

// ### square ###
	// ========================
	//             s7 s6 s5 s4
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s4 s4 s4 s4
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s5 s5 s5 s5
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s6 s6 s6 s6
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s7 s7 s7 s7
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================

// ### s4*s5 ###
	mul $s1,$s4,$s5
	umulh $s2,$s4,$s5

// ### s4*s6 ###
	mul $t0,$s6,$s4
	umulh $s3,$s6,$s4
	adds $s2,$s2,$t0
	adcs $s3,$s3,xzr

// ### s4*s7 + s5*s6 ###
	mul $t0,$s7,$s4
	umulh $t1,$s7,$s4
	adds $s3,$s3,$t0
	adcs $s0,$t1,xzr

	mul $t0,$s6,$s5
	umulh $t1,$s6,$s5
	adds $s3,$s3,$t0
	adcs $s0,$s0,$t1
	adcs $t2,xzr,xzr

// ### s5*s7 ###
	mul $t0,$s7,$s5
	umulh $t1,$s7,$s5
	adds $s0,$s0,$t0
	adcs $t2,$t2,$t1

// ### s6*s7 ###
	mul $t0,$s7,$s6
	umulh $t1,$s7,$s6
	adds $t2,$t2,$t0
	adcs $t3,$t1,xzr

// ### 2*(t3,t2,s0,s3,s2,s1) ###
	adds $s1,$s1,$s1
	adcs $s2,$s2,$s2
	adcs $s3,$s3,$s3
	adcs $s0,$s0,$s0
	adcs $t2,$t2,$t2
	adcs $t3,$t3,$t3
	adcs $t4,xzr,xzr

// ### s4*s4 ###
	mul $t5,$s4,$s4
	umulh $t6,$s4,$s4

// ### s5*s5 ###
	mul $s4,$s5,$s5
	umulh $s5,$s5,$s5

// ### s6*s6 ###
	mul $t0,$s6,$s6
	umulh $t1,$s6,$s6

// ### s7*s7 ###
	mul $t7,$s7,$s7
	umulh $t8,$s7,$s7

	adds $s1,$s1,$t6
	adcs $s2,$s2,$s4
	adcs $s3,$s3,$s5
	adcs $s0,$s0,$t0
	adcs $t2,$t2,$t1
	adcs $t3,$t3,$t7
	adcs $t4,$t4,$t8

	mov $s4,$s0
	mov $s0,$t5
	mov $s5,$t2
	mov $s6,$t3
	mov $s7,$t4

	// result of sqr: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
	RDC

	stp $s0,$s1,[x0]
	stp $s2,$s3,[x0,#16]

	// Restore scalar registers
	ldp x16,x17,[sp,#16]
	ldp x19,x20,[sp,#64]
	ldp x29,x30,[sp],#80

	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
___
}

# Emit the accumulated assembly line by line, evaluating any `...` spans as
# Perl expressions on the way out.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";		# enforce flush