#! /usr/bin/env perl
# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on the processor in question, meaning
# that a dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue a multiplication every cycle, which
# explains the moderate improvement coefficients in comparison to
# compiler-generated code. Recall that the compiler is instructed to use
# umulh and therefore uses the same number of multiplication instructions
# to do the job. Assembly's edge is to minimize the number of "collateral"
# instructions and, of course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

$code.=<<___;
#include "arm_arch.h"
#ifndef __KERNEL__
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst $num,#3
	b.ne .Lmul_mont
	cmp $num,#32
	b.le .Lscalar_impl
#ifndef __KERNEL__
#ifndef __AARCH64EB__
	adrp x17,OPENSSL_armv8_rsa_neonized
	ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz w17, bn_mul8x_mont_neon
#endif
#endif

.Lscalar_impl:
	tst $num,#7
	b.eq __bn_sqr8x_mont
	tst $num,#3
	b.eq __bn_mul4x_mont

.Lmul_mont:
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	ldr $m0,[$bp],#8 // bp[0]
	sub $tp,sp,$num,lsl#3
	ldp $hi0,$aj,[$ap],#16 // ap[0..1]
	lsl $num,$num,#3
	ldr $n0,[$n0] // *n0
	and $tp,$tp,#-16 // ABI says so
	ldp $hi1,$nj,[$np],#16 // np[0..1]

	mul $lo0,$hi0,$m0 // ap[0]*bp[0]
	sub $j,$num,#16 // j=num-2
	umulh $hi0,$hi0,$m0
	mul $alo,$aj,$m0 // ap[1]*bp[0]
	umulh $ahi,$aj,$m0

	mul $m1,$lo0,$n0 // "tp[0]"*n0
	mov sp,$tp // alloca

	// (*) mul $lo1,$hi1,$m1 // np[0]*m1
	umulh $hi1,$hi1,$m1
	mul $nlo,$nj,$m1 // np[1]*m1
	// (*) adds $lo1,$lo1,$lo0 // discarded
	// (*) As for removal of the first multiplication and addition
	//     instructions. The outcome of the first addition is
	//     guaranteed to be zero, which leaves two computationally
	//     significant outcomes: it either carries or not. The
	//     question is when does it carry? Is there an alternative
	//     way to deduce it? If you follow the operations, you can
	//     observe that the condition for carry is quite simple:
	//     $lo0 being non-zero. So the carry can be calculated
	//     by adding -1 to $lo0. That's what the next instruction does.
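	//     A hedged illustration of the same deduction, in C-like
	//     pseudocode (comments only, nothing here is assembled):
	//         lo1   = lo64(np[0]*m1);        // the elided mul
	//         carry = (lo1 + lo0) >> 64;      // the elided adds; sum is 0 mod 2^64
	//     Since m1 = lo0*n0 mod 2^64 makes lo1 + lo0 a multiple of 2^64,
	//     the carry is set exactly when $lo0 is non-zero, which the
	//     subtraction of 1 below reproduces in the carry flag.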
	subs xzr,$lo0,#1 // (*)
	umulh $nhi,$nj,$m1
	adc $hi1,$hi1,xzr
	cbz $j,.L1st_skip

.L1st:
	ldr $aj,[$ap],#8
	adds $lo0,$alo,$hi0
	sub $j,$j,#8 // j--
	adc $hi0,$ahi,xzr

	ldr $nj,[$np],#8
	adds $lo1,$nlo,$hi1
	mul $alo,$aj,$m0 // ap[j]*bp[0]
	adc $hi1,$nhi,xzr
	umulh $ahi,$aj,$m0

	adds $lo1,$lo1,$lo0
	mul $nlo,$nj,$m1 // np[j]*m1
	adc $hi1,$hi1,xzr
	umulh $nhi,$nj,$m1
	str $lo1,[$tp],#8 // tp[j-1]
	cbnz $j,.L1st

.L1st_skip:
	adds $lo0,$alo,$hi0
	sub $ap,$ap,$num // rewind $ap
	adc $hi0,$ahi,xzr

	adds $lo1,$nlo,$hi1
	sub $np,$np,$num // rewind $np
	adc $hi1,$nhi,xzr

	adds $lo1,$lo1,$lo0
	sub $i,$num,#8 // i=num-1
	adcs $hi1,$hi1,$hi0

	adc $ovf,xzr,xzr // upmost overflow bit
	stp $lo1,$hi1,[$tp]

.Louter:
	ldr $m0,[$bp],#8 // bp[i]
	ldp $hi0,$aj,[$ap],#16
	ldr $tj,[sp] // tp[0]
	add $tp,sp,#8

	mul $lo0,$hi0,$m0 // ap[0]*bp[i]
	sub $j,$num,#16 // j=num-2
	umulh $hi0,$hi0,$m0
	ldp $hi1,$nj,[$np],#16
	mul $alo,$aj,$m0 // ap[1]*bp[i]
	adds $lo0,$lo0,$tj
	umulh $ahi,$aj,$m0
	adc $hi0,$hi0,xzr

	mul $m1,$lo0,$n0
	sub $i,$i,#8 // i--

	// (*) mul $lo1,$hi1,$m1 // np[0]*m1
	umulh $hi1,$hi1,$m1
	mul $nlo,$nj,$m1 // np[1]*m1
	// (*) adds $lo1,$lo1,$lo0
	subs xzr,$lo0,#1 // (*)
	umulh $nhi,$nj,$m1
	cbz $j,.Linner_skip

.Linner:
	ldr $aj,[$ap],#8
	adc $hi1,$hi1,xzr
	ldr $tj,[$tp],#8 // tp[j]
	adds $lo0,$alo,$hi0
	sub $j,$j,#8 // j--
	adc $hi0,$ahi,xzr

	adds $lo1,$nlo,$hi1
	ldr $nj,[$np],#8
	adc $hi1,$nhi,xzr

	mul $alo,$aj,$m0 // ap[j]*bp[i]
	adds $lo0,$lo0,$tj
	umulh $ahi,$aj,$m0
	adc $hi0,$hi0,xzr

	mul $nlo,$nj,$m1 // np[j]*m1
	adds $lo1,$lo1,$lo0
	umulh $nhi,$nj,$m1
	stur $lo1,[$tp,#-16] // tp[j-1]
	cbnz $j,.Linner

.Linner_skip:
	ldr $tj,[$tp],#8 // tp[j]
	adc $hi1,$hi1,xzr
	adds $lo0,$alo,$hi0
	sub $ap,$ap,$num // rewind $ap
	adc $hi0,$ahi,xzr

	adds $lo1,$nlo,$hi1
	sub $np,$np,$num // rewind $np
	adcs $hi1,$nhi,$ovf
	adc $ovf,xzr,xzr

	adds $lo0,$lo0,$tj
	adc $hi0,$hi0,xzr

	adds $lo1,$lo1,$lo0
	adcs $hi1,$hi1,$hi0
	adc $ovf,$ovf,xzr // upmost overflow bit
	stp $lo1,$hi1,[$tp,#-16]

	cbnz $i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr $tj,[sp] // tp[0]
	add $tp,sp,#8
	ldr $nj,[$np],#8 // np[0]
	subs $j,$num,#8 // j=num-1 and clear borrow
	mov $ap,$rp
.Lsub:
	sbcs $aj,$tj,$nj // tp[j]-np[j]
	ldr $tj,[$tp],#8
	sub $j,$j,#8 // j--
	ldr $nj,[$np],#8
	str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
	cbnz $j,.Lsub

	sbcs $aj,$tj,$nj
	sbcs $ovf,$ovf,xzr // did it borrow?
	str $aj,[$ap],#8 // rp[num-1]

	ldr $tj,[sp] // tp[0]
	add $tp,sp,#8
	ldr $aj,[$rp],#8 // rp[0]
	sub $num,$num,#8 // num--
	nop
.Lcond_copy:
	sub $num,$num,#8 // num--
	csel $nj,$tj,$aj,lo // did it borrow?
	ldr $tj,[$tp],#8
	ldr $aj,[$rp],#8
	stur xzr,[$tp,#-16] // wipe tp
	stur $nj,[$rp,#-16]
	cbnz $num,.Lcond_copy

	csel $nj,$tj,$aj,lo
	stur xzr,[$tp,#-8] // wipe tp
	stur $nj,[$rp,#-8]

	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldr x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));

$code.=<<___;
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp x29,x30,[sp,#-80]!
	mov x16,sp
	stp d8,d9,[sp,#16]
	stp d10,d11,[sp,#32]
	stp d12,d13,[sp,#48]
	stp d14,d15,[sp,#64]
	lsl $num,$num,#1
	eor $zero.16b,$zero.16b,$zero.16b

.align 4
.LNEON_8n:
	eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b
	sub $toutptr,sp,#128
	eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b
	sub $toutptr,$toutptr,$num,lsl#4
	eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b
	and $toutptr,$toutptr,#-64
	eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b
	mov sp,$toutptr // alloca
	eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b
	add $toutptr,$toutptr,#256
	eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b
	sub $inner,$num,#8
	eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b
	eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
	st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	subs $inner,$inner,#8
	st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
	bne .LNEON_8n_init

	add $tinptr,sp,#256
	ld1 {$A0.4s,$A1.4s},[$aptr],#32
	add $bnptr,sp,#8
	ldr $sM0,[$n0],#4
	mov $outer,$num
	b .LNEON_8n_outer

.align 4
.LNEON_8n_outer:
	ldr $sBi,[$bptr],#4 // *b++
	uxtl $Bi.4s,$Bi.4h
	add $toutptr,sp,#128
	ld1 {$N0.4s,$N1.4s},[$nptr],#32

	umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
	umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
	shl $Ni.2d,@ACC[0].2d,#16
	ext $Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
	add $Ni.2d,$Ni.2d,@ACC[0].2d
	umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
	mul $Ni.2s,$Ni.2s,$M0.2s
	umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
	st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0]
	umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl $Ni.4s,$Ni.4h
	umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=0; $i<7;) {
$code.=<<___;
	ldr $sBi,[$bptr],#4 // *b++
	umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
	uxtl $Bi.4s,$Bi.4h
	umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
	ushr $temp.2d,@ACC[0].2d,#16
	umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
	ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	add @ACC[0].2d,@ACC[0].2d,$temp.2d
	umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr @ACC[0].2d,@ACC[0].2d,#16
	umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
	add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
	ins @ACC[1].d[0],$ACCTemp.d[0]
	st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
	umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1 {@ACC[7].2d},[$tinptr],#16
	umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
	shl $Ni.2d,@ACC[0].2d,#16
	ext $Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
	add $Ni.2d,$Ni.2d,@ACC[0].2d
	umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
	mul $Ni.2s,$Ni.2s,$M0.2s
	umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
	st1 {$Bi.2s},[$bnptr],#8 // put aside smashed b[8*i+$i]
	umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl $Ni.4s,$Ni.4h
	umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
	umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1 {$A0.4s,$A1.4s},[$aptr],#32
	umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
	mov $Temp.16b,@ACC[0].16b
	ushr $Temp.2d,$Temp.2d,#16
	ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
	add @ACC[0].2d,@ACC[0].2d,$Temp.2d
	umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr @ACC[0].2d,@ACC[0].2d,#16
	eor $temp.16b,$temp.16b,$temp.16b
	ins @ACC[0].d[1],$temp.d[0]
	umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
	add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d
	st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i]
	add $bnptr,sp,#8 // rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub $inner,$num,#8
	b .LNEON_8n_inner

.align 4
.LNEON_8n_inner:
	subs $inner,$inner,#8
	umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1 {@ACC[7].2d},[$tinptr]
	umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0]
	umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
	ld1 {$N0.4s,$N1.4s},[$nptr],#32
	umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
	b.eq .LInner_jump
	add $tinptr,$tinptr,#16 // don't advance in last iteration
.LInner_jump:
	umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i]
	umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
	umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal @ACC[7].2d,$Ni.2s,$N1.s[3]
	st1 {@ACC[0].2d},[$toutptr],#16
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	umlal @ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1 {@ACC[7].2d},[$tinptr]
	umlal @ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i]
	umlal @ACC[2].2d,$Bi.2s,$A0.s[2]
	b.eq .LInner_jump$i
	add $tinptr,$tinptr,#16 // don't advance in last iteration
.LInner_jump$i:
	umlal @ACC[3].2d,$Bi.2s,$A0.s[3]
	umlal @ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal @ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal @ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal @ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	b.ne .LInner_after_rewind$i
	sub $aptr,$aptr,$num,lsl#2 // rewind
.LInner_after_rewind$i:
	umlal @ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0]
	umlal @ACC[1].2d,$Ni.2s,$N0.s[1]
	ld1 {$A0.4s,$A1.4s},[$aptr],#32
	umlal @ACC[2].2d,$Ni.2s,$N0.s[2]
	add $bnptr,sp,#8 // rewind
	umlal @ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal @ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal @ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal @ACC[6].2d,$Ni.2s,$N1.s[2]
	st1 {@ACC[0].2d},[$toutptr],#16
	umlal @ACC[7].2d,$Ni.2s,$N1.s[3]

	bne .LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add $tinptr,sp,#128
	st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	eor $N0.16b,$N0.16b,$N0.16b // $N0
	st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	eor $N1.16b,$N1.16b,$N1.16b // $N1
	st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1 {@ACC[6].2d},[$toutptr]

	subs $outer,$outer,#8
	ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
	ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
	ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
	ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

	b.eq .LInner_8n_jump_2steps
	sub $nptr,$nptr,$num,lsl#2 // rewind
	b .LNEON_8n_outer

.LInner_8n_jump_2steps:
	add $toutptr,sp,#128
	st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame
	mov $Temp.16b,@ACC[0].16b
	ushr $temp.2d,@ACC[0].2d,#16
	ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	st1 {$N0.2d,$N1.2d}, [sp],#32
	add @ACC[0].2d,@ACC[0].2d,$temp.2d
	st1 {$N0.2d,$N1.2d}, [sp],#32
	ushr $temp.2d,@ACC[0].2d,#16
	st1 {$N0.2d,$N1.2d}, [sp],#32
	zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins $temp.d[1],$zero.d[0]

	mov $inner,$num
	b .LNEON_tail_entry

.align 4
.LNEON_tail:
	add @ACC[0].2d,@ACC[0].2d,$temp.2d
	mov $Temp.16b,@ACC[0].16b
	ushr $temp.2d,@ACC[0].2d,#16
	ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
	add @ACC[0].2d,@ACC[0].2d,$temp.2d
	ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
	ushr $temp.2d,@ACC[0].2d,#16
	ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
	zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins $temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	add @ACC[1].2d,@ACC[1].2d,$temp.2d
	st1 {@ACC[0].s}[0], [$toutptr],#4
	ushr $temp.2d,@ACC[1].2d,#16
	mov $Temp.16b,@ACC[1].16b
	ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
	add @ACC[1].2d,@ACC[1].2d,$temp.2d
	ushr $temp.2d,@ACC[1].2d,#16
	zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h
	ins $temp.d[1],$zero.d[0]
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
	subs $inner,$inner,#8
	st1 {@ACC[7].s}[0], [$toutptr],#4
	bne .LNEON_tail

	st1 {$temp.s}[0], [$toutptr],#4 // top-most bit
	sub $nptr,$nptr,$num,lsl#2 // rewind $nptr
	subs $aptr,sp,#0 // clear carry flag
	add $bptr,sp,$num,lsl#2

.LNEON_sub:
	ldp w4,w5,[$aptr],#8
	ldp w6,w7,[$aptr],#8
	ldp w8,w9,[$nptr],#8
	ldp w10,w11,[$nptr],#8
	sbcs w8,w4,w8
	sbcs w9,w5,w9
	sbcs w10,w6,w10
	sbcs w11,w7,w11
	sub x17,$bptr,$aptr
	stp w8,w9,[$rptr],#8
	stp w10,w11,[$rptr],#8
	cbnz x17,.LNEON_sub

	ldr w10, [$aptr] // load top-most bit
	mov x11,sp
	eor v0.16b,v0.16b,v0.16b
	sub x11,$bptr,x11 // this is num*4
	eor v1.16b,v1.16b,v1.16b
	mov $aptr,sp
	sub $rptr,$rptr,x11 // rewind $rptr
	mov $nptr,$bptr // second 3/4th of frame
	sbcs w10,w10,wzr // result is carry flag

.LNEON_copy_n_zap:
	ldp w4,w5,[$aptr],#8
	ldp w6,w7,[$aptr],#8
	ldp w8,w9,[$rptr],#8
	ldp w10,w11,[$rptr]
	sub $rptr,$rptr,#8
	b.cs .LCopy_1
	mov w8,w4
	mov w9,w5
	mov w10,w6
	mov w11,w7
.LCopy_1:
	st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
	st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
	ldp w4,w5,[$aptr],#8
	ldp w6,w7,[$aptr],#8
	stp w8,w9,[$rptr],#8
	stp w10,w11,[$rptr],#8
	sub $aptr,$aptr,#32
	ldp w8,w9,[$rptr],#8
	ldp w10,w11,[$rptr]
	sub $rptr,$rptr,#8
	b.cs .LCopy_2
	mov w8, w4
	mov w9, w5
	mov w10, w6
	mov w11, w7
.LCopy_2:
	st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe
	st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe
	sub x17,$bptr,$aptr // preserves carry
	stp w8,w9,[$rptr],#8
	stp w10,w11,[$rptr],#8
	cbnz x17,.LNEON_copy_n_zap

	mov sp,x16
	ldp d14,d15,[sp,#64]
	ldp d12,d13,[sp,#48]
	ldp d10,d11,[sp,#32]
	ldp d8,d9,[sp,#16]
	ldr x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret // bx lr

.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
	cmp $ap,$bp
	b.ne __bn_mul4x_mont
.Lsqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	stp $rp,$np,[sp,#96] // offload rp and np

	ldp $a0,$a1,[$ap,#8*0]
	ldp $a2,$a3,[$ap,#8*2]
	ldp $a4,$a5,[$ap,#8*4]
	ldp $a6,$a7,[$ap,#8*6]

	sub $tp,sp,$num,lsl#4
	lsl $num,$num,#3
	ldr $n0,[$n0] // *n0
	mov sp,$tp // alloca
	sub $cnt,$num,#8*8
	b .Lsqr8x_zero_start

.Lsqr8x_zero:
	sub $cnt,$cnt,#8*8
	stp xzr,xzr,[$tp,#8*0]
	stp xzr,xzr,[$tp,#8*2]
	stp xzr,xzr,[$tp,#8*4]
	stp xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp xzr,xzr,[$tp,#8*8]
	stp xzr,xzr,[$tp,#8*10]
	stp xzr,xzr,[$tp,#8*12]
	stp xzr,xzr,[$tp,#8*14]
	add $tp,$tp,#8*16
	cbnz $cnt,.Lsqr8x_zero

	add $ap_end,$ap,$num
	add $ap,$ap,#8*8
	mov $acc0,xzr
	mov $acc1,xzr
	mov $acc2,xzr
	mov $acc3,xzr
	mov $acc4,xzr
	mov $acc5,xzr
	mov $acc6,xzr
	mov $acc7,xzr
	mov $tp,sp
	str $n0,[x29,#112] // offload n0

	// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]		(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]			(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]				(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]					(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]						(vi)
	// a[7]a[5]
	// a[7]a[6]							(vii)

	mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
	mul $t1,$a2,$a0
	mul $t2,$a3,$a0
	mul $t3,$a4,$a0
	adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
	mul $t0,$a5,$a0
	adcs $acc2,$acc2,$t1
	mul $t1,$a6,$a0
	adcs $acc3,$acc3,$t2
	mul $t2,$a7,$a0
	adcs $acc4,$acc4,$t3
	umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
	adcs $acc5,$acc5,$t0
	umulh $t0,$a2,$a0
	adcs $acc6,$acc6,$t1
	umulh $t1,$a3,$a0
	adcs $acc7,$acc7,$t2
	umulh $t2,$a4,$a0
	stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
	adc $acc0,xzr,xzr // t[8]
	adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
	umulh $t3,$a5,$a0
	adcs $acc3,$acc3,$t0
	umulh $t0,$a6,$a0
	adcs $acc4,$acc4,$t1
	umulh $t1,$a7,$a0
	adcs $acc5,$acc5,$t2
	mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
	adcs $acc6,$acc6,$t3
	mul $t3,$a3,$a1
	adcs $acc7,$acc7,$t0
	mul $t0,$a4,$a1
	adc $acc0,$acc0,$t1

	mul $t1,$a5,$a1
	adds $acc3,$acc3,$t2
	mul $t2,$a6,$a1
	adcs $acc4,$acc4,$t3
	mul $t3,$a7,$a1
	adcs $acc5,$acc5,$t0
	umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
	adcs $acc6,$acc6,$t1
	umulh $t1,$a3,$a1
	adcs $acc7,$acc7,$t2
	umulh $t2,$a4,$a1
	adcs $acc0,$acc0,$t3
	umulh $t3,$a5,$a1
	stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
	adc $acc1,xzr,xzr // t[9]
	adds $acc4,$acc4,$t0
	umulh $t0,$a6,$a1
	adcs $acc5,$acc5,$t1
	umulh $t1,$a7,$a1
	adcs $acc6,$acc6,$t2
	mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
	adcs $acc7,$acc7,$t3
	mul $t3,$a4,$a2
	adcs $acc0,$acc0,$t0
	mul $t0,$a5,$a2
	adc $acc1,$acc1,$t1

	mul $t1,$a6,$a2
	adds $acc5,$acc5,$t2
	mul $t2,$a7,$a2
	adcs $acc6,$acc6,$t3
	umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
	adcs $acc7,$acc7,$t0
	umulh $t0,$a4,$a2
	adcs $acc0,$acc0,$t1
	umulh $t1,$a5,$a2
	adcs $acc1,$acc1,$t2
	umulh $t2,$a6,$a2
	stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
	adc $acc2,xzr,xzr // t[10]
	adds $acc6,$acc6,$t3
	umulh $t3,$a7,$a2
	adcs $acc7,$acc7,$t0
	mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
	adcs $acc0,$acc0,$t1
	mul $t1,$a5,$a3
	adcs $acc1,$acc1,$t2
	mul $t2,$a6,$a3
	adc $acc2,$acc2,$t3

	mul $t3,$a7,$a3
	adds $acc7,$acc7,$t0
	umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
	adcs $acc0,$acc0,$t1
	umulh $t1,$a5,$a3
	adcs $acc1,$acc1,$t2
	umulh $t2,$a6,$a3
	adcs $acc2,$acc2,$t3
	umulh $t3,$a7,$a3
	stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
	adc $acc3,xzr,xzr // t[11]
	adds $acc0,$acc0,$t0
	mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
	adcs $acc1,$acc1,$t1
	mul $t1,$a6,$a4
	adcs $acc2,$acc2,$t2
	mul $t2,$a7,$a4
	adc $acc3,$acc3,$t3

	umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
	adds $acc1,$acc1,$t0
	umulh $t0,$a6,$a4
	adcs $acc2,$acc2,$t1
	umulh $t1,$a7,$a4
	adcs $acc3,$acc3,$t2
	mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
	adc $acc4,xzr,xzr // t[12]
	adds $acc2,$acc2,$t3
	mul $t3,$a7,$a5
	adcs $acc3,$acc3,$t0
	umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
	adc $acc4,$acc4,$t1

	umulh $t1,$a7,$a5
	adds $acc3,$acc3,$t2
	mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
	adcs $acc4,$acc4,$t3
	umulh $t3,$a7,$a6 // hi(a[7]*a[6])
	adc $acc5,xzr,xzr // t[13]
	adds $acc4,$acc4,$t0
	sub $cnt,$ap_end,$ap // done yet?
	adc $acc5,$acc5,$t1

	adds $acc5,$acc5,$t2
	sub $t0,$ap_end,$num // rewinded ap
	adc $acc6,xzr,xzr // t[14]
	add $acc6,$acc6,$t3

	cbz $cnt,.Lsqr8x_outer_break

	mov $n0,$a0
	ldp $a0,$a1,[$tp,#8*0]
	ldp $a2,$a3,[$tp,#8*2]
	ldp $a4,$a5,[$tp,#8*4]
	ldp $a6,$a7,[$tp,#8*6]
	adds $acc0,$acc0,$a0
	adcs $acc1,$acc1,$a1
	ldp $a0,$a1,[$ap,#8*0]
	adcs $acc2,$acc2,$a2
	adcs $acc3,$acc3,$a3
	ldp $a2,$a3,[$ap,#8*2]
	adcs $acc4,$acc4,$a4
	adcs $acc5,$acc5,$a5
	ldp $a4,$a5,[$ap,#8*4]
	adcs $acc6,$acc6,$a6
	mov $rp,$ap
	adcs $acc7,xzr,$a7
	ldp $a6,$a7,[$ap,#8*6]
	add $ap,$ap,#8*8
	//adc $carry,xzr,xzr // moved below
	mov $cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul $t0,$a0,$n0
	adc $carry,xzr,xzr // carry bit, modulo-scheduled
	mul $t1,$a1,$n0
	add $cnt,$cnt,#8
	mul $t2,$a2,$n0
	mul $t3,$a3,$n0
	adds $acc0,$acc0,$t0
	mul $t0,$a4,$n0
	adcs $acc1,$acc1,$t1
	mul $t1,$a5,$n0
	adcs $acc2,$acc2,$t2
	mul $t2,$a6,$n0
	adcs $acc3,$acc3,$t3
	mul $t3,$a7,$n0
	adcs $acc4,$acc4,$t0
	umulh $t0,$a0,$n0
	adcs $acc5,$acc5,$t1
	umulh $t1,$a1,$n0
	adcs $acc6,$acc6,$t2
	umulh $t2,$a2,$n0
	adcs $acc7,$acc7,$t3
	umulh $t3,$a3,$n0
	adc $carry,$carry,xzr
	str $acc0,[$tp],#8
	adds $acc0,$acc1,$t0
	umulh $t0,$a4,$n0
	adcs $acc1,$acc2,$t1
	umulh $t1,$a5,$n0
	adcs $acc2,$acc3,$t2
	umulh $t2,$a6,$n0
	adcs $acc3,$acc4,$t3
	umulh $t3,$a7,$n0
	ldr $n0,[$rp,$cnt]
	adcs $acc4,$acc5,$t0
	adcs $acc5,$acc6,$t1
	adcs $acc6,$acc7,$t2
	adcs $acc7,$carry,$t3
	//adc $carry,xzr,xzr // moved above
	cbnz $cnt,.Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp $ap,$ap_end // done yet?
	b.eq .Lsqr8x_break

	ldp $a0,$a1,[$tp,#8*0]
	ldp $a2,$a3,[$tp,#8*2]
	ldp $a4,$a5,[$tp,#8*4]
	ldp $a6,$a7,[$tp,#8*6]
	adds $acc0,$acc0,$a0
	ldur $n0,[$rp,#-8*8]
	adcs $acc1,$acc1,$a1
	ldp $a0,$a1,[$ap,#8*0]
	adcs $acc2,$acc2,$a2
	adcs $acc3,$acc3,$a3
	ldp $a2,$a3,[$ap,#8*2]
	adcs $acc4,$acc4,$a4
	adcs $acc5,$acc5,$a5
	ldp $a4,$a5,[$ap,#8*4]
	adcs $acc6,$acc6,$a6
	mov $cnt,#-8*8
	adcs $acc7,$acc7,$a7
	ldp $a6,$a7,[$ap,#8*6]
	add $ap,$ap,#8*8
	//adc $carry,xzr,xzr // moved above
	b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
	ldp $a0,$a1,[$rp,#8*0]
	add $ap,$rp,#8*8
	ldp $a2,$a3,[$rp,#8*2]
	sub $t0,$ap_end,$ap // is it last iteration?
	ldp $a4,$a5,[$rp,#8*4]
	sub $t1,$tp,$t0
	ldp $a6,$a7,[$rp,#8*6]
	cbz $t0,.Lsqr8x_outer_loop

	stp $acc0,$acc1,[$tp,#8*0]
	ldp $acc0,$acc1,[$t1,#8*0]
	stp $acc2,$acc3,[$tp,#8*2]
	ldp $acc2,$acc3,[$t1,#8*2]
	stp $acc4,$acc5,[$tp,#8*4]
	ldp $acc4,$acc5,[$t1,#8*4]
	stp $acc6,$acc7,[$tp,#8*6]
	mov $tp,$t1
	ldp $acc6,$acc7,[$t1,#8*6]
	b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
	ldp $t1,$t2,[sp,#8*1]
	ldp $a5,$a7,[$t0,#8*2]
	add $ap,$t0,#8*4
	ldp $t3,$t0,[sp,#8*3]

	stp $acc0,$acc1,[$tp,#8*0]
	mul $acc0,$a1,$a1
	stp $acc2,$acc3,[$tp,#8*2]
	umulh $a1,$a1,$a1
	stp $acc4,$acc5,[$tp,#8*4]
	mul $a2,$a3,$a3
	stp $acc6,$acc7,[$tp,#8*6]
	mov $tp,sp
	umulh $a3,$a3,$a3
	adds $acc1,$a1,$t1,lsl#1
	extr $t1,$t2,$t1,#63
	sub $cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs $acc2,$a2,$t1
	extr $t2,$t3,$t2,#63
	sub $cnt,$cnt,#8*4
	adcs $acc3,$a3,$t2
	ldp $t1,$t2,[$tp,#8*5]
	mul $a4,$a5,$a5
	ldp $a1,$a3,[$ap],#8*2
	umulh $a5,$a5,$a5
	mul $a6,$a7,$a7
	umulh $a7,$a7,$a7
	extr $t3,$t0,$t3,#63
	stp $acc0,$acc1,[$tp,#8*0]
	adcs $acc4,$a4,$t3
	extr $t0,$t1,$t0,#63
	stp $acc2,$acc3,[$tp,#8*2]
	adcs $acc5,$a5,$t0
	ldp $t3,$t0,[$tp,#8*7]
	extr $t1,$t2,$t1,#63
	adcs $acc6,$a6,$t1
	extr $t2,$t3,$t2,#63
	adcs $acc7,$a7,$t2
	ldp $t1,$t2,[$tp,#8*9]
	mul $a0,$a1,$a1
	ldp $a5,$a7,[$ap],#8*2
	umulh $a1,$a1,$a1
	mul $a2,$a3,$a3
	umulh $a3,$a3,$a3
	stp $acc4,$acc5,[$tp,#8*4]
	extr $t3,$t0,$t3,#63
	stp $acc6,$acc7,[$tp,#8*6]
	add $tp,$tp,#8*8
	adcs $acc0,$a0,$t3
	extr $t0,$t1,$t0,#63
	adcs $acc1,$a1,$t0
	ldp $t3,$t0,[$tp,#8*3]
	extr $t1,$t2,$t1,#63
	cbnz $cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp $np,$n0,[x29,#104] // pull np and n0

	adcs $acc2,$a2,$t1
	extr $t2,$t3,$t2,#63
	adcs $acc3,$a3,$t2
	ldp $t1,$t2,[$tp,#8*5]
	mul $a4,$a5,$a5
	umulh $a5,$a5,$a5
	stp $acc0,$acc1,[$tp,#8*0]
	mul $a6,$a7,$a7
	umulh $a7,$a7,$a7
	stp $acc2,$acc3,[$tp,#8*2]
	extr $t3,$t0,$t3,#63
	adcs $acc4,$a4,$t3
	extr $t0,$t1,$t0,#63
	ldp $acc0,$acc1,[sp,#8*0]
	adcs $acc5,$a5,$t0
	extr $t1,$t2,$t1,#63
	ldp $a0,$a1,[$np,#8*0]
	adcs $acc6,$a6,$t1
	extr $t2,xzr,$t2,#63
	ldp $a2,$a3,[$np,#8*2]
	adc $acc7,$a7,$t2
	ldp $a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul $na0,$n0,$acc0 // t[0]*n0
	ldp $a6,$a7,[$np,#8*6]
	add $np_end,$np,$num
	ldp $acc2,$acc3,[sp,#8*2]
	stp $acc4,$acc5,[$tp,#8*4]
	ldp $acc4,$acc5,[sp,#8*4]
	stp $acc6,$acc7,[$tp,#8*6]
	ldp $acc6,$acc7,[sp,#8*6]
	add $np,$np,#8*8
	mov $topmost,xzr // initial top-most carry
	mov $tp,sp
	mov $cnt,#8

.Lsqr8x_reduction:
	// (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
	mul $t1,$a1,$na0
	sub $cnt,$cnt,#1
	mul $t2,$a2,$na0
	str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
	mul $t3,$a3,$na0
	// (*) adds xzr,$acc0,$t0
	subs xzr,$acc0,#1 // (*)
	mul $t0,$a4,$na0
	adcs $acc0,$acc1,$t1
	mul $t1,$a5,$na0
	adcs $acc1,$acc2,$t2
	mul $t2,$a6,$na0
	adcs $acc2,$acc3,$t3
	mul $t3,$a7,$na0
	adcs $acc3,$acc4,$t0
	umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
	adcs $acc4,$acc5,$t1
	umulh $t1,$a1,$na0
	adcs $acc5,$acc6,$t2
	umulh $t2,$a2,$na0
	adcs $acc6,$acc7,$t3
	umulh $t3,$a3,$na0
	adc $acc7,xzr,xzr
	adds $acc0,$acc0,$t0
	umulh $t0,$a4,$na0
	adcs $acc1,$acc1,$t1
	umulh $t1,$a5,$na0
	adcs $acc2,$acc2,$t2
	umulh $t2,$a6,$na0
	adcs $acc3,$acc3,$t3
	umulh $t3,$a7,$na0
	mul $na0,$n0,$acc0 // next t[0]*n0
	adcs $acc4,$acc4,$t0
	adcs $acc5,$acc5,$t1
	adcs $acc6,$acc6,$t2
	adc $acc7,$acc7,$t3
	cbnz $cnt,.Lsqr8x_reduction

	ldp $t0,$t1,[$tp,#8*0]
	ldp $t2,$t3,[$tp,#8*2]
	mov $rp,$tp
	sub $cnt,$np_end,$np // done yet?
	adds $acc0,$acc0,$t0
	adcs $acc1,$acc1,$t1
	ldp $t0,$t1,[$tp,#8*4]
	adcs $acc2,$acc2,$t2
	adcs $acc3,$acc3,$t3
	ldp $t2,$t3,[$tp,#8*6]
	adcs $acc4,$acc4,$t0
	adcs $acc5,$acc5,$t1
	adcs $acc6,$acc6,$t2
	adcs $acc7,$acc7,$t3
	//adc $carry,xzr,xzr // moved below
	cbz $cnt,.Lsqr8x8_post_condition

	ldur $n0,[$tp,#-8*8]
	ldp $a0,$a1,[$np,#8*0]
	ldp $a2,$a3,[$np,#8*2]
	ldp $a4,$a5,[$np,#8*4]
	mov $cnt,#-8*8
	ldp $a6,$a7,[$np,#8*6]
	add $np,$np,#8*8

.Lsqr8x_tail:
	mul $t0,$a0,$n0
	adc $carry,xzr,xzr // carry bit, modulo-scheduled
	mul $t1,$a1,$n0
	add $cnt,$cnt,#8
	mul $t2,$a2,$n0
	mul $t3,$a3,$n0
	adds $acc0,$acc0,$t0
	mul $t0,$a4,$n0
	adcs $acc1,$acc1,$t1
	mul $t1,$a5,$n0
	adcs $acc2,$acc2,$t2
	mul $t2,$a6,$n0
	adcs $acc3,$acc3,$t3
	mul $t3,$a7,$n0
	adcs $acc4,$acc4,$t0
	umulh $t0,$a0,$n0
	adcs $acc5,$acc5,$t1
	umulh $t1,$a1,$n0
	adcs $acc6,$acc6,$t2
	umulh $t2,$a2,$n0
	adcs $acc7,$acc7,$t3
	umulh $t3,$a3,$n0
	adc $carry,$carry,xzr
	str $acc0,[$tp],#8
	adds $acc0,$acc1,$t0
	umulh $t0,$a4,$n0
	adcs $acc1,$acc2,$t1
	umulh $t1,$a5,$n0
	adcs $acc2,$acc3,$t2
	umulh $t2,$a6,$n0
	adcs $acc3,$acc4,$t3
	umulh $t3,$a7,$n0
	ldr $n0,[$rp,$cnt]
	adcs $acc4,$acc5,$t0
	adcs $acc5,$acc6,$t1
	adcs $acc6,$acc7,$t2
	adcs $acc7,$carry,$t3
	//adc $carry,xzr,xzr // moved above
	cbnz $cnt,.Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp $a0,$a1,[$tp,#8*0]
	sub $cnt,$np_end,$np // done yet?
	sub $t2,$np_end,$num // rewinded np
	ldp $a2,$a3,[$tp,#8*2]
	ldp $a4,$a5,[$tp,#8*4]
	ldp $a6,$a7,[$tp,#8*6]
	cbz $cnt,.Lsqr8x_tail_break

	ldur $n0,[$rp,#-8*8]
	adds $acc0,$acc0,$a0
	adcs $acc1,$acc1,$a1
	ldp $a0,$a1,[$np,#8*0]
	adcs $acc2,$acc2,$a2
	adcs $acc3,$acc3,$a3
	ldp $a2,$a3,[$np,#8*2]
	adcs $acc4,$acc4,$a4
	adcs $acc5,$acc5,$a5
	ldp $a4,$a5,[$np,#8*4]
	adcs $acc6,$acc6,$a6
	mov $cnt,#-8*8
	adcs $acc7,$acc7,$a7
	ldp $a6,$a7,[$np,#8*6]
	add $np,$np,#8*8
	//adc $carry,xzr,xzr // moved above
	b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
	ldr $n0,[x29,#112] // pull n0
	add $cnt,$tp,#8*8 // end of current t[num] window

	subs xzr,$topmost,#1 // "move" top-most carry to carry bit
	adcs $t0,$acc0,$a0
	adcs $t1,$acc1,$a1
	ldp $acc0,$acc1,[$rp,#8*0]
	adcs $acc2,$acc2,$a2
	ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
	adcs $acc3,$acc3,$a3
	ldp $a2,$a3,[$t2,#8*2]
	adcs $acc4,$acc4,$a4
	adcs $acc5,$acc5,$a5
	ldp $a4,$a5,[$t2,#8*4]
	adcs $acc6,$acc6,$a6
	adcs $acc7,$acc7,$a7
	ldp $a6,$a7,[$t2,#8*6]
	add $np,$t2,#8*8
	adc $topmost,xzr,xzr // top-most carry
	mul $na0,$n0,$acc0
	stp $t0,$t1,[$tp,#8*0]
	stp $acc2,$acc3,[$tp,#8*2]
	ldp $acc2,$acc3,[$rp,#8*2]
	stp $acc4,$acc5,[$tp,#8*4]
	ldp $acc4,$acc5,[$rp,#8*4]
	cmp $cnt,x29 // did we hit the bottom?
	stp $acc6,$acc7,[$tp,#8*6]
	mov $tp,$rp // slide the window
	ldp $acc6,$acc7,[$rp,#8*6]
	mov $cnt,#8
	b.ne .Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr $rp,[x29,#96] // pull rp
	add $tp,$tp,#8*8
	subs $t0,$acc0,$a0
	sbcs $t1,$acc1,$a1
	sub $cnt,$num,#8*8
	mov $ap_end,$rp // $rp copy

.Lsqr8x_sub:
	sbcs $t2,$acc2,$a2
	ldp $a0,$a1,[$np,#8*0]
	sbcs $t3,$acc3,$a3
	stp $t0,$t1,[$rp,#8*0]
	sbcs $t0,$acc4,$a4
	ldp $a2,$a3,[$np,#8*2]
	sbcs $t1,$acc5,$a5
	stp $t2,$t3,[$rp,#8*2]
	sbcs $t2,$acc6,$a6
	ldp $a4,$a5,[$np,#8*4]
	sbcs $t3,$acc7,$a7
	ldp $a6,$a7,[$np,#8*6]
	add $np,$np,#8*8
	ldp $acc0,$acc1,[$tp,#8*0]
	sub $cnt,$cnt,#8*8
	ldp $acc2,$acc3,[$tp,#8*2]
	ldp $acc4,$acc5,[$tp,#8*4]
	ldp $acc6,$acc7,[$tp,#8*6]
	add $tp,$tp,#8*8
	stp $t0,$t1,[$rp,#8*4]
	sbcs $t0,$acc0,$a0
	stp $t2,$t3,[$rp,#8*6]
	add $rp,$rp,#8*8
	sbcs $t1,$acc1,$a1
	cbnz $cnt,.Lsqr8x_sub

	sbcs $t2,$acc2,$a2
	mov $tp,sp
	add $ap,sp,$num
	ldp $a0,$a1,[$ap_end,#8*0]
	sbcs $t3,$acc3,$a3
	stp $t0,$t1,[$rp,#8*0]
	sbcs $t0,$acc4,$a4
	ldp $a2,$a3,[$ap_end,#8*2]
	sbcs $t1,$acc5,$a5
	stp $t2,$t3,[$rp,#8*2]
	sbcs $t2,$acc6,$a6
	ldp $acc0,$acc1,[$ap,#8*0]
	sbcs $t3,$acc7,$a7
	ldp $acc2,$acc3,[$ap,#8*2]
	sbcs xzr,$topmost,xzr // did it borrow?
	ldr x30,[x29,#8] // pull return address
	stp $t0,$t1,[$rp,#8*4]
	stp $t2,$t3,[$rp,#8*6]

	sub $cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub $cnt,$cnt,#8*4
	csel $t0,$acc0,$a0,lo
	stp xzr,xzr,[$tp,#8*0]
	csel $t1,$acc1,$a1,lo
	ldp $a0,$a1,[$ap_end,#8*4]
	ldp $acc0,$acc1,[$ap,#8*4]
	csel $t2,$acc2,$a2,lo
	stp xzr,xzr,[$tp,#8*2]
	add $tp,$tp,#8*4
	csel $t3,$acc3,$a3,lo
	ldp $a2,$a3,[$ap_end,#8*6]
	ldp $acc2,$acc3,[$ap,#8*6]
	add $ap,$ap,#8*4
	stp $t0,$t1,[$ap_end,#8*0]
	stp $t2,$t3,[$ap_end,#8*2]
	add $ap_end,$ap_end,#8*4
	stp xzr,xzr,[$ap,#8*0]
	stp xzr,xzr,[$ap,#8*2]
	cbnz $cnt,.Lsqr4x_cond_copy

	csel $t0,$acc0,$a0,lo
	stp xzr,xzr,[$tp,#8*0]
	csel $t1,$acc1,$a1,lo
	stp xzr,xzr,[$tp,#8*2]
	csel $t2,$acc2,$a2,lo
	csel $t3,$acc3,$a3,lo
	stp $t0,$t1,[$ap_end,#8*0]
	stp $t2,$t3,[$ap_end,#8*2]

	b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
	adc $carry,xzr,xzr
	ldr x30,[x29,#8] // pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs $a0,$acc0,$a0
	ldr $ap,[x29,#96] // pull rp
	sbcs $a1,$acc1,$a1
	stp xzr,xzr,[sp,#8*0]
	sbcs $a2,$acc2,$a2
	stp xzr,xzr,[sp,#8*2]
	sbcs $a3,$acc3,$a3
	stp xzr,xzr,[sp,#8*4]
	sbcs $a4,$acc4,$a4
	stp xzr,xzr,[sp,#8*6]
	sbcs $a5,$acc5,$a5
	stp xzr,xzr,[sp,#8*8]
	sbcs $a6,$acc6,$a6
	stp xzr,xzr,[sp,#8*10]
	sbcs $a7,$acc7,$a7
	stp xzr,xzr,[sp,#8*12]
	sbcs $carry,$carry,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel $a0,$acc0,$a0,lo
	csel $a1,$acc1,$a1,lo
	csel $a2,$acc2,$a2,lo
	csel $a3,$acc3,$a3,lo
	stp $a0,$a1,[$ap,#8*0]
	csel $a4,$acc4,$a4,lo
	csel $a5,$acc5,$a5,lo
	stp $a2,$a3,[$ap,#8*2]
	csel $a6,$acc6,$a6,lo
	csel $a7,$acc7,$a7,lo
	stp $a4,$a5,[$ap,#8*4]
	stp $a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont from
# the x86_64-mont5 module, it's different in the sense that it performs
# reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]

	sub $tp,sp,$num,lsl#3
	lsl $num,$num,#3
	ldr $n0,[$n0] // *n0
	sub sp,$tp,#8*4 // alloca

	add $t0,$bp,$num
	add $ap_end,$ap,$num
	stp $rp,$t0,[x29,#96] // offload rp and &b[num]

	ldr $bi,[$bp,#8*0] // b[0]
	ldp $a0,$a1,[$ap,#8*0] // a[0..3]
	ldp $a2,$a3,[$ap,#8*2]
	add $ap,$ap,#8*4
	mov $acc0,xzr
	mov $acc1,xzr
	mov $acc2,xzr
	mov $acc3,xzr
	ldp $m0,$m1,[$np,#8*0] // n[0..3]
	ldp $m2,$m3,[$np,#8*2]
	adds $np,$np,#8*4 // clear carry bit
	mov $carry,xzr
	mov $cnt,#0
	mov $tp,sp

.Loop_mul4x_1st_reduction:
	mul $t0,$a0,$bi // lo(a[0..3]*b[0])
	adc $carry,$carry,xzr // modulo-scheduled
	mul $t1,$a1,$bi
	add $cnt,$cnt,#8
	mul $t2,$a2,$bi
	and $cnt,$cnt,#31
	mul $t3,$a3,$bi
	adds $acc0,$acc0,$t0
	umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
	adcs $acc1,$acc1,$t1
	mul $mi,$acc0,$n0 // t[0]*n0
	adcs $acc2,$acc2,$t2
	umulh $t1,$a1,$bi
	adcs $acc3,$acc3,$t3
	umulh $t2,$a2,$bi
	adc $acc4,xzr,xzr
	umulh $t3,$a3,$bi
	ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
	adds $acc1,$acc1,$t0
	// (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
	str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
	adcs $acc2,$acc2,$t1
	mul $t1,$m1,$mi
	adcs $acc3,$acc3,$t2
	mul $t2,$m2,$mi
	adc $acc4,$acc4,$t3 // can't overflow
	mul $t3,$m3,$mi
	// (*) adds xzr,$acc0,$t0
	subs xzr,$acc0,#1 // (*)
	umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
	adcs $acc0,$acc1,$t1
	umulh $t1,$m1,$mi
	adcs $acc1,$acc2,$t2
	umulh $t2,$m2,$mi
	adcs $acc2,$acc3,$t3
	umulh $t3,$m3,$mi
	adcs $acc3,$acc4,$carry
	adc $carry,xzr,xzr
	adds $acc0,$acc0,$t0
	sub $t0,$ap_end,$ap
	adcs $acc1,$acc1,$t1
	adcs $acc2,$acc2,$t2
	adcs $acc3,$acc3,$t3
	//adc $carry,$carry,xzr
	cbnz $cnt,.Loop_mul4x_1st_reduction

	cbz $t0,.Lmul4x4_post_condition

	ldp $a0,$a1,[$ap,#8*0] // a[4..7]
	ldp $a2,$a3,[$ap,#8*2]
	add $ap,$ap,#8*4
	ldr $mi,[sp] // a[0]*n0
	ldp $m0,$m1,[$np,#8*0] // n[4..7]
	ldp $m2,$m3,[$np,#8*2]
	add $np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul $t0,$a0,$bi // lo(a[4..7]*b[i])
	adc $carry,$carry,xzr // modulo-scheduled
	mul $t1,$a1,$bi
	add $cnt,$cnt,#8
	mul $t2,$a2,$bi
	and $cnt,$cnt,#31
	mul $t3,$a3,$bi
	adds $acc0,$acc0,$t0
	umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
	adcs $acc1,$acc1,$t1
	umulh $t1,$a1,$bi
	adcs $acc2,$acc2,$t2
	umulh $t2,$a2,$bi
	adcs $acc3,$acc3,$t3
	umulh $t3,$a3,$bi
	adc $acc4,xzr,xzr
	ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
	adds $acc1,$acc1,$t0
	mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
	adcs $acc2,$acc2,$t1
	mul $t1,$m1,$mi
	adcs $acc3,$acc3,$t2
	mul $t2,$m2,$mi
	adc $acc4,$acc4,$t3 // can't overflow
	mul $t3,$m3,$mi
	adds $acc0,$acc0,$t0
	umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
	adcs $acc1,$acc1,$t1
	umulh $t1,$m1,$mi
	adcs $acc2,$acc2,$t2
	umulh $t2,$m2,$mi
	adcs $acc3,$acc3,$t3
	adcs $acc4,$acc4,$carry
	umulh $t3,$m3,$mi
	adc $carry,xzr,xzr
	ldr $mi,[sp,$cnt] // next t[0]*n0
	str $acc0,[$tp],#8 // result!!!
	adds $acc0,$acc1,$t0
	sub $t0,$ap_end,$ap // done yet?
	adcs $acc1,$acc2,$t1
	adcs $acc2,$acc3,$t2
	adcs $acc3,$acc4,$t3
	//adc $carry,$carry,xzr
	cbnz $cnt,.Loop_mul4x_1st_tail

	sub $t1,$ap_end,$num // rewinded $ap
	cbz $t0,.Lmul4x_proceed

	ldp $a0,$a1,[$ap,#8*0]
	ldp $a2,$a3,[$ap,#8*2]
	add $ap,$ap,#8*4
	ldp $m0,$m1,[$np,#8*0]
	ldp $m2,$m3,[$np,#8*2]
	add $np,$np,#8*4
	b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
	ldr $bi,[$bp,#8*4]! // *++b
	adc $topmost,$carry,xzr
	ldp $a0,$a1,[$t1,#8*0] // a[0..3]
	sub $np,$np,$num // rewind np
	ldp $a2,$a3,[$t1,#8*2]
	add $ap,$t1,#8*4

	stp $acc0,$acc1,[$tp,#8*0] // result!!!
	ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
	stp $acc2,$acc3,[$tp,#8*2] // result!!!
	ldp $acc2,$acc3,[sp,#8*6]

	ldp $m0,$m1,[$np,#8*0] // n[0..3]
	mov $tp,sp
	ldp $m2,$m3,[$np,#8*2]
	adds $np,$np,#8*4 // clear carry bit
	mov $carry,xzr

.align 4
.Loop_mul4x_reduction:
	mul $t0,$a0,$bi // lo(a[0..3]*b[4])
	adc $carry,$carry,xzr // modulo-scheduled
	mul $t1,$a1,$bi
	add $cnt,$cnt,#8
	mul $t2,$a2,$bi
	and $cnt,$cnt,#31
	mul $t3,$a3,$bi
	adds $acc0,$acc0,$t0
	umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
	adcs $acc1,$acc1,$t1
	mul $mi,$acc0,$n0 // t[0]*n0
	adcs $acc2,$acc2,$t2
	umulh $t1,$a1,$bi
	adcs $acc3,$acc3,$t3
	umulh $t2,$a2,$bi
	adc $acc4,xzr,xzr
	umulh $t3,$a3,$bi
	ldr $bi,[$bp,$cnt] // next b[i]
	adds $acc1,$acc1,$t0
	// (*) mul $t0,$m0,$mi
	str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
	adcs $acc2,$acc2,$t1
	mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
	adcs $acc3,$acc3,$t2
	mul $t2,$m2,$mi
	adc $acc4,$acc4,$t3 // can't overflow
	mul $t3,$m3,$mi
	// (*) adds xzr,$acc0,$t0
	subs xzr,$acc0,#1 // (*)
	umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
	adcs $acc0,$acc1,$t1
	umulh $t1,$m1,$mi
	adcs $acc1,$acc2,$t2
	umulh $t2,$m2,$mi
	adcs $acc2,$acc3,$t3
	umulh $t3,$m3,$mi
	adcs $acc3,$acc4,$carry
	adc $carry,xzr,xzr
	adds $acc0,$acc0,$t0
	adcs $acc1,$acc1,$t1
	adcs $acc2,$acc2,$t2
	adcs $acc3,$acc3,$t3
	//adc $carry,$carry,xzr
	cbnz $cnt,.Loop_mul4x_reduction

	adc $carry,$carry,xzr
	ldp $t0,$t1,[$tp,#8*4] // t[4..7]
	ldp $t2,$t3,[$tp,#8*6]
	ldp $a0,$a1,[$ap,#8*0] // a[4..7]
	ldp $a2,$a3,[$ap,#8*2]
	add $ap,$ap,#8*4
	adds $acc0,$acc0,$t0
	adcs $acc1,$acc1,$t1
	adcs $acc2,$acc2,$t2
	adcs $acc3,$acc3,$t3
	//adc $carry,$carry,xzr

	ldr $mi,[sp] // t[0]*n0
	ldp $m0,$m1,[$np,#8*0] // n[4..7]
	ldp $m2,$m3,[$np,#8*2]
	add $np,$np,#8*4

.align 4
.Loop_mul4x_tail:
	mul $t0,$a0,$bi // lo(a[4..7]*b[4])
	adc $carry,$carry,xzr // modulo-scheduled
	mul $t1,$a1,$bi
	add $cnt,$cnt,#8
	mul $t2,$a2,$bi
	and $cnt,$cnt,#31
	mul $t3,$a3,$bi
	adds $acc0,$acc0,$t0
	umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
	adcs $acc1,$acc1,$t1
	umulh $t1,$a1,$bi
	adcs $acc2,$acc2,$t2
	umulh $t2,$a2,$bi
	adcs $acc3,$acc3,$t3
	umulh $t3,$a3,$bi
	adc $acc4,xzr,xzr
	ldr $bi,[$bp,$cnt] // next b[i]
	adds $acc1,$acc1,$t0
	mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
	adcs $acc2,$acc2,$t1
	mul $t1,$m1,$mi
	adcs $acc3,$acc3,$t2
	mul $t2,$m2,$mi
	adc $acc4,$acc4,$t3 // can't overflow
	mul $t3,$m3,$mi
	adds $acc0,$acc0,$t0
	umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
	adcs $acc1,$acc1,$t1
	umulh $t1,$m1,$mi
	adcs $acc2,$acc2,$t2
	umulh $t2,$m2,$mi
	adcs $acc3,$acc3,$t3
	umulh $t3,$m3,$mi
	adcs $acc4,$acc4,$carry
	ldr $mi,[sp,$cnt] // next a[0]*n0
	adc $carry,xzr,xzr
	str $acc0,[$tp],#8 // result!!!
	adds $acc0,$acc1,$t0
	sub $t0,$ap_end,$ap // done yet?
	adcs $acc1,$acc2,$t1
	adcs $acc2,$acc3,$t2
	adcs $acc3,$acc4,$t3
	//adc $carry,$carry,xzr
	cbnz $cnt,.Loop_mul4x_tail

	sub $t1,$np,$num // rewinded np?
	adc $carry,$carry,xzr
	cbz $t0,.Loop_mul4x_break

	ldp $t0,$t1,[$tp,#8*4]
	ldp $t2,$t3,[$tp,#8*6]
	ldp $a0,$a1,[$ap,#8*0]
	ldp $a2,$a3,[$ap,#8*2]
	add $ap,$ap,#8*4
	adds $acc0,$acc0,$t0
	adcs $acc1,$acc1,$t1
	adcs $acc2,$acc2,$t2
	adcs $acc3,$acc3,$t3
	//adc $carry,$carry,xzr
	ldp $m0,$m1,[$np,#8*0]
	ldp $m2,$m3,[$np,#8*2]
	add $np,$np,#8*4
	b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
	ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
	adds $acc0,$acc0,$topmost
	add $bp,$bp,#8*4 // bp++
	adcs $acc1,$acc1,xzr
	sub $ap,$ap,$num // rewind ap
	adcs $acc2,$acc2,xzr
	stp $acc0,$acc1,[$tp,#8*0] // result!!!
	adcs $acc3,$acc3,xzr
	ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
	adc $topmost,$carry,xzr
	stp $acc2,$acc3,[$tp,#8*2] // result!!!
	cmp $bp,$t3 // done yet?
	ldp $acc2,$acc3,[sp,#8*6]
	ldp $m0,$m1,[$t1,#8*0] // n[0..3]
	ldp $m2,$m3,[$t1,#8*2]
	add $np,$t1,#8*4
	b.eq .Lmul4x_post

	ldr $bi,[$bp]
	ldp $a0,$a1,[$ap,#8*0] // a[0..3]
	ldp $a2,$a3,[$ap,#8*2]
	adds $ap,$ap,#8*4 // clear carry bit
	mov $carry,xzr
	mov $tp,sp
	b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov $rp,$t2
	mov $ap_end,$t2 // $rp copy
	subs $t0,$acc0,$m0
	add $tp,sp,#8*8
	sbcs $t1,$acc1,$m1
	sub $cnt,$num,#8*4

.Lmul4x_sub:
	sbcs $t2,$acc2,$m2
	ldp $m0,$m1,[$np,#8*0]
	sub $cnt,$cnt,#8*4
	ldp $acc0,$acc1,[$tp,#8*0]
	sbcs $t3,$acc3,$m3
	ldp $m2,$m3,[$np,#8*2]
	add $np,$np,#8*4
	ldp $acc2,$acc3,[$tp,#8*2]
	add $tp,$tp,#8*4
	stp $t0,$t1,[$rp,#8*0]
	sbcs $t0,$acc0,$m0
	stp $t2,$t3,[$rp,#8*2]
	add $rp,$rp,#8*4
	sbcs $t1,$acc1,$m1
	cbnz $cnt,.Lmul4x_sub

	sbcs $t2,$acc2,$m2
	mov $tp,sp
	add $ap,sp,#8*4
	ldp $a0,$a1,[$ap_end,#8*0]
	sbcs $t3,$acc3,$m3
	stp $t0,$t1,[$rp,#8*0]
	ldp $a2,$a3,[$ap_end,#8*2]
	stp $t2,$t3,[$rp,#8*2]
	ldp $acc0,$acc1,[$ap,#8*0]
	ldp $acc2,$acc3,[$ap,#8*2]
	sbcs xzr,$topmost,xzr // did it borrow?
	ldr x30,[x29,#8] // pull return address

	sub $cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub $cnt,$cnt,#8*4
	csel $t0,$acc0,$a0,lo
	stp xzr,xzr,[$tp,#8*0]
	csel $t1,$acc1,$a1,lo
	ldp $a0,$a1,[$ap_end,#8*4]
	ldp $acc0,$acc1,[$ap,#8*4]
	csel $t2,$acc2,$a2,lo
	stp xzr,xzr,[$tp,#8*2]
	add $tp,$tp,#8*4
	csel $t3,$acc3,$a3,lo
	ldp $a2,$a3,[$ap_end,#8*6]
	ldp $acc2,$acc3,[$ap,#8*6]
	add $ap,$ap,#8*4
	stp $t0,$t1,[$ap_end,#8*0]
	stp $t2,$t3,[$ap_end,#8*2]
	add $ap_end,$ap_end,#8*4
	cbnz $cnt,.Lmul4x_cond_copy

	csel $t0,$acc0,$a0,lo
	stp xzr,xzr,[$tp,#8*0]
	csel $t1,$acc1,$a1,lo
	stp xzr,xzr,[$tp,#8*2]
	csel $t2,$acc2,$a2,lo
	stp xzr,xzr,[$tp,#8*3]
	csel $t3,$acc3,$a3,lo
	stp xzr,xzr,[$tp,#8*4]
	stp $t0,$t1,[$ap_end,#8*0]
	stp $t2,$t3,[$ap_end,#8*2]

	b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
	adc $carry,$carry,xzr
	ldr $ap,[x29,#96] // pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs $a0,$acc0,$m0
	ldr x30,[x29,#8] // pull return address
	sbcs $a1,$acc1,$m1
	stp xzr,xzr,[sp,#8*0]
	sbcs $a2,$acc2,$m2
	stp xzr,xzr,[sp,#8*2]
	sbcs $a3,$acc3,$m3
	stp xzr,xzr,[sp,#8*4]
	sbcs xzr,$carry,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel $a0,$acc0,$a0,lo
	csel $a1,$acc1,$a1,lo
	csel $a2,$acc2,$a2,lo
	csel $a3,$acc3,$a3,lo
	stp $a0,$a1,[$ap,#8*0]
	stp $a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.rodata
.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";