#! /usr/bin/env perl
# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.
#
# April 2015
#
# Squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 improvement
# is still modest on longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

$code.=<<___;
#include "arm_arch.h"
#ifndef	__KERNEL__
.extern OPENSSL_armv8_rsa_neonized
.hidden OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	$num,#3
	b.ne	.Lmul_mont
	cmp	$num,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	$lo0 being non-zero. So that carry can be calculated
	//	by adding -1 to $lo0. That's what next instruction does.
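	//	(Editorial sketch of the algebra, not part of the original
	//	code: with the usual Montgomery setup, n0 == -np[0]^-1 mod
	//	2^64 and m1 == lo0*n0 mod 2^64, so lo(np[0]*m1) == -lo0 mod
	//	2^64. The discarded sum lo0 + lo(np[0]*m1) is therefore
	//	2^64 when lo0 != 0, i.e. it carries with a zero low half,
	//	and 0 with no carry when lo0 == 0. The "subs xzr,$lo0,#1"
	//	below sets the carry flag in exactly the same cases by
	//	computing lo0-1 and discarding the result.)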
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));

$code.=<<___;
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	$num,$num,#1
	eor	$zero.16b,$zero.16b,$zero.16b

.align	4
.LNEON_8n:
	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
	sub	$toutptr,sp,#128
	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
	sub	$toutptr,$toutptr,$num,lsl#4
	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
	and	$toutptr,$toutptr,#-64
	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
	mov	sp,$toutptr		// alloca
	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
	add	$toutptr,$toutptr,#256
	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
	sub	$inner,$num,#8
	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
	bne	.LNEON_8n_init

	add	$tinptr,sp,#256
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	add	$bnptr,sp,#8
	ldr	$sM0,[$n0],#4
	mov	$outer,$num
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	$sBi,[$bptr],#4		// *b++
	uxtl	$Bi.4s,$Bi.4h
	add	$toutptr,sp,#128
	ld1	{$N0.4s,$N1.4s},[$nptr],#32

	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=0; $i<7;) {
$code.=<<___;
	ldr	$sBi,[$bptr],#4		// *b++
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	uxtl	$Bi.4s,$Bi.4h
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	ushr	$temp.2d,@ACC[0].2d,#16
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
	ins	@ACC[1].d[0],$ACCTemp.d[0]
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr],#16
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	mov	$Temp.16b,@ACC[0].16b
	ushr	$Temp.2d,$Temp.2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	eor	$temp.16b,$temp.16b,$temp.16b
	ins	@ACC[0].d[1],$temp.d[0]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
	add	$bnptr,sp,#8		// rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub	$inner,$num,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	$inner,$inner,#8
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	ld1	{$N0.4s,$N1.4s},[$nptr],#32
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	b.eq	.LInner_jump
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump:
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	st1	{@ACC[0].2d},[$toutptr],#16
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	b.eq	.LInner_jump$i
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump$i:
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	b.ne	.LInner_after_rewind$i
	sub	$aptr,$aptr,$num,lsl#2	// rewind
.LInner_after_rewind$i:
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	add	$bnptr,sp,#8		// rewind
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	st1	{@ACC[0].2d},[$toutptr],#16
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]

	bne	.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add	$tinptr,sp,#128
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d},[$toutptr]

	subs	$outer,$outer,#8
	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

	b.eq	.LInner_8n_jump_2steps
	sub	$nptr,$nptr,$num,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	$toutptr,sp,#128
	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	st1	{$N0.2d,$N1.2d}, [sp],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	st1	{$N0.2d,$N1.2d}, [sp],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	st1	{$N0.2d,$N1.2d}, [sp],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

	mov	$inner,$num
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	st1	{@ACC[0].s}[0], [$toutptr],#4
	ushr	$temp.2d,@ACC[1].2d,#16
	mov	$Temp.16b,@ACC[1].16b
	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	ushr	$temp.2d,@ACC[1].2d,#16
	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
	ins	$temp.d[1],$zero.d[0]
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[7].s}[0], [$toutptr],#4
	bne	.LNEON_tail

	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
	subs	$aptr,sp,#0			// clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$nptr],#8
	ldp	w10,w11,[$nptr],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,$bptr,$aptr
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [$aptr]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,$bptr,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	$aptr,sp
	sub	$rptr,$rptr,x11		// rewind $rptr
	mov	$nptr,$bptr		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	sub	$aptr,$aptr,#32
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [$aptr],#32	// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	sub	x17,$bptr,$aptr		// preserves carry
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret			// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewinded ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldur	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldur	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewinded np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldur	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
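	// (Editorial note: a borrow from this full-length subtraction leaves
	// the carry flag clear, so the "lo" condition on the csel
	// instructions below selects the original, unsubtracted value
	// exactly when the result was already smaller than the modulus.)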
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewinded $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewinded np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";