1#! /usr/bin/env perl 2# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# April 2006 18 19# "Teaser" Montgomery multiplication module for PowerPC. It's possible 20# to gain a bit more by modulo-scheduling outer loop, then dedicated 21# squaring procedure should give further 20% and code can be adapted 22# for 32-bit application running on 64-bit CPU. As for the latter. 23# It won't be able to achieve "native" 64-bit performance, because in 24# 32-bit application context every addc instruction will have to be 25# expanded as addc, twice right shift by 32 and finally adde, etc. 26# So far RSA *sign* performance improvement over pre-bn_mul_mont asm 27# for 64-bit application running on PPC970/G5 is: 28# 29# 512-bit +65% 30# 1024-bit +35% 31# 2048-bit +18% 32# 4096-bit +4% 33 34# September 2016 35# 36# Add multiplication procedure operating on lengths divisible by 4 37# and squaring procedure operating on lengths divisible by 8. Length 38# is expressed in number of limbs. RSA private key operations are 39# ~35-50% faster (more for longer keys) on contemporary high-end POWER 40# processors in 64-bit builds, [mysteriously enough] more in 32-bit 41# builds. On low-end 32-bit processors performance improvement turned 42# to be marginal... 43 44# $output is the last argument if it looks like a file (it has an extension) 45# $flavour is the first argument if it doesn't look like a file 46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 48 49if ($flavour =~ /32/) { 50 $BITS= 32; 51 $BNSZ= $BITS/8; 52 $SIZE_T=4; 53 $RZONE= 224; 54 55 $LD= "lwz"; # load 56 $LDU= "lwzu"; # load and update 57 $LDX= "lwzx"; # load indexed 58 $ST= "stw"; # store 59 $STU= "stwu"; # store and update 60 $STX= "stwx"; # store indexed 61 $STUX= "stwux"; # store indexed and update 62 $UMULL= "mullw"; # unsigned multiply low 63 $UMULH= "mulhwu"; # unsigned multiply high 64 $UCMP= "cmplw"; # unsigned compare 65 $SHRI= "srwi"; # unsigned shift right by immediate 66 $SHLI= "slwi"; # unsigned shift left by immediate 67 $PUSH= $ST; 68 $POP= $LD; 69} elsif ($flavour =~ /64/) { 70 $BITS= 64; 71 $BNSZ= $BITS/8; 72 $SIZE_T=8; 73 $RZONE= 288; 74 75 # same as above, but 64-bit mnemonics... 76 $LD= "ld"; # load 77 $LDU= "ldu"; # load and update 78 $LDX= "ldx"; # load indexed 79 $ST= "std"; # store 80 $STU= "stdu"; # store and update 81 $STX= "stdx"; # store indexed 82 $STUX= "stdux"; # store indexed and update 83 $UMULL= "mulld"; # unsigned multiply low 84 $UMULH= "mulhdu"; # unsigned multiply high 85 $UCMP= "cmpld"; # unsigned compare 86 $SHRI= "srdi"; # unsigned shift right by immediate 87 $SHLI= "sldi"; # unsigned shift left by immediate 88 $PUSH= $ST; 89 $POP= $LD; 90} else { die "nonsense $flavour"; } 91 92$FRAME=8*$SIZE_T+$RZONE; 93$LOCALS=8*$SIZE_T; 94 95$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 96( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 97( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 98die "can't locate ppc-xlate.pl"; 99 100open STDOUT,"| $^X $xlate $flavour \"$output\"" 101 or die "can't call $xlate: $!"; 102 103$sp="r1"; 104$toc="r2"; 105$rp="r3"; 106$ap="r4"; 107$bp="r5"; 108$np="r6"; 109$n0="r7"; 110$num="r8"; 111 112{ 113my $ovf=$rp; 114my $rp="r9"; # $rp is reassigned 115my $aj="r10"; 116my $nj="r11"; 117my $tj="r12"; 118# non-volatile registers 119my $i="r20"; 120my $j="r21"; 121my $tp="r22"; 122my $m0="r23"; 123my $m1="r24"; 124my $lo0="r25"; 125my $hi0="r26"; 126my $lo1="r27"; 127my $hi1="r28"; 128my $alo="r29"; 129my $ahi="r30"; 130my $nlo="r31"; 131# 132my $nhi="r0"; 133 134$code=<<___; 135.machine "any" 136.text 137 138.globl .bn_mul_mont_int 139.align 5 140.bn_mul_mont_int: 141 mr $rp,r3 ; $rp is reassigned 142 li r3,0 143___ 144$code.=<<___ if ($BNSZ==4); 145 cmpwi $num,32 ; longer key performance is not better 146 bgelr 147___ 148$code.=<<___; 149 slwi $num,$num,`log($BNSZ)/log(2)` 150 li $tj,-4096 151 addi $ovf,$num,$FRAME 152 subf $ovf,$ovf,$sp ; $sp-$ovf 153 and $ovf,$ovf,$tj ; minimize TLB usage 154 subf $ovf,$sp,$ovf ; $ovf-$sp 155 mr $tj,$sp 156 srwi $num,$num,`log($BNSZ)/log(2)` 157 $STUX $sp,$sp,$ovf 158 159 $PUSH r20,`-12*$SIZE_T`($tj) 160 $PUSH r21,`-11*$SIZE_T`($tj) 161 $PUSH r22,`-10*$SIZE_T`($tj) 162 $PUSH r23,`-9*$SIZE_T`($tj) 163 $PUSH r24,`-8*$SIZE_T`($tj) 164 $PUSH r25,`-7*$SIZE_T`($tj) 165 $PUSH r26,`-6*$SIZE_T`($tj) 166 $PUSH r27,`-5*$SIZE_T`($tj) 167 $PUSH r28,`-4*$SIZE_T`($tj) 168 $PUSH r29,`-3*$SIZE_T`($tj) 169 $PUSH r30,`-2*$SIZE_T`($tj) 170 $PUSH r31,`-1*$SIZE_T`($tj) 171 172 $LD $n0,0($n0) ; pull n0[0] value 173 addi $num,$num,-2 ; adjust $num for counter register 174 175 $LD $m0,0($bp) ; m0=bp[0] 176 $LD $aj,0($ap) ; ap[0] 177 addi $tp,$sp,$LOCALS 178 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] 179 $UMULH $hi0,$aj,$m0 180 181 $LD $aj,$BNSZ($ap) ; ap[1] 182 $LD $nj,0($np) ; np[0] 183 184 $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0 185 186 $UMULL $alo,$aj,$m0 ; ap[1]*bp[0] 187 $UMULH $ahi,$aj,$m0 188 189 $UMULL $lo1,$nj,$m1 ; np[0]*m1 190 $UMULH $hi1,$nj,$m1 191 $LD $nj,$BNSZ($np) ; np[1] 192 addc $lo1,$lo1,$lo0 193 addze $hi1,$hi1 194 195 $UMULL $nlo,$nj,$m1 ; np[1]*m1 196 $UMULH $nhi,$nj,$m1 197 198 mtctr $num 199 li $j,`2*$BNSZ` 200.align 4 201L1st: 202 $LDX $aj,$ap,$j ; ap[j] 203 addc $lo0,$alo,$hi0 204 $LDX $nj,$np,$j ; np[j] 205 addze $hi0,$ahi 206 $UMULL $alo,$aj,$m0 ; ap[j]*bp[0] 207 addc $lo1,$nlo,$hi1 208 $UMULH $ahi,$aj,$m0 209 addze $hi1,$nhi 210 $UMULL $nlo,$nj,$m1 ; np[j]*m1 211 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] 212 $UMULH $nhi,$nj,$m1 213 addze $hi1,$hi1 214 $ST $lo1,0($tp) ; tp[j-1] 215 216 addi $j,$j,$BNSZ ; j++ 217 addi $tp,$tp,$BNSZ ; tp++ 218 bdnz L1st 219;L1st 220 addc $lo0,$alo,$hi0 221 addze $hi0,$ahi 222 223 addc $lo1,$nlo,$hi1 224 addze $hi1,$nhi 225 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0] 226 addze $hi1,$hi1 227 $ST $lo1,0($tp) ; tp[j-1] 228 229 li $ovf,0 230 addc $hi1,$hi1,$hi0 231 addze $ovf,$ovf ; upmost overflow bit 232 $ST $hi1,$BNSZ($tp) 233 234 li $i,$BNSZ 235.align 4 236Louter: 237 $LDX $m0,$bp,$i ; m0=bp[i] 238 $LD $aj,0($ap) ; ap[0] 239 addi $tp,$sp,$LOCALS 240 $LD $tj,$LOCALS($sp); tp[0] 241 $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] 242 $UMULH $hi0,$aj,$m0 243 $LD $aj,$BNSZ($ap) ; ap[1] 244 $LD $nj,0($np) ; np[0] 245 addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0] 246 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] 247 addze $hi0,$hi0 248 $UMULL $m1,$lo0,$n0 ; tp[0]*n0 249 $UMULH $ahi,$aj,$m0 250 $UMULL $lo1,$nj,$m1 ; np[0]*m1 251 $UMULH $hi1,$nj,$m1 252 $LD $nj,$BNSZ($np) ; np[1] 253 addc $lo1,$lo1,$lo0 254 $UMULL $nlo,$nj,$m1 ; np[1]*m1 255 addze $hi1,$hi1 256 $UMULH $nhi,$nj,$m1 257 258 mtctr $num 259 li $j,`2*$BNSZ` 260.align 4 261Linner: 262 $LDX $aj,$ap,$j ; ap[j] 263 addc $lo0,$alo,$hi0 264 $LD $tj,$BNSZ($tp) ; tp[j] 265 addze $hi0,$ahi 266 $LDX $nj,$np,$j ; np[j] 267 addc $lo1,$nlo,$hi1 268 $UMULL $alo,$aj,$m0 ; ap[j]*bp[i] 269 addze $hi1,$nhi 270 $UMULH $ahi,$aj,$m0 271 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] 272 $UMULL $nlo,$nj,$m1 ; np[j]*m1 273 addze $hi0,$hi0 274 $UMULH $nhi,$nj,$m1 275 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] 276 addi $j,$j,$BNSZ ; j++ 277 addze $hi1,$hi1 278 $ST $lo1,0($tp) ; tp[j-1] 279 addi $tp,$tp,$BNSZ ; tp++ 280 bdnz Linner 281;Linner 282 $LD $tj,$BNSZ($tp) ; tp[j] 283 addc $lo0,$alo,$hi0 284 addze $hi0,$ahi 285 addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j] 286 addze $hi0,$hi0 287 288 addc $lo1,$nlo,$hi1 289 addze $hi1,$nhi 290 addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j] 291 addze $hi1,$hi1 292 $ST $lo1,0($tp) ; tp[j-1] 293 294 addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA] 295 li $ovf,0 296 adde $hi1,$hi1,$hi0 297 addze $ovf,$ovf 298 $ST $hi1,$BNSZ($tp) 299; 300 slwi $tj,$num,`log($BNSZ)/log(2)` 301 $UCMP $i,$tj 302 addi $i,$i,$BNSZ 303 ble Louter 304 305 addi $num,$num,2 ; restore $num 306 subfc $j,$j,$j ; j=0 and "clear" XER[CA] 307 addi $tp,$sp,$LOCALS 308 mtctr $num 309 310.align 4 311Lsub: $LDX $tj,$tp,$j 312 $LDX $nj,$np,$j 313 subfe $aj,$nj,$tj ; tp[j]-np[j] 314 $STX $aj,$rp,$j 315 addi $j,$j,$BNSZ 316 bdnz Lsub 317 318 li $j,0 319 mtctr $num 320 subfe $ovf,$j,$ovf ; handle upmost overflow bit 321 322.align 4 323Lcopy: ; conditional copy 324 $LDX $tj,$tp,$j 325 $LDX $aj,$rp,$j 326 and $tj,$tj,$ovf 327 andc $aj,$aj,$ovf 328 $STX $j,$tp,$j ; zap at once 329 or $aj,$aj,$tj 330 $STX $aj,$rp,$j 331 addi $j,$j,$BNSZ 332 bdnz Lcopy 333 334 $POP $tj,0($sp) 335 li r3,1 336 $POP r20,`-12*$SIZE_T`($tj) 337 $POP r21,`-11*$SIZE_T`($tj) 338 $POP r22,`-10*$SIZE_T`($tj) 339 $POP r23,`-9*$SIZE_T`($tj) 340 $POP r24,`-8*$SIZE_T`($tj) 341 $POP r25,`-7*$SIZE_T`($tj) 342 $POP r26,`-6*$SIZE_T`($tj) 343 $POP r27,`-5*$SIZE_T`($tj) 344 $POP r28,`-4*$SIZE_T`($tj) 345 $POP r29,`-3*$SIZE_T`($tj) 346 $POP r30,`-2*$SIZE_T`($tj) 347 $POP r31,`-1*$SIZE_T`($tj) 348 mr $sp,$tj 349 blr 350 .long 0 351 .byte 0,12,4,0,0x80,12,6,0 352 .long 0 353.size .bn_mul_mont_int,.-.bn_mul_mont_int 354___ 355} 356if (1) { 357my ($a0,$a1,$a2,$a3, 358 $t0,$t1,$t2,$t3, 359 $m0,$m1,$m2,$m3, 360 $acc0,$acc1,$acc2,$acc3,$acc4, 361 $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31)); 362my ($carry,$zero) = ($rp,"r0"); 363 364# sp----------->+-------------------------------+ 365# | saved sp | 366# +-------------------------------+ 367# . . 368# +8*size_t +-------------------------------+ 369# | 4 "n0*t0" | 370# . . 371# . . 372# +12*size_t +-------------------------------+ 373# | size_t tmp[num] | 374# . . 375# . . 376# . . 377# +-------------------------------+ 378# | topmost carry | 379# . . 380# -18*size_t +-------------------------------+ 381# | 18 saved gpr, r14-r31 | 382# . . 383# . . 384# +-------------------------------+ 385$code.=<<___; 386.globl .bn_mul4x_mont_int 387.align 5 388.bn_mul4x_mont_int: 389 andi. r0,$num,7 390 bne .Lmul4x_do 391 $UCMP $ap,$bp 392 bne .Lmul4x_do 393 b .Lsqr8x_do 394.Lmul4x_do: 395 slwi $num,$num,`log($SIZE_T)/log(2)` 396 mr $a0,$sp 397 li $a1,-32*$SIZE_T 398 sub $a1,$a1,$num 399 $STUX $sp,$sp,$a1 # alloca 400 401 $PUSH r14,-$SIZE_T*18($a0) 402 $PUSH r15,-$SIZE_T*17($a0) 403 $PUSH r16,-$SIZE_T*16($a0) 404 $PUSH r17,-$SIZE_T*15($a0) 405 $PUSH r18,-$SIZE_T*14($a0) 406 $PUSH r19,-$SIZE_T*13($a0) 407 $PUSH r20,-$SIZE_T*12($a0) 408 $PUSH r21,-$SIZE_T*11($a0) 409 $PUSH r22,-$SIZE_T*10($a0) 410 $PUSH r23,-$SIZE_T*9($a0) 411 $PUSH r24,-$SIZE_T*8($a0) 412 $PUSH r25,-$SIZE_T*7($a0) 413 $PUSH r26,-$SIZE_T*6($a0) 414 $PUSH r27,-$SIZE_T*5($a0) 415 $PUSH r28,-$SIZE_T*4($a0) 416 $PUSH r29,-$SIZE_T*3($a0) 417 $PUSH r30,-$SIZE_T*2($a0) 418 $PUSH r31,-$SIZE_T*1($a0) 419 420 subi $ap,$ap,$SIZE_T # bias by -1 421 subi $np,$np,$SIZE_T # bias by -1 422 subi $rp,$rp,$SIZE_T # bias by -1 423 $LD $n0,0($n0) # *n0 424 425 add $t0,$bp,$num 426 add $ap_end,$ap,$num 427 subi $t0,$t0,$SIZE_T*4 # &b[num-4] 428 429 $LD $bi,$SIZE_T*0($bp) # b[0] 430 li $acc0,0 431 $LD $a0,$SIZE_T*1($ap) # a[0..3] 432 li $acc1,0 433 $LD $a1,$SIZE_T*2($ap) 434 li $acc2,0 435 $LD $a2,$SIZE_T*3($ap) 436 li $acc3,0 437 $LDU $a3,$SIZE_T*4($ap) 438 $LD $m0,$SIZE_T*1($np) # n[0..3] 439 $LD $m1,$SIZE_T*2($np) 440 $LD $m2,$SIZE_T*3($np) 441 $LDU $m3,$SIZE_T*4($np) 442 443 $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4] 444 $PUSH $t0,$SIZE_T*7($sp) 445 li $carry,0 446 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 447 li $cnt,0 448 li $zero,0 449 b .Loop_mul4x_1st_reduction 450 451.align 5 452.Loop_mul4x_1st_reduction: 453 $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0]) 454 addze $carry,$carry # modulo-scheduled 455 $UMULL $t1,$a1,$bi 456 addi $cnt,$cnt,$SIZE_T 457 $UMULL $t2,$a2,$bi 458 andi. $cnt,$cnt,$SIZE_T*4-1 459 $UMULL $t3,$a3,$bi 460 addc $acc0,$acc0,$t0 461 $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0]) 462 adde $acc1,$acc1,$t1 463 $UMULH $t1,$a1,$bi 464 adde $acc2,$acc2,$t2 465 $UMULL $mi,$acc0,$n0 # t[0]*n0 466 adde $acc3,$acc3,$t3 467 $UMULH $t2,$a2,$bi 468 addze $acc4,$zero 469 $UMULH $t3,$a3,$bi 470 $LDX $bi,$bp,$cnt # next b[i] (or b[0]) 471 addc $acc1,$acc1,$t0 472 # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0) 473 $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 474 adde $acc2,$acc2,$t1 475 $UMULL $t1,$m1,$mi 476 adde $acc3,$acc3,$t2 477 $UMULL $t2,$m2,$mi 478 adde $acc4,$acc4,$t3 # can't overflow 479 $UMULL $t3,$m3,$mi 480 # (*) addc $acc0,$acc0,$t0 481 # (*) As for removal of first multiplication and addition 482 # instructions. The outcome of first addition is 483 # guaranteed to be zero, which leaves two computationally 484 # significant outcomes: it either carries or not. Then 485 # question is when does it carry? Is there alternative 486 # way to deduce it? If you follow operations, you can 487 # observe that condition for carry is quite simple: 488 # $acc0 being non-zero. So that carry can be calculated 489 # by adding -1 to $acc0. That's what next instruction does. 490 addic $acc0,$acc0,-1 # (*), discarded 491 $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0) 492 adde $acc0,$acc1,$t1 493 $UMULH $t1,$m1,$mi 494 adde $acc1,$acc2,$t2 495 $UMULH $t2,$m2,$mi 496 adde $acc2,$acc3,$t3 497 $UMULH $t3,$m3,$mi 498 adde $acc3,$acc4,$carry 499 addze $carry,$zero 500 addc $acc0,$acc0,$t0 501 adde $acc1,$acc1,$t1 502 adde $acc2,$acc2,$t2 503 adde $acc3,$acc3,$t3 504 #addze $carry,$carry 505 bne .Loop_mul4x_1st_reduction 506 507 $UCMP $ap_end,$ap 508 beq .Lmul4x4_post_condition 509 510 $LD $a0,$SIZE_T*1($ap) # a[4..7] 511 $LD $a1,$SIZE_T*2($ap) 512 $LD $a2,$SIZE_T*3($ap) 513 $LDU $a3,$SIZE_T*4($ap) 514 $LD $mi,$SIZE_T*8($sp) # a[0]*n0 515 $LD $m0,$SIZE_T*1($np) # n[4..7] 516 $LD $m1,$SIZE_T*2($np) 517 $LD $m2,$SIZE_T*3($np) 518 $LDU $m3,$SIZE_T*4($np) 519 b .Loop_mul4x_1st_tail 520 521.align 5 522.Loop_mul4x_1st_tail: 523 $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i]) 524 addze $carry,$carry # modulo-scheduled 525 $UMULL $t1,$a1,$bi 526 addi $cnt,$cnt,$SIZE_T 527 $UMULL $t2,$a2,$bi 528 andi. $cnt,$cnt,$SIZE_T*4-1 529 $UMULL $t3,$a3,$bi 530 addc $acc0,$acc0,$t0 531 $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i]) 532 adde $acc1,$acc1,$t1 533 $UMULH $t1,$a1,$bi 534 adde $acc2,$acc2,$t2 535 $UMULH $t2,$a2,$bi 536 adde $acc3,$acc3,$t3 537 $UMULH $t3,$a3,$bi 538 addze $acc4,$zero 539 $LDX $bi,$bp,$cnt # next b[i] (or b[0]) 540 addc $acc1,$acc1,$t0 541 $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0) 542 adde $acc2,$acc2,$t1 543 $UMULL $t1,$m1,$mi 544 adde $acc3,$acc3,$t2 545 $UMULL $t2,$m2,$mi 546 adde $acc4,$acc4,$t3 # can't overflow 547 $UMULL $t3,$m3,$mi 548 addc $acc0,$acc0,$t0 549 $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0) 550 adde $acc1,$acc1,$t1 551 $UMULH $t1,$m1,$mi 552 adde $acc2,$acc2,$t2 553 $UMULH $t2,$m2,$mi 554 adde $acc3,$acc3,$t3 555 adde $acc4,$acc4,$carry 556 $UMULH $t3,$m3,$mi 557 addze $carry,$zero 558 addi $mi,$sp,$SIZE_T*8 559 $LDX $mi,$mi,$cnt # next t[0]*n0 560 $STU $acc0,$SIZE_T($tp) # word of result 561 addc $acc0,$acc1,$t0 562 adde $acc1,$acc2,$t1 563 adde $acc2,$acc3,$t2 564 adde $acc3,$acc4,$t3 565 #addze $carry,$carry 566 bne .Loop_mul4x_1st_tail 567 568 sub $t1,$ap_end,$num # rewinded $ap 569 $UCMP $ap_end,$ap # done yet? 570 beq .Lmul4x_proceed 571 572 $LD $a0,$SIZE_T*1($ap) 573 $LD $a1,$SIZE_T*2($ap) 574 $LD $a2,$SIZE_T*3($ap) 575 $LDU $a3,$SIZE_T*4($ap) 576 $LD $m0,$SIZE_T*1($np) 577 $LD $m1,$SIZE_T*2($np) 578 $LD $m2,$SIZE_T*3($np) 579 $LDU $m3,$SIZE_T*4($np) 580 b .Loop_mul4x_1st_tail 581 582.align 5 583.Lmul4x_proceed: 584 $LDU $bi,$SIZE_T*4($bp) # *++b 585 addze $carry,$carry # topmost carry 586 $LD $a0,$SIZE_T*1($t1) 587 $LD $a1,$SIZE_T*2($t1) 588 $LD $a2,$SIZE_T*3($t1) 589 $LD $a3,$SIZE_T*4($t1) 590 addi $ap,$t1,$SIZE_T*4 591 sub $np,$np,$num # rewind np 592 593 $ST $acc0,$SIZE_T*1($tp) # result 594 $ST $acc1,$SIZE_T*2($tp) 595 $ST $acc2,$SIZE_T*3($tp) 596 $ST $acc3,$SIZE_T*4($tp) 597 $ST $carry,$SIZE_T*5($tp) # save topmost carry 598 $LD $acc0,$SIZE_T*12($sp) # t[0..3] 599 $LD $acc1,$SIZE_T*13($sp) 600 $LD $acc2,$SIZE_T*14($sp) 601 $LD $acc3,$SIZE_T*15($sp) 602 603 $LD $m0,$SIZE_T*1($np) # n[0..3] 604 $LD $m1,$SIZE_T*2($np) 605 $LD $m2,$SIZE_T*3($np) 606 $LDU $m3,$SIZE_T*4($np) 607 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 608 li $carry,0 609 b .Loop_mul4x_reduction 610 611.align 5 612.Loop_mul4x_reduction: 613 $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4]) 614 addze $carry,$carry # modulo-scheduled 615 $UMULL $t1,$a1,$bi 616 addi $cnt,$cnt,$SIZE_T 617 $UMULL $t2,$a2,$bi 618 andi. $cnt,$cnt,$SIZE_T*4-1 619 $UMULL $t3,$a3,$bi 620 addc $acc0,$acc0,$t0 621 $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4]) 622 adde $acc1,$acc1,$t1 623 $UMULH $t1,$a1,$bi 624 adde $acc2,$acc2,$t2 625 $UMULL $mi,$acc0,$n0 # t[0]*n0 626 adde $acc3,$acc3,$t3 627 $UMULH $t2,$a2,$bi 628 addze $acc4,$zero 629 $UMULH $t3,$a3,$bi 630 $LDX $bi,$bp,$cnt # next b[i] 631 addc $acc1,$acc1,$t0 632 # (*) mul $t0,$m0,$mi 633 $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 634 adde $acc2,$acc2,$t1 635 $UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0 636 adde $acc3,$acc3,$t2 637 $UMULL $t2,$m2,$mi 638 adde $acc4,$acc4,$t3 # can't overflow 639 $UMULL $t3,$m3,$mi 640 # (*) addc $acc0,$acc0,$t0 641 addic $acc0,$acc0,-1 # (*), discarded 642 $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0 643 adde $acc0,$acc1,$t1 644 $UMULH $t1,$m1,$mi 645 adde $acc1,$acc2,$t2 646 $UMULH $t2,$m2,$mi 647 adde $acc2,$acc3,$t3 648 $UMULH $t3,$m3,$mi 649 adde $acc3,$acc4,$carry 650 addze $carry,$zero 651 addc $acc0,$acc0,$t0 652 adde $acc1,$acc1,$t1 653 adde $acc2,$acc2,$t2 654 adde $acc3,$acc3,$t3 655 #addze $carry,$carry 656 bne .Loop_mul4x_reduction 657 658 $LD $t0,$SIZE_T*5($tp) # t[4..7] 659 addze $carry,$carry 660 $LD $t1,$SIZE_T*6($tp) 661 $LD $t2,$SIZE_T*7($tp) 662 $LD $t3,$SIZE_T*8($tp) 663 $LD $a0,$SIZE_T*1($ap) # a[4..7] 664 $LD $a1,$SIZE_T*2($ap) 665 $LD $a2,$SIZE_T*3($ap) 666 $LDU $a3,$SIZE_T*4($ap) 667 addc $acc0,$acc0,$t0 668 adde $acc1,$acc1,$t1 669 adde $acc2,$acc2,$t2 670 adde $acc3,$acc3,$t3 671 #addze $carry,$carry 672 673 $LD $mi,$SIZE_T*8($sp) # t[0]*n0 674 $LD $m0,$SIZE_T*1($np) # n[4..7] 675 $LD $m1,$SIZE_T*2($np) 676 $LD $m2,$SIZE_T*3($np) 677 $LDU $m3,$SIZE_T*4($np) 678 b .Loop_mul4x_tail 679 680.align 5 681.Loop_mul4x_tail: 682 $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4]) 683 addze $carry,$carry # modulo-scheduled 684 $UMULL $t1,$a1,$bi 685 addi $cnt,$cnt,$SIZE_T 686 $UMULL $t2,$a2,$bi 687 andi. $cnt,$cnt,$SIZE_T*4-1 688 $UMULL $t3,$a3,$bi 689 addc $acc0,$acc0,$t0 690 $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4]) 691 adde $acc1,$acc1,$t1 692 $UMULH $t1,$a1,$bi 693 adde $acc2,$acc2,$t2 694 $UMULH $t2,$a2,$bi 695 adde $acc3,$acc3,$t3 696 $UMULH $t3,$a3,$bi 697 addze $acc4,$zero 698 $LDX $bi,$bp,$cnt # next b[i] 699 addc $acc1,$acc1,$t0 700 $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0) 701 adde $acc2,$acc2,$t1 702 $UMULL $t1,$m1,$mi 703 adde $acc3,$acc3,$t2 704 $UMULL $t2,$m2,$mi 705 adde $acc4,$acc4,$t3 # can't overflow 706 $UMULL $t3,$m3,$mi 707 addc $acc0,$acc0,$t0 708 $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0) 709 adde $acc1,$acc1,$t1 710 $UMULH $t1,$m1,$mi 711 adde $acc2,$acc2,$t2 712 $UMULH $t2,$m2,$mi 713 adde $acc3,$acc3,$t3 714 $UMULH $t3,$m3,$mi 715 adde $acc4,$acc4,$carry 716 addi $mi,$sp,$SIZE_T*8 717 $LDX $mi,$mi,$cnt # next a[0]*n0 718 addze $carry,$zero 719 $STU $acc0,$SIZE_T($tp) # word of result 720 addc $acc0,$acc1,$t0 721 adde $acc1,$acc2,$t1 722 adde $acc2,$acc3,$t2 723 adde $acc3,$acc4,$t3 724 #addze $carry,$carry 725 bne .Loop_mul4x_tail 726 727 $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry 728 sub $t1,$np,$num # rewinded np? 729 addze $carry,$carry 730 $UCMP $ap_end,$ap # done yet? 731 beq .Loop_mul4x_break 732 733 $LD $t1,$SIZE_T*6($tp) 734 $LD $t2,$SIZE_T*7($tp) 735 $LD $t3,$SIZE_T*8($tp) 736 $LD $a0,$SIZE_T*1($ap) 737 $LD $a1,$SIZE_T*2($ap) 738 $LD $a2,$SIZE_T*3($ap) 739 $LDU $a3,$SIZE_T*4($ap) 740 addc $acc0,$acc0,$t0 741 adde $acc1,$acc1,$t1 742 adde $acc2,$acc2,$t2 743 adde $acc3,$acc3,$t3 744 #addze $carry,$carry 745 746 $LD $m0,$SIZE_T*1($np) # n[4..7] 747 $LD $m1,$SIZE_T*2($np) 748 $LD $m2,$SIZE_T*3($np) 749 $LDU $m3,$SIZE_T*4($np) 750 b .Loop_mul4x_tail 751 752.align 5 753.Loop_mul4x_break: 754 $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4] 755 $POP $t3,$SIZE_T*7($sp) 756 addc $a0,$acc0,$t0 # accumulate topmost carry 757 $LD $acc0,$SIZE_T*12($sp) # t[0..3] 758 addze $a1,$acc1 759 $LD $acc1,$SIZE_T*13($sp) 760 addze $a2,$acc2 761 $LD $acc2,$SIZE_T*14($sp) 762 addze $a3,$acc3 763 $LD $acc3,$SIZE_T*15($sp) 764 addze $carry,$carry # topmost carry 765 $ST $a0,$SIZE_T*1($tp) # result 766 sub $ap,$ap_end,$num # rewind ap 767 $ST $a1,$SIZE_T*2($tp) 768 $ST $a2,$SIZE_T*3($tp) 769 $ST $a3,$SIZE_T*4($tp) 770 $ST $carry,$SIZE_T*5($tp) # store topmost carry 771 772 $LD $m0,$SIZE_T*1($t1) # n[0..3] 773 $LD $m1,$SIZE_T*2($t1) 774 $LD $m2,$SIZE_T*3($t1) 775 $LD $m3,$SIZE_T*4($t1) 776 addi $np,$t1,$SIZE_T*4 777 $UCMP $bp,$t3 # done yet? 778 beq .Lmul4x_post 779 780 $LDU $bi,$SIZE_T*4($bp) 781 $LD $a0,$SIZE_T*1($ap) # a[0..3] 782 $LD $a1,$SIZE_T*2($ap) 783 $LD $a2,$SIZE_T*3($ap) 784 $LDU $a3,$SIZE_T*4($ap) 785 li $carry,0 786 addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit 787 b .Loop_mul4x_reduction 788 789.align 5 790.Lmul4x_post: 791 # Final step. We see if result is larger than modulus, and 792 # if it is, subtract the modulus. But comparison implies 793 # subtraction. So we subtract modulus, see if it borrowed, 794 # and conditionally copy original value. 795 srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 796 mr $bp,$t2 # &rp[-1] 797 subi $cnt,$cnt,1 798 mr $ap_end,$t2 # &rp[-1] copy 799 subfc $t0,$m0,$acc0 800 addi $tp,$sp,$SIZE_T*15 801 subfe $t1,$m1,$acc1 802 803 mtctr $cnt 804.Lmul4x_sub: 805 $LD $m0,$SIZE_T*1($np) 806 $LD $acc0,$SIZE_T*1($tp) 807 subfe $t2,$m2,$acc2 808 $LD $m1,$SIZE_T*2($np) 809 $LD $acc1,$SIZE_T*2($tp) 810 subfe $t3,$m3,$acc3 811 $LD $m2,$SIZE_T*3($np) 812 $LD $acc2,$SIZE_T*3($tp) 813 $LDU $m3,$SIZE_T*4($np) 814 $LDU $acc3,$SIZE_T*4($tp) 815 $ST $t0,$SIZE_T*1($bp) 816 $ST $t1,$SIZE_T*2($bp) 817 subfe $t0,$m0,$acc0 818 $ST $t2,$SIZE_T*3($bp) 819 $STU $t3,$SIZE_T*4($bp) 820 subfe $t1,$m1,$acc1 821 bdnz .Lmul4x_sub 822 823 $LD $a0,$SIZE_T*1($ap_end) 824 $ST $t0,$SIZE_T*1($bp) 825 $LD $t0,$SIZE_T*12($sp) 826 subfe $t2,$m2,$acc2 827 $LD $a1,$SIZE_T*2($ap_end) 828 $ST $t1,$SIZE_T*2($bp) 829 $LD $t1,$SIZE_T*13($sp) 830 subfe $t3,$m3,$acc3 831 subfe $carry,$zero,$carry # did it borrow? 832 addi $tp,$sp,$SIZE_T*12 833 $LD $a2,$SIZE_T*3($ap_end) 834 $ST $t2,$SIZE_T*3($bp) 835 $LD $t2,$SIZE_T*14($sp) 836 $LD $a3,$SIZE_T*4($ap_end) 837 $ST $t3,$SIZE_T*4($bp) 838 $LD $t3,$SIZE_T*15($sp) 839 840 mtctr $cnt 841.Lmul4x_cond_copy: 842 and $t0,$t0,$carry 843 andc $a0,$a0,$carry 844 $ST $zero,$SIZE_T*0($tp) # wipe stack clean 845 and $t1,$t1,$carry 846 andc $a1,$a1,$carry 847 $ST $zero,$SIZE_T*1($tp) 848 and $t2,$t2,$carry 849 andc $a2,$a2,$carry 850 $ST $zero,$SIZE_T*2($tp) 851 and $t3,$t3,$carry 852 andc $a3,$a3,$carry 853 $ST $zero,$SIZE_T*3($tp) 854 or $acc0,$t0,$a0 855 $LD $a0,$SIZE_T*5($ap_end) 856 $LD $t0,$SIZE_T*4($tp) 857 or $acc1,$t1,$a1 858 $LD $a1,$SIZE_T*6($ap_end) 859 $LD $t1,$SIZE_T*5($tp) 860 or $acc2,$t2,$a2 861 $LD $a2,$SIZE_T*7($ap_end) 862 $LD $t2,$SIZE_T*6($tp) 863 or $acc3,$t3,$a3 864 $LD $a3,$SIZE_T*8($ap_end) 865 $LD $t3,$SIZE_T*7($tp) 866 addi $tp,$tp,$SIZE_T*4 867 $ST $acc0,$SIZE_T*1($ap_end) 868 $ST $acc1,$SIZE_T*2($ap_end) 869 $ST $acc2,$SIZE_T*3($ap_end) 870 $STU $acc3,$SIZE_T*4($ap_end) 871 bdnz .Lmul4x_cond_copy 872 873 $POP $bp,0($sp) # pull saved sp 874 and $t0,$t0,$carry 875 andc $a0,$a0,$carry 876 $ST $zero,$SIZE_T*0($tp) 877 and $t1,$t1,$carry 878 andc $a1,$a1,$carry 879 $ST $zero,$SIZE_T*1($tp) 880 and $t2,$t2,$carry 881 andc $a2,$a2,$carry 882 $ST $zero,$SIZE_T*2($tp) 883 and $t3,$t3,$carry 884 andc $a3,$a3,$carry 885 $ST $zero,$SIZE_T*3($tp) 886 or $acc0,$t0,$a0 887 or $acc1,$t1,$a1 888 $ST $zero,$SIZE_T*4($tp) 889 or $acc2,$t2,$a2 890 or $acc3,$t3,$a3 891 $ST $acc0,$SIZE_T*1($ap_end) 892 $ST $acc1,$SIZE_T*2($ap_end) 893 $ST $acc2,$SIZE_T*3($ap_end) 894 $ST $acc3,$SIZE_T*4($ap_end) 895 896 b .Lmul4x_done 897 898.align 4 899.Lmul4x4_post_condition: 900 $POP $ap,$SIZE_T*6($sp) # pull &rp[-1] 901 $POP $bp,0($sp) # pull saved sp 902 addze $carry,$carry # modulo-scheduled 903 # $acc0-3,$carry hold result, $m0-3 hold modulus 904 subfc $a0,$m0,$acc0 905 subfe $a1,$m1,$acc1 906 subfe $a2,$m2,$acc2 907 subfe $a3,$m3,$acc3 908 subfe $carry,$zero,$carry # did it borrow? 909 910 and $m0,$m0,$carry 911 and $m1,$m1,$carry 912 addc $a0,$a0,$m0 913 and $m2,$m2,$carry 914 adde $a1,$a1,$m1 915 and $m3,$m3,$carry 916 adde $a2,$a2,$m2 917 adde $a3,$a3,$m3 918 919 $ST $a0,$SIZE_T*1($ap) # write result 920 $ST $a1,$SIZE_T*2($ap) 921 $ST $a2,$SIZE_T*3($ap) 922 $ST $a3,$SIZE_T*4($ap) 923 924.Lmul4x_done: 925 $ST $zero,$SIZE_T*8($sp) # wipe stack clean 926 $ST $zero,$SIZE_T*9($sp) 927 $ST $zero,$SIZE_T*10($sp) 928 $ST $zero,$SIZE_T*11($sp) 929 li r3,1 # signal "done" 930 $POP r14,-$SIZE_T*18($bp) 931 $POP r15,-$SIZE_T*17($bp) 932 $POP r16,-$SIZE_T*16($bp) 933 $POP r17,-$SIZE_T*15($bp) 934 $POP r18,-$SIZE_T*14($bp) 935 $POP r19,-$SIZE_T*13($bp) 936 $POP r20,-$SIZE_T*12($bp) 937 $POP r21,-$SIZE_T*11($bp) 938 $POP r22,-$SIZE_T*10($bp) 939 $POP r23,-$SIZE_T*9($bp) 940 $POP r24,-$SIZE_T*8($bp) 941 $POP r25,-$SIZE_T*7($bp) 942 $POP r26,-$SIZE_T*6($bp) 943 $POP r27,-$SIZE_T*5($bp) 944 $POP r28,-$SIZE_T*4($bp) 945 $POP r29,-$SIZE_T*3($bp) 946 $POP r30,-$SIZE_T*2($bp) 947 $POP r31,-$SIZE_T*1($bp) 948 mr $sp,$bp 949 blr 950 .long 0 951 .byte 0,12,4,0x20,0x80,18,6,0 952 .long 0 953.size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int 954___ 955} 956 957if (1) { 958######################################################################## 959# Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module. 960 961my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17)); 962my ($t0,$t1,$t2,$t3)=map("r$_",(18..21)); 963my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29)); 964my ($cnt,$carry,$zero)=("r30","r31","r0"); 965my ($tp,$ap_end,$na0)=($bp,$np,$carry); 966 967# sp----------->+-------------------------------+ 968# | saved sp | 969# +-------------------------------+ 970# . . 971# +12*size_t +-------------------------------+ 972# | size_t tmp[2*num] | 973# . . 974# . . 975# . . 976# +-------------------------------+ 977# . . 978# -18*size_t +-------------------------------+ 979# | 18 saved gpr, r14-r31 | 980# . . 981# . . 982# +-------------------------------+ 983$code.=<<___; 984.align 5 985__bn_sqr8x_mont: 986.Lsqr8x_do: 987 mr $a0,$sp 988 slwi $a1,$num,`log($SIZE_T)/log(2)+1` 989 li $a2,-32*$SIZE_T 990 sub $a1,$a2,$a1 991 slwi $num,$num,`log($SIZE_T)/log(2)` 992 $STUX $sp,$sp,$a1 # alloca 993 994 $PUSH r14,-$SIZE_T*18($a0) 995 $PUSH r15,-$SIZE_T*17($a0) 996 $PUSH r16,-$SIZE_T*16($a0) 997 $PUSH r17,-$SIZE_T*15($a0) 998 $PUSH r18,-$SIZE_T*14($a0) 999 $PUSH r19,-$SIZE_T*13($a0) 1000 $PUSH r20,-$SIZE_T*12($a0) 1001 $PUSH r21,-$SIZE_T*11($a0) 1002 $PUSH r22,-$SIZE_T*10($a0) 1003 $PUSH r23,-$SIZE_T*9($a0) 1004 $PUSH r24,-$SIZE_T*8($a0) 1005 $PUSH r25,-$SIZE_T*7($a0) 1006 $PUSH r26,-$SIZE_T*6($a0) 1007 $PUSH r27,-$SIZE_T*5($a0) 1008 $PUSH r28,-$SIZE_T*4($a0) 1009 $PUSH r29,-$SIZE_T*3($a0) 1010 $PUSH r30,-$SIZE_T*2($a0) 1011 $PUSH r31,-$SIZE_T*1($a0) 1012 1013 subi $ap,$ap,$SIZE_T # bias by -1 1014 subi $t0,$np,$SIZE_T # bias by -1 1015 subi $rp,$rp,$SIZE_T # bias by -1 1016 $LD $n0,0($n0) # *n0 1017 li $zero,0 1018 1019 add $ap_end,$ap,$num 1020 $LD $a0,$SIZE_T*1($ap) 1021 #li $acc0,0 1022 $LD $a1,$SIZE_T*2($ap) 1023 li $acc1,0 1024 $LD $a2,$SIZE_T*3($ap) 1025 li $acc2,0 1026 $LD $a3,$SIZE_T*4($ap) 1027 li $acc3,0 1028 $LD $a4,$SIZE_T*5($ap) 1029 li $acc4,0 1030 $LD $a5,$SIZE_T*6($ap) 1031 li $acc5,0 1032 $LD $a6,$SIZE_T*7($ap) 1033 li $acc6,0 1034 $LDU $a7,$SIZE_T*8($ap) 1035 li $acc7,0 1036 1037 addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1038 subic. $cnt,$num,$SIZE_T*8 1039 b .Lsqr8x_zero_start 1040 1041.align 5 1042.Lsqr8x_zero: 1043 subic. $cnt,$cnt,$SIZE_T*8 1044 $ST $zero,$SIZE_T*1($tp) 1045 $ST $zero,$SIZE_T*2($tp) 1046 $ST $zero,$SIZE_T*3($tp) 1047 $ST $zero,$SIZE_T*4($tp) 1048 $ST $zero,$SIZE_T*5($tp) 1049 $ST $zero,$SIZE_T*6($tp) 1050 $ST $zero,$SIZE_T*7($tp) 1051 $ST $zero,$SIZE_T*8($tp) 1052.Lsqr8x_zero_start: 1053 $ST $zero,$SIZE_T*9($tp) 1054 $ST $zero,$SIZE_T*10($tp) 1055 $ST $zero,$SIZE_T*11($tp) 1056 $ST $zero,$SIZE_T*12($tp) 1057 $ST $zero,$SIZE_T*13($tp) 1058 $ST $zero,$SIZE_T*14($tp) 1059 $ST $zero,$SIZE_T*15($tp) 1060 $STU $zero,$SIZE_T*16($tp) 1061 bne .Lsqr8x_zero 1062 1063 $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1] 1064 $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1] 1065 $PUSH $n0,$SIZE_T*8($sp) # offload n0 1066 $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1] 1067 $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry 1068 addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1069 1070 # Multiply everything but a[i]*a[i] 1071.align 5 1072.Lsqr8x_outer_loop: 1073 # a[1]a[0] (i) 1074 # a[2]a[0] 1075 # a[3]a[0] 1076 # a[4]a[0] 1077 # a[5]a[0] 1078 # a[6]a[0] 1079 # a[7]a[0] 1080 # a[2]a[1] (ii) 1081 # a[3]a[1] 1082 # a[4]a[1] 1083 # a[5]a[1] 1084 # a[6]a[1] 1085 # a[7]a[1] 1086 # a[3]a[2] (iii) 1087 # a[4]a[2] 1088 # a[5]a[2] 1089 # a[6]a[2] 1090 # a[7]a[2] 1091 # a[4]a[3] (iv) 1092 # a[5]a[3] 1093 # a[6]a[3] 1094 # a[7]a[3] 1095 # a[5]a[4] (v) 1096 # a[6]a[4] 1097 # a[7]a[4] 1098 # a[6]a[5] (vi) 1099 # a[7]a[5] 1100 # a[7]a[6] (vii) 1101 1102 $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i) 1103 $UMULL $t1,$a2,$a0 1104 $UMULL $t2,$a3,$a0 1105 $UMULL $t3,$a4,$a0 1106 addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0]) 1107 $UMULL $t0,$a5,$a0 1108 adde $acc2,$acc2,$t1 1109 $UMULL $t1,$a6,$a0 1110 adde $acc3,$acc3,$t2 1111 $UMULL $t2,$a7,$a0 1112 adde $acc4,$acc4,$t3 1113 $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0]) 1114 adde $acc5,$acc5,$t0 1115 $UMULH $t0,$a2,$a0 1116 adde $acc6,$acc6,$t1 1117 $UMULH $t1,$a3,$a0 1118 adde $acc7,$acc7,$t2 1119 $UMULH $t2,$a4,$a0 1120 $ST $acc0,$SIZE_T*1($tp) # t[0] 1121 addze $acc0,$zero # t[8] 1122 $ST $acc1,$SIZE_T*2($tp) # t[1] 1123 addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0]) 1124 $UMULH $t3,$a5,$a0 1125 adde $acc3,$acc3,$t0 1126 $UMULH $t0,$a6,$a0 1127 adde $acc4,$acc4,$t1 1128 $UMULH $t1,$a7,$a0 1129 adde $acc5,$acc5,$t2 1130 $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii) 1131 adde $acc6,$acc6,$t3 1132 $UMULL $t3,$a3,$a1 1133 adde $acc7,$acc7,$t0 1134 $UMULL $t0,$a4,$a1 1135 adde $acc0,$acc0,$t1 1136 1137 $UMULL $t1,$a5,$a1 1138 addc $acc3,$acc3,$t2 1139 $UMULL $t2,$a6,$a1 1140 adde $acc4,$acc4,$t3 1141 $UMULL $t3,$a7,$a1 1142 adde $acc5,$acc5,$t0 1143 $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1]) 1144 adde $acc6,$acc6,$t1 1145 $UMULH $t1,$a3,$a1 1146 adde $acc7,$acc7,$t2 1147 $UMULH $t2,$a4,$a1 1148 adde $acc0,$acc0,$t3 1149 $UMULH $t3,$a5,$a1 1150 $ST $acc2,$SIZE_T*3($tp) # t[2] 1151 addze $acc1,$zero # t[9] 1152 $ST $acc3,$SIZE_T*4($tp) # t[3] 1153 addc $acc4,$acc4,$t0 1154 $UMULH $t0,$a6,$a1 1155 adde $acc5,$acc5,$t1 1156 $UMULH $t1,$a7,$a1 1157 adde $acc6,$acc6,$t2 1158 $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii) 1159 adde $acc7,$acc7,$t3 1160 $UMULL $t3,$a4,$a2 1161 adde $acc0,$acc0,$t0 1162 $UMULL $t0,$a5,$a2 1163 adde $acc1,$acc1,$t1 1164 1165 $UMULL $t1,$a6,$a2 1166 addc $acc5,$acc5,$t2 1167 $UMULL $t2,$a7,$a2 1168 adde $acc6,$acc6,$t3 1169 $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2]) 1170 adde $acc7,$acc7,$t0 1171 $UMULH $t0,$a4,$a2 1172 adde $acc0,$acc0,$t1 1173 $UMULH $t1,$a5,$a2 1174 adde $acc1,$acc1,$t2 1175 $UMULH $t2,$a6,$a2 1176 $ST $acc4,$SIZE_T*5($tp) # t[4] 1177 addze $acc2,$zero # t[10] 1178 $ST $acc5,$SIZE_T*6($tp) # t[5] 1179 addc $acc6,$acc6,$t3 1180 $UMULH $t3,$a7,$a2 1181 adde $acc7,$acc7,$t0 1182 $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv) 1183 adde $acc0,$acc0,$t1 1184 $UMULL $t1,$a5,$a3 1185 adde $acc1,$acc1,$t2 1186 $UMULL $t2,$a6,$a3 1187 adde $acc2,$acc2,$t3 1188 1189 $UMULL $t3,$a7,$a3 1190 addc $acc7,$acc7,$t0 1191 $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3]) 1192 adde $acc0,$acc0,$t1 1193 $UMULH $t1,$a5,$a3 1194 adde $acc1,$acc1,$t2 1195 $UMULH $t2,$a6,$a3 1196 adde $acc2,$acc2,$t3 1197 $UMULH $t3,$a7,$a3 1198 $ST $acc6,$SIZE_T*7($tp) # t[6] 1199 addze $acc3,$zero # t[11] 1200 $STU $acc7,$SIZE_T*8($tp) # t[7] 1201 addc $acc0,$acc0,$t0 1202 $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v) 1203 adde $acc1,$acc1,$t1 1204 $UMULL $t1,$a6,$a4 1205 adde $acc2,$acc2,$t2 1206 $UMULL $t2,$a7,$a4 1207 adde $acc3,$acc3,$t3 1208 1209 $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4]) 1210 addc $acc1,$acc1,$t0 1211 $UMULH $t0,$a6,$a4 1212 adde $acc2,$acc2,$t1 1213 $UMULH $t1,$a7,$a4 1214 adde $acc3,$acc3,$t2 1215 $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi) 1216 addze $acc4,$zero # t[12] 1217 addc $acc2,$acc2,$t3 1218 $UMULL $t3,$a7,$a5 1219 adde $acc3,$acc3,$t0 1220 $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5]) 1221 adde $acc4,$acc4,$t1 1222 1223 $UMULH $t1,$a7,$a5 1224 addc $acc3,$acc3,$t2 1225 $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii) 1226 adde $acc4,$acc4,$t3 1227 $UMULH $t3,$a7,$a6 # hi(a[7]*a[6]) 1228 addze $acc5,$zero # t[13] 1229 addc $acc4,$acc4,$t0 1230 $UCMP $ap_end,$ap # done yet? 1231 adde $acc5,$acc5,$t1 1232 1233 addc $acc5,$acc5,$t2 1234 sub $t0,$ap_end,$num # rewinded ap 1235 addze $acc6,$zero # t[14] 1236 add $acc6,$acc6,$t3 1237 1238 beq .Lsqr8x_outer_break 1239 1240 mr $n0,$a0 1241 $LD $a0,$SIZE_T*1($tp) 1242 $LD $a1,$SIZE_T*2($tp) 1243 $LD $a2,$SIZE_T*3($tp) 1244 $LD $a3,$SIZE_T*4($tp) 1245 $LD $a4,$SIZE_T*5($tp) 1246 $LD $a5,$SIZE_T*6($tp) 1247 $LD $a6,$SIZE_T*7($tp) 1248 $LD $a7,$SIZE_T*8($tp) 1249 addc $acc0,$acc0,$a0 1250 $LD $a0,$SIZE_T*1($ap) 1251 adde $acc1,$acc1,$a1 1252 $LD $a1,$SIZE_T*2($ap) 1253 adde $acc2,$acc2,$a2 1254 $LD $a2,$SIZE_T*3($ap) 1255 adde $acc3,$acc3,$a3 1256 $LD $a3,$SIZE_T*4($ap) 1257 adde $acc4,$acc4,$a4 1258 $LD $a4,$SIZE_T*5($ap) 1259 adde $acc5,$acc5,$a5 1260 $LD $a5,$SIZE_T*6($ap) 1261 adde $acc6,$acc6,$a6 1262 $LD $a6,$SIZE_T*7($ap) 1263 subi $rp,$ap,$SIZE_T*7 1264 addze $acc7,$a7 1265 $LDU $a7,$SIZE_T*8($ap) 1266 #addze $carry,$zero # moved below 1267 li $cnt,0 1268 b .Lsqr8x_mul 1269 1270 # a[8]a[0] 1271 # a[9]a[0] 1272 # a[a]a[0] 1273 # a[b]a[0] 1274 # a[c]a[0] 1275 # a[d]a[0] 1276 # a[e]a[0] 1277 # a[f]a[0] 1278 # a[8]a[1] 1279 # a[f]a[1]........................ 1280 # a[8]a[2] 1281 # a[f]a[2]........................ 1282 # a[8]a[3] 1283 # a[f]a[3]........................ 1284 # a[8]a[4] 1285 # a[f]a[4]........................ 1286 # a[8]a[5] 1287 # a[f]a[5]........................ 1288 # a[8]a[6] 1289 # a[f]a[6]........................ 1290 # a[8]a[7] 1291 # a[f]a[7]........................ 1292.align 5 1293.Lsqr8x_mul: 1294 $UMULL $t0,$a0,$n0 1295 addze $carry,$zero # carry bit, modulo-scheduled 1296 $UMULL $t1,$a1,$n0 1297 addi $cnt,$cnt,$SIZE_T 1298 $UMULL $t2,$a2,$n0 1299 andi. $cnt,$cnt,$SIZE_T*8-1 1300 $UMULL $t3,$a3,$n0 1301 addc $acc0,$acc0,$t0 1302 $UMULL $t0,$a4,$n0 1303 adde $acc1,$acc1,$t1 1304 $UMULL $t1,$a5,$n0 1305 adde $acc2,$acc2,$t2 1306 $UMULL $t2,$a6,$n0 1307 adde $acc3,$acc3,$t3 1308 $UMULL $t3,$a7,$n0 1309 adde $acc4,$acc4,$t0 1310 $UMULH $t0,$a0,$n0 1311 adde $acc5,$acc5,$t1 1312 $UMULH $t1,$a1,$n0 1313 adde $acc6,$acc6,$t2 1314 $UMULH $t2,$a2,$n0 1315 adde $acc7,$acc7,$t3 1316 $UMULH $t3,$a3,$n0 1317 addze $carry,$carry 1318 $STU $acc0,$SIZE_T($tp) 1319 addc $acc0,$acc1,$t0 1320 $UMULH $t0,$a4,$n0 1321 adde $acc1,$acc2,$t1 1322 $UMULH $t1,$a5,$n0 1323 adde $acc2,$acc3,$t2 1324 $UMULH $t2,$a6,$n0 1325 adde $acc3,$acc4,$t3 1326 $UMULH $t3,$a7,$n0 1327 $LDX $n0,$rp,$cnt 1328 adde $acc4,$acc5,$t0 1329 adde $acc5,$acc6,$t1 1330 adde $acc6,$acc7,$t2 1331 adde $acc7,$carry,$t3 1332 #addze $carry,$zero # moved above 1333 bne .Lsqr8x_mul 1334 # note that carry flag is guaranteed 1335 # to be zero at this point 1336 $UCMP $ap,$ap_end # done yet? 1337 beq .Lsqr8x_break 1338 1339 $LD $a0,$SIZE_T*1($tp) 1340 $LD $a1,$SIZE_T*2($tp) 1341 $LD $a2,$SIZE_T*3($tp) 1342 $LD $a3,$SIZE_T*4($tp) 1343 $LD $a4,$SIZE_T*5($tp) 1344 $LD $a5,$SIZE_T*6($tp) 1345 $LD $a6,$SIZE_T*7($tp) 1346 $LD $a7,$SIZE_T*8($tp) 1347 addc $acc0,$acc0,$a0 1348 $LD $a0,$SIZE_T*1($ap) 1349 adde $acc1,$acc1,$a1 1350 $LD $a1,$SIZE_T*2($ap) 1351 adde $acc2,$acc2,$a2 1352 $LD $a2,$SIZE_T*3($ap) 1353 adde $acc3,$acc3,$a3 1354 $LD $a3,$SIZE_T*4($ap) 1355 adde $acc4,$acc4,$a4 1356 $LD $a4,$SIZE_T*5($ap) 1357 adde $acc5,$acc5,$a5 1358 $LD $a5,$SIZE_T*6($ap) 1359 adde $acc6,$acc6,$a6 1360 $LD $a6,$SIZE_T*7($ap) 1361 adde $acc7,$acc7,$a7 1362 $LDU $a7,$SIZE_T*8($ap) 1363 #addze $carry,$zero # moved above 1364 b .Lsqr8x_mul 1365 1366.align 5 1367.Lsqr8x_break: 1368 $LD $a0,$SIZE_T*8($rp) 1369 addi $ap,$rp,$SIZE_T*15 1370 $LD $a1,$SIZE_T*9($rp) 1371 sub. $t0,$ap_end,$ap # is it last iteration? 1372 $LD $a2,$SIZE_T*10($rp) 1373 sub $t1,$tp,$t0 1374 $LD $a3,$SIZE_T*11($rp) 1375 $LD $a4,$SIZE_T*12($rp) 1376 $LD $a5,$SIZE_T*13($rp) 1377 $LD $a6,$SIZE_T*14($rp) 1378 $LD $a7,$SIZE_T*15($rp) 1379 beq .Lsqr8x_outer_loop 1380 1381 $ST $acc0,$SIZE_T*1($tp) 1382 $LD $acc0,$SIZE_T*1($t1) 1383 $ST $acc1,$SIZE_T*2($tp) 1384 $LD $acc1,$SIZE_T*2($t1) 1385 $ST $acc2,$SIZE_T*3($tp) 1386 $LD $acc2,$SIZE_T*3($t1) 1387 $ST $acc3,$SIZE_T*4($tp) 1388 $LD $acc3,$SIZE_T*4($t1) 1389 $ST $acc4,$SIZE_T*5($tp) 1390 $LD $acc4,$SIZE_T*5($t1) 1391 $ST $acc5,$SIZE_T*6($tp) 1392 $LD $acc5,$SIZE_T*6($t1) 1393 $ST $acc6,$SIZE_T*7($tp) 1394 $LD $acc6,$SIZE_T*7($t1) 1395 $ST $acc7,$SIZE_T*8($tp) 1396 $LD $acc7,$SIZE_T*8($t1) 1397 mr $tp,$t1 1398 b .Lsqr8x_outer_loop 1399 1400.align 5 1401.Lsqr8x_outer_break: 1402 #################################################################### 1403 # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 1404 $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1] 1405 $LD $a3,$SIZE_T*2($t0) 1406 $LD $a5,$SIZE_T*3($t0) 1407 $LD $a7,$SIZE_T*4($t0) 1408 addi $ap,$t0,$SIZE_T*4 1409 # "tp[x]" comments are for num==8 case 1410 $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting 1411 $LD $t2,$SIZE_T*14($sp) 1412 $LD $t3,$SIZE_T*15($sp) 1413 $LD $t0,$SIZE_T*16($sp) 1414 1415 $ST $acc0,$SIZE_T*1($tp) # tp[8]= 1416 srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 1417 $ST $acc1,$SIZE_T*2($tp) 1418 subi $cnt,$cnt,1 1419 $ST $acc2,$SIZE_T*3($tp) 1420 $ST $acc3,$SIZE_T*4($tp) 1421 $ST $acc4,$SIZE_T*5($tp) 1422 $ST $acc5,$SIZE_T*6($tp) 1423 $ST $acc6,$SIZE_T*7($tp) 1424 #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting 1425 addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1426 $UMULL $acc0,$a1,$a1 1427 $UMULH $a1,$a1,$a1 1428 add $acc1,$t1,$t1 # <<1 1429 $SHRI $t1,$t1,$BITS-1 1430 $UMULL $a2,$a3,$a3 1431 $UMULH $a3,$a3,$a3 1432 addc $acc1,$acc1,$a1 1433 add $acc2,$t2,$t2 1434 $SHRI $t2,$t2,$BITS-1 1435 add $acc3,$t3,$t3 1436 $SHRI $t3,$t3,$BITS-1 1437 or $acc2,$acc2,$t1 1438 1439 mtctr $cnt 1440.Lsqr4x_shift_n_add: 1441 $UMULL $a4,$a5,$a5 1442 $UMULH $a5,$a5,$a5 1443 $LD $t1,$SIZE_T*6($tp) # =tp[5] 1444 $LD $a1,$SIZE_T*1($ap) 1445 adde $acc2,$acc2,$a2 1446 add $acc4,$t0,$t0 1447 $SHRI $t0,$t0,$BITS-1 1448 or $acc3,$acc3,$t2 1449 $LD $t2,$SIZE_T*7($tp) # =tp[6] 1450 adde $acc3,$acc3,$a3 1451 $LD $a3,$SIZE_T*2($ap) 1452 add $acc5,$t1,$t1 1453 $SHRI $t1,$t1,$BITS-1 1454 or $acc4,$acc4,$t3 1455 $LD $t3,$SIZE_T*8($tp) # =tp[7] 1456 $UMULL $a6,$a7,$a7 1457 $UMULH $a7,$a7,$a7 1458 adde $acc4,$acc4,$a4 1459 add $acc6,$t2,$t2 1460 $SHRI $t2,$t2,$BITS-1 1461 or $acc5,$acc5,$t0 1462 $LD $t0,$SIZE_T*9($tp) # =tp[8] 1463 adde $acc5,$acc5,$a5 1464 $LD $a5,$SIZE_T*3($ap) 1465 add $acc7,$t3,$t3 1466 $SHRI $t3,$t3,$BITS-1 1467 or $acc6,$acc6,$t1 1468 $LD $t1,$SIZE_T*10($tp) # =tp[9] 1469 $UMULL $a0,$a1,$a1 1470 $UMULH $a1,$a1,$a1 1471 adde $acc6,$acc6,$a6 1472 $ST $acc0,$SIZE_T*1($tp) # tp[0]= 1473 add $acc0,$t0,$t0 1474 $SHRI $t0,$t0,$BITS-1 1475 or $acc7,$acc7,$t2 1476 $LD $t2,$SIZE_T*11($tp) # =tp[10] 1477 adde $acc7,$acc7,$a7 1478 $LDU $a7,$SIZE_T*4($ap) 1479 $ST $acc1,$SIZE_T*2($tp) # tp[1]= 1480 add $acc1,$t1,$t1 1481 $SHRI $t1,$t1,$BITS-1 1482 or $acc0,$acc0,$t3 1483 $LD $t3,$SIZE_T*12($tp) # =tp[11] 1484 $UMULL $a2,$a3,$a3 1485 $UMULH $a3,$a3,$a3 1486 adde $acc0,$acc0,$a0 1487 $ST $acc2,$SIZE_T*3($tp) # tp[2]= 1488 add $acc2,$t2,$t2 1489 $SHRI $t2,$t2,$BITS-1 1490 or $acc1,$acc1,$t0 1491 $LD $t0,$SIZE_T*13($tp) # =tp[12] 1492 adde $acc1,$acc1,$a1 1493 $ST $acc3,$SIZE_T*4($tp) # tp[3]= 1494 $ST $acc4,$SIZE_T*5($tp) # tp[4]= 1495 $ST $acc5,$SIZE_T*6($tp) # tp[5]= 1496 $ST $acc6,$SIZE_T*7($tp) # tp[6]= 1497 $STU $acc7,$SIZE_T*8($tp) # tp[7]= 1498 add $acc3,$t3,$t3 1499 $SHRI $t3,$t3,$BITS-1 1500 or $acc2,$acc2,$t1 1501 bdnz .Lsqr4x_shift_n_add 1502___ 1503my ($np,$np_end)=($ap,$ap_end); 1504$code.=<<___; 1505 $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0 1506 $POP $n0,$SIZE_T*8($sp) 1507 1508 $UMULL $a4,$a5,$a5 1509 $UMULH $a5,$a5,$a5 1510 $ST $acc0,$SIZE_T*1($tp) # tp[8]= 1511 $LD $acc0,$SIZE_T*12($sp) # =tp[0] 1512 $LD $t1,$SIZE_T*6($tp) # =tp[13] 1513 adde $acc2,$acc2,$a2 1514 add $acc4,$t0,$t0 1515 $SHRI $t0,$t0,$BITS-1 1516 or $acc3,$acc3,$t2 1517 $LD $t2,$SIZE_T*7($tp) # =tp[14] 1518 adde $acc3,$acc3,$a3 1519 add $acc5,$t1,$t1 1520 $SHRI $t1,$t1,$BITS-1 1521 or $acc4,$acc4,$t3 1522 $UMULL $a6,$a7,$a7 1523 $UMULH $a7,$a7,$a7 1524 adde $acc4,$acc4,$a4 1525 add $acc6,$t2,$t2 1526 $SHRI $t2,$t2,$BITS-1 1527 or $acc5,$acc5,$t0 1528 $ST $acc1,$SIZE_T*2($tp) # tp[9]= 1529 $LD $acc1,$SIZE_T*13($sp) # =tp[1] 1530 adde $acc5,$acc5,$a5 1531 or $acc6,$acc6,$t1 1532 $LD $a0,$SIZE_T*1($np) 1533 $LD $a1,$SIZE_T*2($np) 1534 adde $acc6,$acc6,$a6 1535 $LD $a2,$SIZE_T*3($np) 1536 $LD $a3,$SIZE_T*4($np) 1537 adde $acc7,$a7,$t2 1538 $LD $a4,$SIZE_T*5($np) 1539 $LD $a5,$SIZE_T*6($np) 1540 1541 ################################################################ 1542 # Reduce by 8 limbs per iteration 1543 $UMULL $na0,$n0,$acc0 # t[0]*n0 1544 li $cnt,8 1545 $LD $a6,$SIZE_T*7($np) 1546 add $np_end,$np,$num 1547 $LDU $a7,$SIZE_T*8($np) 1548 $ST $acc2,$SIZE_T*3($tp) # tp[10]= 1549 $LD $acc2,$SIZE_T*14($sp) 1550 $ST $acc3,$SIZE_T*4($tp) # tp[11]= 1551 $LD $acc3,$SIZE_T*15($sp) 1552 $ST $acc4,$SIZE_T*5($tp) # tp[12]= 1553 $LD $acc4,$SIZE_T*16($sp) 1554 $ST $acc5,$SIZE_T*6($tp) # tp[13]= 1555 $LD $acc5,$SIZE_T*17($sp) 1556 $ST $acc6,$SIZE_T*7($tp) # tp[14]= 1557 $LD $acc6,$SIZE_T*18($sp) 1558 $ST $acc7,$SIZE_T*8($tp) # tp[15]= 1559 $LD $acc7,$SIZE_T*19($sp) 1560 addi $tp,$sp,$SIZE_T*11 # &tp[-1] 1561 mtctr $cnt 1562 b .Lsqr8x_reduction 1563 1564.align 5 1565.Lsqr8x_reduction: 1566 # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0) 1567 $UMULL $t1,$a1,$na0 1568 $UMULL $t2,$a2,$na0 1569 $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing 1570 $UMULL $t3,$a3,$na0 1571 # (*) addc $acc0,$acc0,$t0 1572 addic $acc0,$acc0,-1 # (*) 1573 $UMULL $t0,$a4,$na0 1574 adde $acc0,$acc1,$t1 1575 $UMULL $t1,$a5,$na0 1576 adde $acc1,$acc2,$t2 1577 $UMULL $t2,$a6,$na0 1578 adde $acc2,$acc3,$t3 1579 $UMULL $t3,$a7,$na0 1580 adde $acc3,$acc4,$t0 1581 $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0) 1582 adde $acc4,$acc5,$t1 1583 $UMULH $t1,$a1,$na0 1584 adde $acc5,$acc6,$t2 1585 $UMULH $t2,$a2,$na0 1586 adde $acc6,$acc7,$t3 1587 $UMULH $t3,$a3,$na0 1588 addze $acc7,$zero 1589 addc $acc0,$acc0,$t0 1590 $UMULH $t0,$a4,$na0 1591 adde $acc1,$acc1,$t1 1592 $UMULH $t1,$a5,$na0 1593 adde $acc2,$acc2,$t2 1594 $UMULH $t2,$a6,$na0 1595 adde $acc3,$acc3,$t3 1596 $UMULH $t3,$a7,$na0 1597 $UMULL $na0,$n0,$acc0 # next t[0]*n0 1598 adde $acc4,$acc4,$t0 1599 adde $acc5,$acc5,$t1 1600 adde $acc6,$acc6,$t2 1601 adde $acc7,$acc7,$t3 1602 bdnz .Lsqr8x_reduction 1603 1604 $LD $t0,$SIZE_T*1($tp) 1605 $LD $t1,$SIZE_T*2($tp) 1606 $LD $t2,$SIZE_T*3($tp) 1607 $LD $t3,$SIZE_T*4($tp) 1608 subi $rp,$tp,$SIZE_T*7 1609 $UCMP $np_end,$np # done yet? 1610 addc $acc0,$acc0,$t0 1611 $LD $t0,$SIZE_T*5($tp) 1612 adde $acc1,$acc1,$t1 1613 $LD $t1,$SIZE_T*6($tp) 1614 adde $acc2,$acc2,$t2 1615 $LD $t2,$SIZE_T*7($tp) 1616 adde $acc3,$acc3,$t3 1617 $LD $t3,$SIZE_T*8($tp) 1618 adde $acc4,$acc4,$t0 1619 adde $acc5,$acc5,$t1 1620 adde $acc6,$acc6,$t2 1621 adde $acc7,$acc7,$t3 1622 #addze $carry,$zero # moved below 1623 beq .Lsqr8x8_post_condition 1624 1625 $LD $n0,$SIZE_T*0($rp) 1626 $LD $a0,$SIZE_T*1($np) 1627 $LD $a1,$SIZE_T*2($np) 1628 $LD $a2,$SIZE_T*3($np) 1629 $LD $a3,$SIZE_T*4($np) 1630 $LD $a4,$SIZE_T*5($np) 1631 $LD $a5,$SIZE_T*6($np) 1632 $LD $a6,$SIZE_T*7($np) 1633 $LDU $a7,$SIZE_T*8($np) 1634 li $cnt,0 1635 1636.align 5 1637.Lsqr8x_tail: 1638 $UMULL $t0,$a0,$n0 1639 addze $carry,$zero # carry bit, modulo-scheduled 1640 $UMULL $t1,$a1,$n0 1641 addi $cnt,$cnt,$SIZE_T 1642 $UMULL $t2,$a2,$n0 1643 andi. $cnt,$cnt,$SIZE_T*8-1 1644 $UMULL $t3,$a3,$n0 1645 addc $acc0,$acc0,$t0 1646 $UMULL $t0,$a4,$n0 1647 adde $acc1,$acc1,$t1 1648 $UMULL $t1,$a5,$n0 1649 adde $acc2,$acc2,$t2 1650 $UMULL $t2,$a6,$n0 1651 adde $acc3,$acc3,$t3 1652 $UMULL $t3,$a7,$n0 1653 adde $acc4,$acc4,$t0 1654 $UMULH $t0,$a0,$n0 1655 adde $acc5,$acc5,$t1 1656 $UMULH $t1,$a1,$n0 1657 adde $acc6,$acc6,$t2 1658 $UMULH $t2,$a2,$n0 1659 adde $acc7,$acc7,$t3 1660 $UMULH $t3,$a3,$n0 1661 addze $carry,$carry 1662 $STU $acc0,$SIZE_T($tp) 1663 addc $acc0,$acc1,$t0 1664 $UMULH $t0,$a4,$n0 1665 adde $acc1,$acc2,$t1 1666 $UMULH $t1,$a5,$n0 1667 adde $acc2,$acc3,$t2 1668 $UMULH $t2,$a6,$n0 1669 adde $acc3,$acc4,$t3 1670 $UMULH $t3,$a7,$n0 1671 $LDX $n0,$rp,$cnt 1672 adde $acc4,$acc5,$t0 1673 adde $acc5,$acc6,$t1 1674 adde $acc6,$acc7,$t2 1675 adde $acc7,$carry,$t3 1676 #addze $carry,$zero # moved above 1677 bne .Lsqr8x_tail 1678 # note that carry flag is guaranteed 1679 # to be zero at this point 1680 $LD $a0,$SIZE_T*1($tp) 1681 $POP $carry,$SIZE_T*10($sp) # pull top-most carry in case we break 1682 $UCMP $np_end,$np # done yet? 1683 $LD $a1,$SIZE_T*2($tp) 1684 sub $t2,$np_end,$num # rewinded np 1685 $LD $a2,$SIZE_T*3($tp) 1686 $LD $a3,$SIZE_T*4($tp) 1687 $LD $a4,$SIZE_T*5($tp) 1688 $LD $a5,$SIZE_T*6($tp) 1689 $LD $a6,$SIZE_T*7($tp) 1690 $LD $a7,$SIZE_T*8($tp) 1691 beq .Lsqr8x_tail_break 1692 1693 addc $acc0,$acc0,$a0 1694 $LD $a0,$SIZE_T*1($np) 1695 adde $acc1,$acc1,$a1 1696 $LD $a1,$SIZE_T*2($np) 1697 adde $acc2,$acc2,$a2 1698 $LD $a2,$SIZE_T*3($np) 1699 adde $acc3,$acc3,$a3 1700 $LD $a3,$SIZE_T*4($np) 1701 adde $acc4,$acc4,$a4 1702 $LD $a4,$SIZE_T*5($np) 1703 adde $acc5,$acc5,$a5 1704 $LD $a5,$SIZE_T*6($np) 1705 adde $acc6,$acc6,$a6 1706 $LD $a6,$SIZE_T*7($np) 1707 adde $acc7,$acc7,$a7 1708 $LDU $a7,$SIZE_T*8($np) 1709 #addze $carry,$zero # moved above 1710 b .Lsqr8x_tail 1711 1712.align 5 1713.Lsqr8x_tail_break: 1714 $POP $n0,$SIZE_T*8($sp) # pull n0 1715 $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1] 1716 addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window 1717 1718 addic $carry,$carry,-1 # "move" top-most carry to carry bit 1719 adde $t0,$acc0,$a0 1720 $LD $acc0,$SIZE_T*8($rp) 1721 $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1] 1722 adde $t1,$acc1,$a1 1723 $LD $acc1,$SIZE_T*9($rp) 1724 $LD $a1,$SIZE_T*2($t2) 1725 adde $acc2,$acc2,$a2 1726 $LD $a2,$SIZE_T*3($t2) 1727 adde $acc3,$acc3,$a3 1728 $LD $a3,$SIZE_T*4($t2) 1729 adde $acc4,$acc4,$a4 1730 $LD $a4,$SIZE_T*5($t2) 1731 adde $acc5,$acc5,$a5 1732 $LD $a5,$SIZE_T*6($t2) 1733 adde $acc6,$acc6,$a6 1734 $LD $a6,$SIZE_T*7($t2) 1735 adde $acc7,$acc7,$a7 1736 $LD $a7,$SIZE_T*8($t2) 1737 addi $np,$t2,$SIZE_T*8 1738 addze $t2,$zero # top-most carry 1739 $UMULL $na0,$n0,$acc0 1740 $ST $t0,$SIZE_T*1($tp) 1741 $UCMP $cnt,$t3 # did we hit the bottom? 1742 $ST $t1,$SIZE_T*2($tp) 1743 li $cnt,8 1744 $ST $acc2,$SIZE_T*3($tp) 1745 $LD $acc2,$SIZE_T*10($rp) 1746 $ST $acc3,$SIZE_T*4($tp) 1747 $LD $acc3,$SIZE_T*11($rp) 1748 $ST $acc4,$SIZE_T*5($tp) 1749 $LD $acc4,$SIZE_T*12($rp) 1750 $ST $acc5,$SIZE_T*6($tp) 1751 $LD $acc5,$SIZE_T*13($rp) 1752 $ST $acc6,$SIZE_T*7($tp) 1753 $LD $acc6,$SIZE_T*14($rp) 1754 $ST $acc7,$SIZE_T*8($tp) 1755 $LD $acc7,$SIZE_T*15($rp) 1756 $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry 1757 addi $tp,$rp,$SIZE_T*7 # slide the window 1758 mtctr $cnt 1759 bne .Lsqr8x_reduction 1760 1761 ################################################################ 1762 # Final step. We see if result is larger than modulus, and 1763 # if it is, subtract the modulus. But comparison implies 1764 # subtraction. So we subtract modulus, see if it borrowed, 1765 # and conditionally copy original value. 1766 $POP $rp,$SIZE_T*6($sp) # pull &rp[-1] 1767 srwi $cnt,$num,`log($SIZE_T)/log(2)+3` 1768 mr $n0,$tp # put tp aside 1769 addi $tp,$tp,$SIZE_T*8 1770 subi $cnt,$cnt,1 1771 subfc $t0,$a0,$acc0 1772 subfe $t1,$a1,$acc1 1773 mr $carry,$t2 1774 mr $ap_end,$rp # $rp copy 1775 1776 mtctr $cnt 1777 b .Lsqr8x_sub 1778 1779.align 5 1780.Lsqr8x_sub: 1781 $LD $a0,$SIZE_T*1($np) 1782 $LD $acc0,$SIZE_T*1($tp) 1783 $LD $a1,$SIZE_T*2($np) 1784 $LD $acc1,$SIZE_T*2($tp) 1785 subfe $t2,$a2,$acc2 1786 $LD $a2,$SIZE_T*3($np) 1787 $LD $acc2,$SIZE_T*3($tp) 1788 subfe $t3,$a3,$acc3 1789 $LD $a3,$SIZE_T*4($np) 1790 $LD $acc3,$SIZE_T*4($tp) 1791 $ST $t0,$SIZE_T*1($rp) 1792 subfe $t0,$a4,$acc4 1793 $LD $a4,$SIZE_T*5($np) 1794 $LD $acc4,$SIZE_T*5($tp) 1795 $ST $t1,$SIZE_T*2($rp) 1796 subfe $t1,$a5,$acc5 1797 $LD $a5,$SIZE_T*6($np) 1798 $LD $acc5,$SIZE_T*6($tp) 1799 $ST $t2,$SIZE_T*3($rp) 1800 subfe $t2,$a6,$acc6 1801 $LD $a6,$SIZE_T*7($np) 1802 $LD $acc6,$SIZE_T*7($tp) 1803 $ST $t3,$SIZE_T*4($rp) 1804 subfe $t3,$a7,$acc7 1805 $LDU $a7,$SIZE_T*8($np) 1806 $LDU $acc7,$SIZE_T*8($tp) 1807 $ST $t0,$SIZE_T*5($rp) 1808 subfe $t0,$a0,$acc0 1809 $ST $t1,$SIZE_T*6($rp) 1810 subfe $t1,$a1,$acc1 1811 $ST $t2,$SIZE_T*7($rp) 1812 $STU $t3,$SIZE_T*8($rp) 1813 bdnz .Lsqr8x_sub 1814 1815 srwi $cnt,$num,`log($SIZE_T)/log(2)+2` 1816 $LD $a0,$SIZE_T*1($ap_end) # original $rp 1817 $LD $acc0,$SIZE_T*1($n0) # original $tp 1818 subi $cnt,$cnt,1 1819 $LD $a1,$SIZE_T*2($ap_end) 1820 $LD $acc1,$SIZE_T*2($n0) 1821 subfe $t2,$a2,$acc2 1822 $LD $a2,$SIZE_T*3($ap_end) 1823 $LD $acc2,$SIZE_T*3($n0) 1824 subfe $t3,$a3,$acc3 1825 $LD $a3,$SIZE_T*4($ap_end) 1826 $LDU $acc3,$SIZE_T*4($n0) 1827 $ST $t0,$SIZE_T*1($rp) 1828 subfe $t0,$a4,$acc4 1829 $ST $t1,$SIZE_T*2($rp) 1830 subfe $t1,$a5,$acc5 1831 $ST $t2,$SIZE_T*3($rp) 1832 subfe $t2,$a6,$acc6 1833 $ST $t3,$SIZE_T*4($rp) 1834 subfe $t3,$a7,$acc7 1835 $ST $t0,$SIZE_T*5($rp) 1836 subfe $carry,$zero,$carry # did it borrow? 1837 $ST $t1,$SIZE_T*6($rp) 1838 $ST $t2,$SIZE_T*7($rp) 1839 $ST $t3,$SIZE_T*8($rp) 1840 1841 addi $tp,$sp,$SIZE_T*11 1842 mtctr $cnt 1843 1844.Lsqr4x_cond_copy: 1845 andc $a0,$a0,$carry 1846 $ST $zero,-$SIZE_T*3($n0) # wipe stack clean 1847 and $acc0,$acc0,$carry 1848 $ST $zero,-$SIZE_T*2($n0) 1849 andc $a1,$a1,$carry 1850 $ST $zero,-$SIZE_T*1($n0) 1851 and $acc1,$acc1,$carry 1852 $ST $zero,-$SIZE_T*0($n0) 1853 andc $a2,$a2,$carry 1854 $ST $zero,$SIZE_T*1($tp) 1855 and $acc2,$acc2,$carry 1856 $ST $zero,$SIZE_T*2($tp) 1857 andc $a3,$a3,$carry 1858 $ST $zero,$SIZE_T*3($tp) 1859 and $acc3,$acc3,$carry 1860 $STU $zero,$SIZE_T*4($tp) 1861 or $t0,$a0,$acc0 1862 $LD $a0,$SIZE_T*5($ap_end) 1863 $LD $acc0,$SIZE_T*1($n0) 1864 or $t1,$a1,$acc1 1865 $LD $a1,$SIZE_T*6($ap_end) 1866 $LD $acc1,$SIZE_T*2($n0) 1867 or $t2,$a2,$acc2 1868 $LD $a2,$SIZE_T*7($ap_end) 1869 $LD $acc2,$SIZE_T*3($n0) 1870 or $t3,$a3,$acc3 1871 $LD $a3,$SIZE_T*8($ap_end) 1872 $LDU $acc3,$SIZE_T*4($n0) 1873 $ST $t0,$SIZE_T*1($ap_end) 1874 $ST $t1,$SIZE_T*2($ap_end) 1875 $ST $t2,$SIZE_T*3($ap_end) 1876 $STU $t3,$SIZE_T*4($ap_end) 1877 bdnz .Lsqr4x_cond_copy 1878 1879 $POP $ap,0($sp) # pull saved sp 1880 andc $a0,$a0,$carry 1881 and $acc0,$acc0,$carry 1882 andc $a1,$a1,$carry 1883 and $acc1,$acc1,$carry 1884 andc $a2,$a2,$carry 1885 and $acc2,$acc2,$carry 1886 andc $a3,$a3,$carry 1887 and $acc3,$acc3,$carry 1888 or $t0,$a0,$acc0 1889 or $t1,$a1,$acc1 1890 or $t2,$a2,$acc2 1891 or $t3,$a3,$acc3 1892 $ST $t0,$SIZE_T*1($ap_end) 1893 $ST $t1,$SIZE_T*2($ap_end) 1894 $ST $t2,$SIZE_T*3($ap_end) 1895 $ST $t3,$SIZE_T*4($ap_end) 1896 1897 b .Lsqr8x_done 1898 1899.align 5 1900.Lsqr8x8_post_condition: 1901 $POP $rp,$SIZE_T*6($sp) # pull rp 1902 $POP $ap,0($sp) # pull saved sp 1903 addze $carry,$zero 1904 1905 # $acc0-7,$carry hold result, $a0-7 hold modulus 1906 subfc $acc0,$a0,$acc0 1907 subfe $acc1,$a1,$acc1 1908 $ST $zero,$SIZE_T*12($sp) # wipe stack clean 1909 $ST $zero,$SIZE_T*13($sp) 1910 subfe $acc2,$a2,$acc2 1911 $ST $zero,$SIZE_T*14($sp) 1912 $ST $zero,$SIZE_T*15($sp) 1913 subfe $acc3,$a3,$acc3 1914 $ST $zero,$SIZE_T*16($sp) 1915 $ST $zero,$SIZE_T*17($sp) 1916 subfe $acc4,$a4,$acc4 1917 $ST $zero,$SIZE_T*18($sp) 1918 $ST $zero,$SIZE_T*19($sp) 1919 subfe $acc5,$a5,$acc5 1920 $ST $zero,$SIZE_T*20($sp) 1921 $ST $zero,$SIZE_T*21($sp) 1922 subfe $acc6,$a6,$acc6 1923 $ST $zero,$SIZE_T*22($sp) 1924 $ST $zero,$SIZE_T*23($sp) 1925 subfe $acc7,$a7,$acc7 1926 $ST $zero,$SIZE_T*24($sp) 1927 $ST $zero,$SIZE_T*25($sp) 1928 subfe $carry,$zero,$carry # did it borrow? 1929 $ST $zero,$SIZE_T*26($sp) 1930 $ST $zero,$SIZE_T*27($sp) 1931 1932 and $a0,$a0,$carry 1933 and $a1,$a1,$carry 1934 addc $acc0,$acc0,$a0 # add modulus back if borrowed 1935 and $a2,$a2,$carry 1936 adde $acc1,$acc1,$a1 1937 and $a3,$a3,$carry 1938 adde $acc2,$acc2,$a2 1939 and $a4,$a4,$carry 1940 adde $acc3,$acc3,$a3 1941 and $a5,$a5,$carry 1942 adde $acc4,$acc4,$a4 1943 and $a6,$a6,$carry 1944 adde $acc5,$acc5,$a5 1945 and $a7,$a7,$carry 1946 adde $acc6,$acc6,$a6 1947 adde $acc7,$acc7,$a7 1948 $ST $acc0,$SIZE_T*1($rp) 1949 $ST $acc1,$SIZE_T*2($rp) 1950 $ST $acc2,$SIZE_T*3($rp) 1951 $ST $acc3,$SIZE_T*4($rp) 1952 $ST $acc4,$SIZE_T*5($rp) 1953 $ST $acc5,$SIZE_T*6($rp) 1954 $ST $acc6,$SIZE_T*7($rp) 1955 $ST $acc7,$SIZE_T*8($rp) 1956 1957.Lsqr8x_done: 1958 $PUSH $zero,$SIZE_T*8($sp) 1959 $PUSH $zero,$SIZE_T*10($sp) 1960 1961 $POP r14,-$SIZE_T*18($ap) 1962 li r3,1 # signal "done" 1963 $POP r15,-$SIZE_T*17($ap) 1964 $POP r16,-$SIZE_T*16($ap) 1965 $POP r17,-$SIZE_T*15($ap) 1966 $POP r18,-$SIZE_T*14($ap) 1967 $POP r19,-$SIZE_T*13($ap) 1968 $POP r20,-$SIZE_T*12($ap) 1969 $POP r21,-$SIZE_T*11($ap) 1970 $POP r22,-$SIZE_T*10($ap) 1971 $POP r23,-$SIZE_T*9($ap) 1972 $POP r24,-$SIZE_T*8($ap) 1973 $POP r25,-$SIZE_T*7($ap) 1974 $POP r26,-$SIZE_T*6($ap) 1975 $POP r27,-$SIZE_T*5($ap) 1976 $POP r28,-$SIZE_T*4($ap) 1977 $POP r29,-$SIZE_T*3($ap) 1978 $POP r30,-$SIZE_T*2($ap) 1979 $POP r31,-$SIZE_T*1($ap) 1980 mr $sp,$ap 1981 blr 1982 .long 0 1983 .byte 0,12,4,0x20,0x80,18,6,0 1984 .long 0 1985.size __bn_sqr8x_mont,.-__bn_sqr8x_mont 1986___ 1987} 1988$code.=<<___; 1989.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" 1990___ 1991 1992$code =~ s/\`([^\`]*)\`/eval $1/gem; 1993print $code; 1994close STDOUT or die "error closing STDOUT: $!"; 1995