#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2007

# The reason for undertaking this effort is basically the following.
# Even though the Power 6 CPU operates at an incredible 4.7GHz clock
# frequency, its PKI performance was observed to be less than
# impressive, essentially as fast as a 1.8GHz PPC970, or 2.6 times(!)
# slower than one would hope. Well, it's not surprising that IBM had
# to make some sacrifices to boost the clock frequency that much, but
# no overall improvement? Having observed how much difference
# switching to the FPU made on UltraSPARC, playing the same stunt on
# Power 6 appeared appropriate... Unfortunately the resulting
# performance improvement is not as impressive, ~30%, and in absolute
# terms it is still very far from what one would expect of a 4.7GHz
# CPU. There is a chance that I'm doing something wrong, but in the
# absence of assembler-level micro-profiling data, or at least a
# decent platform guide, I can't tell... Better results might be
# achieved with VMX... Anyway, this module provides *worse*
# performance on other PowerPC implementations: ~15-40% slower on
# PPC970 depending on key length, and ~40% slower on Power 5 for all
# key lengths. As it's obviously inappropriate as a "best all-round"
# alternative, it has to be complemented with run-time CPU family
# detection. Oh! It should also be noted that, unlike on other PowerPC
# implementations, the IALU ppc-mont.pl module performs *suboptimally*
# on >=1024-bit key lengths on Power 6. Note too that *everything*
# said so far applies to 64-bit builds! As far as 32-bit applications
# executed on 64-bit CPUs go, this module is likely to become the
# preferred choice, because it's easy to adapt for that case and *is*
# faster than 32-bit ppc-mont.pl on *all* processors.

# February 2008

# Micro-profiling assisted optimization results in a ~15% improvement
# over the original ppc64-mont.pl version, or an overall ~50%
# improvement over the ppc.pl module on Power 6. Compared to
# ppc-mont.pl on the same Power 6 CPU, this module is 5-150% faster
# depending on key length, [hereafter] more for longer keys. Compared
# to ppc-mont.pl on a 1.8GHz PPC970, however, it's only 5-55% faster.
# Still far from impressive in absolute terms, but that's apparently
# the way Power 6 is...

# December 2009

# Adapted for a 32-bit build, this module delivers a 25-120%
# performance improvement over 32-bit ppc-mont.pl on a 1.8GHz PPC970;
# yes, more than *twice* as fast for longer keys. However! This
# implementation still utilizes 64-bit integer operations, and the
# trouble is that most PPC operating systems don't preserve the upper
# halves of general purpose registers upon 32-bit signal delivery.
# They do preserve them upon context switch, but not upon
# signalling:-( This means that asynchronous signals have to be
# blocked upon entry to this subroutine. Signal masking (and of course
# the complementary unmasking) has quite an impact on performance,
# naturally larger for shorter keys. It's so severe that 512-bit key
# performance can be as low as 1/3 of the expected one. This is why,
# on such OSes, this routine is engaged only for longer key
# operations; see crypto/ppccap.c for further details. MacOS X is an
# exception and doesn't require signal masking; that's where the
# improvement coefficients above were collected. For the others, the
# alternative would be to break the dependence on the upper halves of
# the GPRs by sticking to 32-bit integer operations...

# December 2012

# Remove the above-mentioned dependence on the GPRs' upper halves in
# the 32-bit build. There is no signal-masking overhead anymore, but
# the integer instructions are *more* numerous... It's still
# "universally" faster than 32-bit ppc-mont.pl, though the improvement
# coefficient is not as impressive for longer keys...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /32/) {
	$SIZE_T=4;
	$RZONE=	224;
	$fname=	"bn_mul_mont_fpu64";

	$STUX=	"stwux";	# store indexed and update
	$PUSH=	"stw";
	$POP=	"lwz";
} elsif ($flavour =~ /64/) {
	$SIZE_T=8;
	$RZONE=	288;
	$fname=	"bn_mul_mont_fpu64";

	# same as above, but 64-bit mnemonics...
	$STUX=	"stdux";	# store indexed and update
	$PUSH=	"std";
	$POP=	"ld";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
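# The script is driven the same way as the other perlasm modules:
# flavour first (matched against /32/, /64/, /le$/ and /osx/), output
# file last. An illustrative invocation (the flavour names here are
# assumed, not checked by this script):
#
#	perl ppc64-mont.pl linux64le ppc64-mont.s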
$FRAME=64;	# padded frame header
$TRANSFER=16*8;

$carry="r0";
$sp="r1";
$toc="r2";
$rp="r3";	$ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";	# $rp is reassigned
$tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$c1="r19";
$n1="r20";
$a1="r21";
$nap_d="r22";	# interleaved ap and np in double format
$a0="r23";	# ap[0]
$t0="r24";	# temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";

# PPC offers enough register bank capacity to unroll inner loops twice
#
#	 ..A3A2A1A0
#	       dcba
#	-----------
#	        A0a
#	       A0b
#	      A0c
#	     A0d
#	       A1a
#	      A1b
#	     A1c
#	    A1d
#	      A2a
#	     A2b
#	    A2c
#	   A2d
#	     A3a
#	    A3b
#	   A3c
#	  A3d
#	  ..a
#	 ..b
#
$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
$dota="f8";	$dotb="f9";
$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
$T0a="f24";	$T0b="f25";
$T1a="f26";	$T1b="f27";
$T2a="f28";	$T2b="f29";
$T3a="f30";	$T3b="f31";

# sp----------->+-------------------------------+
#		| saved sp			|
#		+-------------------------------+
#		.				.
#   +64		+-------------------------------+
#		| 16 gpr<->fpr transfer zone	|
#		.				.
#		.				.
#   +16*8	+-------------------------------+
#		| __int64 tmp[-1]		|
#		+-------------------------------+
#		| __int64 tmp[num]		|
#		.				.
#		.				.
#		.				.
#   +(num+1)*8	+-------------------------------+
#		| padding to 64 byte boundary	|
#		.				.
#   +X		+-------------------------------+
#		| double nap_d[4*num]		|
#		.				.
#		.				.
#		.				.
#		+-------------------------------+
#		.				.
#   -13*size_t	+-------------------------------+
#		| 13 saved gpr, r19-r31		|
#		.				.
#		.				.
#   -12*8	+-------------------------------+
#		| 12 saved fpr, f20-f31		|
#		.				.
#		.				.
#		+-------------------------------+
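# Editorial note on the 16-bit limb choice: an IEEE double carries a
# 53-bit mantissa, so a sum of a handful of 16x16-bit products (each
# below 2^32) plus a carried-over dot-product term stays well below
# 2^53 and is therefore computed *exactly* by fmul/fmadd; fctid can
# then convert the column sums back to 64-bit integers for carry
# propagation with no rounding anywhere.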
$code=<<___;
.machine "any"
.text

.globl	.$fname
.align	5
.$fname:
	cmpwi	$num,`3*8/$SIZE_T`
	mr	$rp,r3		; $rp is reassigned
	li	r3,0		; possible "not handled" return code
	bltlr-
	andi.	r0,$num,`16/$SIZE_T-1`	; $num has to be "even"
	bnelr-

	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
	li	$i,-4096
	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
	add	$tp,$tp,$num	; place for tp[num+1]
	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
	subf	$tp,$tp,$sp	; $sp-$tp
	and	$tp,$tp,$i	; minimize TLB usage
	subf	$tp,$sp,$tp	; $tp-$sp
	mr	$i,$sp
	$STUX	$sp,$sp,$tp	; alloca

	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
	stfd	f20,`-12*8`($i)
	stfd	f21,`-11*8`($i)
	stfd	f22,`-10*8`($i)
	stfd	f23,`-9*8`($i)
	stfd	f24,`-8*8`($i)
	stfd	f25,`-7*8`($i)
	stfd	f26,`-6*8`($i)
	stfd	f27,`-5*8`($i)
	stfd	f28,`-4*8`($i)
	stfd	f29,`-3*8`($i)
	stfd	f30,`-2*8`($i)
	stfd	f31,`-1*8`($i)

	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
	li	$i,-64
	add	$nap_d,$tp,$num
	and	$nap_d,$nap_d,$i	; align to 64 bytes
	; nap_d is off by 1, because it's used with stfdu/lfdu
	addi	$nap_d,$nap_d,-8
	srwi	$j,$num,`3+1`	; counter register, num/2
	addi	$j,$j,-1
	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
	li	$carry,0
	mtctr	$j
___

$code.=<<___ if ($SIZE_T==8);
	ld	$a0,0($ap)		; pull ap[0] value
	ld	$t3,0($bp)		; bp[0]
	ld	$n0,0($n0)		; pull n0[0] value

	mulld	$t7,$a0,$t3		; ap[0]*bp[0]
	; transfer bp[0] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0		; tp[0]*n0
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
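# Editorial note: the extrdi quartets above slice the 64-bit bp[0] and
# the derived Montgomery multiplier (ap[0]*bp[0])*n0 into four 16-bit
# limbs each, parked in the transfer zone so the FPU can lfd/fcfid
# them into exact doubles.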
$code.=<<___ if ($SIZE_T==4);
	lwz	$a0,0($ap)		; pull ap[0,1] value
	mr	$n1,$n0
	lwz	$a1,4($ap)
	li	$c1,0
	lwz	$t1,0($bp)		; bp[0,1]
	lwz	$t3,4($bp)
	lwz	$n0,0($n1)		; pull n0[0,1] value
	lwz	$n1,4($n1)

	mullw	$t4,$a0,$t1		; mulld ap[0]*bp[0]
	mulhwu	$t5,$a0,$t1
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	; transfer bp[0] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)

	mr	$t0,$a0			; lwz	$t0,0($ap)
	mr	$t1,$a1			; lwz	$t1,4($ap)
	lwz	$t2,8($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
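	; editorial note: every T?a/T?b above is now an exact
	; integer-valued double (a small sum of 16x16-bit products,
	; far below 2^53); fctid below turns them into 64-bit
	; integers in place, and the stfd/load round-trip through the
	; transfer zone hands the column sums to the IALU for carry
	; propagation.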

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
L1st:
___
$code.=<<___ if ($SIZE_T==8);
	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
___
$code.=<<___ if ($SIZE_T==4);
	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
	lwz	$t1,4($ap)
	lwz	$t2,8($ap)
	lwz	$t3,12($ap)
	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
	lwz	$t5,4($np)
	lwz	$t6,8($np)
	lwz	$t7,12($np)
___
$code.=<<___;
	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
	std	$t1,`$FRAME+72`($sp)
	std	$t2,`$FRAME+80`($sp)
	std	$t3,`$FRAME+88`($sp)
	std	$t4,`$FRAME+96`($sp)
	std	$t5,`$FRAME+104`($sp)
	std	$t6,`$FRAME+112`($sp)
	std	$t7,`$FRAME+120`($sp)
___
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
___
}
$code.=<<___;
	lfd	$A0,`$FRAME+64`($sp)
	lfd	$A1,`$FRAME+72`($sp)
	lfd	$A2,`$FRAME+80`($sp)
	lfd	$A3,`$FRAME+88`($sp)
	lfd	$N0,`$FRAME+96`($sp)
	lfd	$N1,`$FRAME+104`($sp)
	lfd	$N2,`$FRAME+112`($sp)
	lfd	$N3,`$FRAME+120`($sp)
	fcfid	$A0,$A0
	fcfid	$A1,$A1
	fcfid	$A2,$A2
	fcfid	$A3,$A3
	fcfid	$N0,$N0
	fcfid	$N1,$N1
	fcfid	$N2,$N2
	fcfid	$N3,$N3
	addi	$ap,$ap,16
	addi	$np,$np,16

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	stfd	$A0,8($nap_d)		; save a[j] in double format
	stfd	$A1,16($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	stfd	$A2,24($nap_d)		; save a[j+1] in double format
	stfd	$A3,32($nap_d)
___
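# The carry-propagation halves of L1st (and of Linner below) come in
# two builds: 64-bit integer instructions for $SIZE_T==8 and for
# 32-bit OS X, where the GPRs' upper halves survive signal delivery
# (see the December 2009 note above), and a pure 32-bit addc/adde
# variant for the other 32-bit targets (the December 2012 change).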
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	add	$t0,$t0,$carry		; can not overflow
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	insrdi	$t0,$t1,16,32
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	srdi	$carry,$t2,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t3,16,0		; 0..63 bits
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t4,$t4,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	add	$t6,$t6,$carry
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	srdi	$carry,$t6,16
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	insrdi	$t4,$t6,16,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	std	$t0,8($tp)		; tp[j-1]
	stdu	$t4,16($tp)		; tp[j]
___
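# Editorial note on the pattern above: each $t register holds one
# 16-bit result column plus its accumulated carry; the add/srdi pairs
# ripple the carry from column to column, while insrdi packs four
# adjacent columns back into full 64-bit tp[] words (hence the
# "0..63 bits" and "64..127 bits" markers).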
} else {
$code.=<<___;
	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	stfd	$N0,40($nap_d)		; save n[j] in double format
	stfd	$N1,48($nap_d)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	stfd	$N2,56($nap_d)		; save n[j+1] in double format
	stfdu	$N3,64($nap_d)
	insrwi	$t0,$t2,16,0		; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	insrwi	$t4,$t6,16,0		; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	stw	$t0,12($tp)		; tp[j-1]
	stw	$t4,8($tp)
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	insrwi	$t2,$t6,16,0		; 64..95 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	insrwi	$t0,$t4,16,0		; 96..127 bits
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,20($tp)		; tp[j]
	stwu	$t0,16($tp)
___
}
$code.=<<___;
	bdnz	L1st

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
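# Once L1st falls through, the pending dota/dotb column is folded in
# to form the top word of the num+1-word temporary tp[]; whatever
# spills past tp[num-1] is kept in $ovf for the final reduction.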
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	std	$t0,8($tp)		; tp[j-1]
	stdu	$t4,16($tp)		; tp[j]

	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,8($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0		; 0..31 bits
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	stw	$t0,12($tp)		; tp[j-1]
	stw	$t4,8($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	stw	$t2,20($tp)		; tp[j]
	stwu	$t0,16($tp)

	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,12($tp)		; tp[num-1]
	stw	$t4,8($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	subf	$nap_d,$t7,$nap_d	; rewind pointer

	li	$i,8			; i=1
.align	5
Louter:
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	li	$carry,0
	mtctr	$j
___
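# Louter is the classic word-by-word Montgomery loop: for each further
# word of bp[], a multiplier m = (tp[0]+ap[0]*bp[i])*n0 mod 2^64 (or
# 2^32) is derived so that tp + bp[i]*ap + m*np comes out divisible by
# the word base, and the a[] and n[] limbs converted on the first pass
# are simply replayed from nap_d.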
$code.=<<___ if ($SIZE_T==8);
	ldx	$t3,$bp,$i	; bp[i]

	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
	; transfer bp[i] to FPU as 4x16-bit values
	extrdi	$t0,$t3,16,48
	extrdi	$t1,$t3,16,32
	extrdi	$t2,$t3,16,16
	extrdi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mulld	$t7,$t7,$n0	; tp[0]*n0
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrdi	$t4,$t7,16,48
	extrdi	$t5,$t7,16,32
	extrdi	$t6,$t7,16,16
	extrdi	$t7,$t7,16,0
	std	$t4,`$FRAME+32`($sp)
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==4);
	add	$t0,$bp,$i
	li	$c1,0
	lwz	$t1,0($t0)	; bp[i,i+1]
	lwz	$t3,4($t0)

	mullw	$t4,$a0,$t1	; ap[0]*bp[i]
	lwz	$t0,`$FRAME+$TRANSFER+8+4`($sp)	; tp[0]
	mulhwu	$t5,$a0,$t1
	lwz	$t2,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
	mullw	$t6,$a1,$t1
	mullw	$t7,$a0,$t3
	add	$t5,$t5,$t6
	add	$t5,$t5,$t7
	addc	$t4,$t4,$t0	; ap[0]*bp[i]+tp[0]
	adde	$t5,$t5,$t2
	; transfer bp[i] to FPU as 4x16-bit values
	extrwi	$t0,$t1,16,16
	extrwi	$t1,$t1,16,0
	extrwi	$t2,$t3,16,16
	extrwi	$t3,$t3,16,0
	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
	std	$t1,`$FRAME+8`($sp)
	std	$t2,`$FRAME+16`($sp)
	std	$t3,`$FRAME+24`($sp)

	mullw	$t0,$t4,$n0	; mulld tp[0]*n0
	mulhwu	$t1,$t4,$n0
	mullw	$t2,$t5,$n0
	mullw	$t3,$t4,$n1
	add	$t1,$t1,$t2
	add	$t1,$t1,$t3
	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
	extrwi	$t4,$t0,16,16
	extrwi	$t5,$t0,16,0
	extrwi	$t6,$t1,16,16
	extrwi	$t7,$t1,16,0
	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
	std	$t5,`$FRAME+40`($sp)
	std	$t6,`$FRAME+48`($sp)
	std	$t7,`$FRAME+56`($sp)
___
$code.=<<___;
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	lfd	$ba,`$FRAME+0`($sp)
	lfd	$bb,`$FRAME+8`($sp)
	lfd	$bc,`$FRAME+16`($sp)
	lfd	$bd,`$FRAME+24`($sp)
	lfd	$na,`$FRAME+32`($sp)
	lfd	$nb,`$FRAME+40`($sp)
	lfd	$nc,`$FRAME+48`($sp)
	lfd	$nd,`$FRAME+56`($sp)

	fcfid	$ba,$ba
	fcfid	$bb,$bb
	fcfid	$bc,$bc
	fcfid	$bd,$bd
	fcfid	$na,$na
	fcfid	$nb,$nb
	fcfid	$nc,$nc
	fcfid	$nd,$nd

	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmul	$T0a,$A0,$ba
	fmul	$T0b,$A0,$bb

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd

	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)

.align	5
Linner:
	fmul	$T1a,$A1,$ba
	fmul	$T1b,$A1,$bb
	fmul	$T2a,$A2,$ba
	fmul	$T2b,$A2,$bb
	lfd	$N0,40($nap_d)		; load n[j] in double format
	lfd	$N1,48($nap_d)
	fmul	$T3a,$A3,$ba
	fmul	$T3b,$A3,$bb
	fmadd	$T0a,$A0,$ba,$dota
	fmadd	$T0b,$A0,$bb,$dotb
	lfd	$N2,56($nap_d)		; load n[j+1] in double format
	lfdu	$N3,64($nap_d)

	fmadd	$T1a,$A0,$bc,$T1a
	fmadd	$T1b,$A0,$bd,$T1b
	fmadd	$T2a,$A1,$bc,$T2a
	fmadd	$T2b,$A1,$bd,$T2b
	lfd	$A0,8($nap_d)		; load a[j] in double format
	lfd	$A1,16($nap_d)
	fmadd	$T3a,$A2,$bc,$T3a
	fmadd	$T3b,$A2,$bd,$T3b
	fmul	$dota,$A3,$bc
	fmul	$dotb,$A3,$bd
	lfd	$A2,24($nap_d)		; load a[j+1] in double format
	lfd	$A3,32($nap_d)
___
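# Editorial note: inside Linner the FPU multiply-accumulate for
# iteration j runs interleaved with the IALU carry propagation of
# iteration j-1's column sums (fetched back from the transfer zone)
# and their addition to the previous pass's tp[] words; the software
# pipelining helps hide the fcfid/fctid and load/store latencies.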
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	add	$t0,$t0,$carry		; can not overflow
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	insrdi	$t0,$t1,16,32
	ld	$t1,8($tp)		; tp[j]
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	add	$t2,$t2,$carry
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry

	fctid	$T0a,$T0a
	fctid	$T0b,$T0b
	srdi	$carry,$t4,16
	fctid	$T1a,$T1a
	fctid	$T1b,$T1b
	add	$t5,$t5,$carry
	fctid	$T2a,$T2a
	fctid	$T2b,$T2b
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	fctid	$T3a,$T3a
	fctid	$T3b,$T3b
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16

	stfd	$T0a,`$FRAME+0`($sp)
	stfd	$T0b,`$FRAME+8`($sp)
	add	$t7,$t7,$carry
	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	stfd	$T1a,`$FRAME+16`($sp)
	stfd	$T1b,`$FRAME+24`($sp)
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	stfd	$T2a,`$FRAME+32`($sp)
	stfd	$T2b,`$FRAME+40`($sp)
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	stfd	$T3a,`$FRAME+48`($sp)
	stfd	$T3b,`$FRAME+56`($sp)
	addze	$carry,$carry
	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]
___
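# Editorial note: the "adjust XER[CA]" snippets serve the 32-bit OS X
# case, where the 64-bit additions above execute in 32-bit mode and
# the CPU therefore derives XER[CA] from the carry out of bit 32.
# Re-adding the two high halves (extrdi ...,32,0) together with that
# low-word carry leaves XER[CA] reflecting the carry out of the full
# 64-bit sum again.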
} else {
$code.=<<___;
	fmadd	$T1a,$N1,$na,$T1a
	fmadd	$T1b,$N1,$nb,$T1b
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	fmadd	$T2a,$N2,$na,$T2a
	fmadd	$T2b,$N2,$nb,$T2b
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	fmadd	$T3a,$N3,$na,$T3a
	fmadd	$T3b,$N3,$nb,$T3b
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	fmadd	$T0a,$N0,$na,$T0a
	fmadd	$T0b,$N0,$nb,$T0b
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	srwi	$c1,$t1,16
	insrwi	$carry,$t1,16,0

	fmadd	$T1a,$N0,$nc,$T1a
	fmadd	$T1b,$N0,$nd,$T1b
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	fmadd	$T2a,$N1,$nc,$T2a
	fmadd	$T2b,$N1,$nd,$T2b
	insrwi	$t0,$t2,16,0		; 0..31 bits
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	fmadd	$T3a,$N2,$nc,$T3a
	fmadd	$T3b,$N2,$nd,$T3b
	lwz	$t2,12($tp)		; tp[j]
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	fmadd	$dota,$N3,$nc,$dota
	fmadd	$dotb,$N3,$nd,$dotb
	srwi	$c1,$t5,16
	insrwi	$carry,$t5,16,0

	fctid	$T0a,$T0a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	fctid	$T0b,$T0b
	insrwi	$t4,$t6,16,0		; 32..63 bits
	srwi	$c1,$t7,16
	insrwi	$carry,$t7,16,0
	fctid	$T1a,$T1a
	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	fctid	$T1b,$T1b
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)		; tp[j-1]
	stw	$t4,0($tp)
	fctid	$T2a,$T2a
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	fctid	$T2b,$T2b
	srwi	$c1,$t3,16
	insrwi	$carry,$t3,16,0
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	fctid	$T3a,$T3a
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
	fctid	$T3b,$T3b

	insrwi	$t2,$t6,16,0		; 64..95 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwz	$t6,20($tp)
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	stfd	$T0a,`$FRAME+0`($sp)
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	stfd	$T0b,`$FRAME+8`($sp)
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	stfd	$T1a,`$FRAME+16`($sp)
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	stfd	$T1b,`$FRAME+24`($sp)
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	stfd	$T2a,`$FRAME+32`($sp)
	adde	$t0,$t0,$t7
	stfd	$T2b,`$FRAME+40`($sp)
	addze	$carry,$carry
	stfd	$T3a,`$FRAME+48`($sp)
	addze	$c1,$c1
	stfd	$T3b,`$FRAME+56`($sp)
	stw	$t2,-4($tp)		; tp[j]
	stw	$t0,-8($tp)
___
}
$code.=<<___;
	bdnz	Linner

	fctid	$dota,$dota
	fctid	$dotb,$dotb
___
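# As with L1st, the Linner epilogue folds in the final dota/dotb
# column; the difference is that the previous outer pass's top word
# and overflow bit ($ovf) are consumed here as well.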
if ($SIZE_T==8 or $flavour =~ /osx/) {
$code.=<<___;
	ld	$t0,`$FRAME+0`($sp)
	ld	$t1,`$FRAME+8`($sp)
	ld	$t2,`$FRAME+16`($sp)
	ld	$t3,`$FRAME+24`($sp)
	ld	$t4,`$FRAME+32`($sp)
	ld	$t5,`$FRAME+40`($sp)
	ld	$t6,`$FRAME+48`($sp)
	ld	$t7,`$FRAME+56`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	add	$t0,$t0,$carry		; can not overflow
	srdi	$carry,$t0,16
	add	$t1,$t1,$carry
	srdi	$carry,$t1,16
	insrdi	$t0,$t1,16,32
	add	$t2,$t2,$carry
	ld	$t1,8($tp)		; tp[j]
	srdi	$carry,$t2,16
	insrdi	$t0,$t2,16,16
	add	$t3,$t3,$carry
	ldu	$t2,16($tp)		; tp[j+1]
	srdi	$carry,$t3,16
	insrdi	$t0,$t3,16,0		; 0..63 bits
	add	$t4,$t4,$carry
	srdi	$carry,$t4,16
	add	$t5,$t5,$carry
	srdi	$carry,$t5,16
	insrdi	$t4,$t5,16,32
	add	$t6,$t6,$carry
	srdi	$carry,$t6,16
	insrdi	$t4,$t6,16,16
	add	$t7,$t7,$carry
	insrdi	$t4,$t7,16,0		; 64..127 bits
	srdi	$carry,$t7,16		; upper 33 bits
	ld	$t6,`$FRAME+64`($sp)
	ld	$t7,`$FRAME+72`($sp)

	addc	$t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t0,$t0,32,0
	extrdi	$t1,$t1,32,0
	adde	$t0,$t0,$t1
___
$code.=<<___;
	adde	$t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
	extrdi	$t4,$t4,32,0
	extrdi	$t2,$t2,32,0
	adde	$t4,$t4,$t2
___
$code.=<<___;
	addze	$carry,$carry

	std	$t3,-16($tp)		; tp[j-1]
	std	$t5,-8($tp)		; tp[j]

	add	$carry,$carry,$ovf	; consume upmost overflow
	add	$t6,$t6,$carry		; can not overflow
	srdi	$carry,$t6,16
	add	$t7,$t7,$carry
	insrdi	$t6,$t7,48,0
	srdi	$ovf,$t7,48
	std	$t6,0($tp)		; tp[num-1]
___
} else {
$code.=<<___;
	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
	stfd	$dota,`$FRAME+64`($sp)
	stfd	$dotb,`$FRAME+72`($sp)

	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$t0,$t2,16,0		; 0..31 bits
	lwz	$t2,12($tp)		; tp[j]
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	lwz	$t3,8($tp)
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t4,$t6,16,0		; 32..63 bits
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16

	addc	$t0,$t0,$t2
	adde	$t4,$t4,$t3
	addze	$carry,$carry
	addze	$c1,$c1
	stw	$t0,4($tp)		; tp[j-1]
	stw	$t4,0($tp)

	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6

	addc	$t2,$t2,$carry
	adde	$t3,$t3,$c1
	srwi	$carry,$t2,16
	insrwi	$carry,$t3,16,0
	srwi	$c1,$t3,16
	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	srwi	$carry,$t6,16
	insrwi	$t2,$t6,16,0		; 64..95 bits
	lwz	$t6,20($tp)
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	lwzu	$t7,16($tp)
	addc	$t0,$t0,$carry
	adde	$t1,$t1,$c1
	srwi	$carry,$t0,16
	insrwi	$carry,$t1,16,0
	srwi	$c1,$t1,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1
	srwi	$carry,$t4,16
	insrwi	$t0,$t4,16,0		; 96..127 bits
	insrwi	$carry,$t5,16,0
	srwi	$c1,$t5,16

	addc	$t2,$t2,$t6
	adde	$t0,$t0,$t7
	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
	addze	$carry,$carry
	addze	$c1,$c1
	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)

	addc	$t6,$t6,$carry
	adde	$t7,$t7,$c1
	stw	$t2,-4($tp)		; tp[j]
	stw	$t0,-8($tp)
	addc	$t6,$t6,$ovf
	addze	$t7,$t7
	srwi	$carry,$t6,16
	insrwi	$carry,$t7,16,0
	srwi	$c1,$t7,16
	addc	$t4,$t4,$carry
	adde	$t5,$t5,$c1

	insrwi	$t6,$t4,16,0
	srwi	$t4,$t4,16
	insrwi	$t4,$t5,16,0
	srwi	$ovf,$t5,16
	stw	$t6,4($tp)		; tp[num-1]
	stw	$t4,0($tp)
___
}
$code.=<<___;
	slwi	$t7,$num,2
	addi	$i,$i,8
	subf	$nap_d,$t7,$nap_d	; rewind pointer
	cmpw	$i,$num
	blt-	Louter
___
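# The tail is the usual Montgomery conditional subtraction, done
# branch-free: Lsub computes tp - np with borrow, subfe stretches
# $ovf and the final borrow into an all-ones ("keep tp") or all-zero
# ("take tp-np") mask, and Lcopy selects through and/andc/or while
# zapping tp and nap_d on the way out.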

$code.=<<___ if ($SIZE_T==8);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
	addi	$t5,$np,8
	addi	$t6,$rp,8
	mtctr	$j

.align	4
Lsub:	ldx	$t0,$tp,$i
	ldx	$t1,$np,$i
	ldx	$t2,$t4,$i
	ldx	$t3,$t5,$i
	subfe	$t0,$t1,$t0	; tp[j]-np[j]
	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
	stdx	$t0,$rp,$i
	stdx	$t2,$t6,$i
	addi	$i,$i,16
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	ldx	$t0,$tp,$i
	ldx	$t1,$t4,$i
	ldx	$t2,$rp,$i
	ldx	$t3,$t6,$i
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	andc	$t2,$t2,$ovf
	andc	$t3,$t3,$ovf
	or	$t0,$t0,$t2
	or	$t1,$t1,$t3
	stdx	$t0,$rp,$i
	stdx	$t1,$t6,$i
	stdx	$i,$tp,$i	; zap tp at once
	stdx	$i,$t4,$i
	addi	$i,$i,16
	bdnz	Lcopy
___
$code.=<<___ if ($SIZE_T==4);
	subf	$np,$num,$np	; rewind np
	addi	$j,$j,1		; restore counter
	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	addi	$np,$np,-4
	addi	$rp,$rp,-4
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	mtctr	$j

.align	4
Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
	lwz	$t1,8($tp)
	lwz	$t2,20($tp)
	lwzu	$t3,16($tp)
	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
	lwz	$t5,8($np)
	lwz	$t6,12($np)
	lwzu	$t7,16($np)
	subfe	$t4,$t4,$t0	; tp[j]-np[j]
	stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
	stw	$t1,8($ap)
	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
	stw	$t2,12($ap)
	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
	stwu	$t3,16($ap)
	stw	$t4,4($rp)
	stw	$t5,8($rp)
	stw	$t6,12($rp)
	stwu	$t7,16($rp)
	bdnz	Lsub

	li	$i,0
	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
	subf	$rp,$num,$rp	; rewind rp
	addi	$tp,$sp,`$FRAME+$TRANSFER`
	mtctr	$j

.align	4
Lcopy:				; conditional copy
	lwz	$t0,4($ap)
	lwz	$t1,8($ap)
	lwz	$t2,12($ap)
	lwzu	$t3,16($ap)
	lwz	$t4,4($rp)
	lwz	$t5,8($rp)
	lwz	$t6,12($rp)
	lwz	$t7,16($rp)
	std	$i,8($nap_d)	; zap nap_d
	std	$i,16($nap_d)
	std	$i,24($nap_d)
	std	$i,32($nap_d)
	std	$i,40($nap_d)
	std	$i,48($nap_d)
	std	$i,56($nap_d)
	stdu	$i,64($nap_d)
	and	$t0,$t0,$ovf
	and	$t1,$t1,$ovf
	and	$t2,$t2,$ovf
	and	$t3,$t3,$ovf
	andc	$t4,$t4,$ovf
	andc	$t5,$t5,$ovf
	andc	$t6,$t6,$ovf
	andc	$t7,$t7,$ovf
	or	$t0,$t0,$t4
	or	$t1,$t1,$t5
	or	$t2,$t2,$t6
	or	$t3,$t3,$t7
	stw	$t0,4($rp)
	stw	$t1,8($rp)
	stw	$t2,12($rp)
	stwu	$t3,16($rp)
	std	$i,8($tp)	; zap tp at once
	stdu	$i,16($tp)
	bdnz	Lcopy
___

$code.=<<___;
	$POP	$i,0($sp)
	li	r3,1		; signal "handled"
	$POP	r19,`-12*8-13*$SIZE_T`($i)
	$POP	r20,`-12*8-12*$SIZE_T`($i)
	$POP	r21,`-12*8-11*$SIZE_T`($i)
	$POP	r22,`-12*8-10*$SIZE_T`($i)
	$POP	r23,`-12*8-9*$SIZE_T`($i)
	$POP	r24,`-12*8-8*$SIZE_T`($i)
	$POP	r25,`-12*8-7*$SIZE_T`($i)
	$POP	r26,`-12*8-6*$SIZE_T`($i)
	$POP	r27,`-12*8-5*$SIZE_T`($i)
	$POP	r28,`-12*8-4*$SIZE_T`($i)
	$POP	r29,`-12*8-3*$SIZE_T`($i)
	$POP	r30,`-12*8-2*$SIZE_T`($i)
	$POP	r31,`-12*8-1*$SIZE_T`($i)
	lfd	f20,`-12*8`($i)
	lfd	f21,`-11*8`($i)
	lfd	f22,`-10*8`($i)
	lfd	f23,`-9*8`($i)
	lfd	f24,`-8*8`($i)
	lfd	f25,`-7*8`($i)
	lfd	f26,`-6*8`($i)
	lfd	f27,`-5*8`($i)
	lfd	f28,`-4*8`($i)
	lfd	f29,`-3*8`($i)
	lfd	f30,`-2*8`($i)
	lfd	f31,`-1*8`($i)
	mr	$sp,$i
	blr
	.long	0
	.byte	0,12,4,0,0x8c,13,6,0
	.long	0
.size	.$fname,.-.$fname

.asciz	"Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";