#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that the "baseline" performance was
# optimal with respect to instruction set capabilities. A fair comparison
# with the vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives the compiler
# toward four 16x16=32-bit multiplications [plus complementary shifts
# and additions] instead. This means that you should observe a
# several-fold improvement over code generated by the vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such a machine with a
# vendor compiler. But to give you a taste, the PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by a factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to a comment in pa-risc2[W].s]. Here comes a catch. The execution core
# of this implementation is actually a 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can a 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs the "64-bit" computational task in the same
# number of arithmetic operations, most notably multiplications. It
# requires more memory references, most notably to tp[num], but this
# doesn't seem to exhaust memory port capacity. And indeed, the
# dedicated PA-RISC 2.0 code path provides virtually the same
# performance as pa-risc2[W].s: it's ~10% better for the shortest key
# length and ~10% worse for the longest one.
#
# In case it wasn't clear: the module has two distinct code paths, a
# PA-RISC 1.1 and a PA-RISC 2.0 one. The latter features carry-free
# 64-bit additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In a 64-bit build naturally only the 2.0 code
# path is assembled.
# In a 32-bit application context both code paths are assembled, a
# PA-RISC 2.0 CPU is detected at run-time, and the proper path is taken
# automatically. Also, in a 32-bit build the module imposes a couple of
# limitations: vector lengths have to be even and vector addresses have
# to be 64-bit aligned. Normally neither is a problem: the most common
# key lengths are even and vectors are commonly malloc-ed, which
# ensures alignment.
#
# Special thanks to polarhome.com for providing an HP-UX account on a
# PA-RISC 1.1 machine, and to a correspondent who chose to remain
# anonymous for testing the code on a PA-RISC 2.0 machine.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	if (open CONF,"<${dir}../../opensslconf.h") {
	    while(<CONF>) {
		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
			$BN_SZ=8;
			$LEVEL="2.0";
			last;
		}
	    }
	    close CONF;
	}
}

$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#                [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

$tp="%r31";
$ti1="%r29";
$ti0="%r28";

$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";

$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";

$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop					; alignment
___
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0		; are vectors long enough?
	b		L\$abort
	ldi		0,%r28			; signal "unhandled"
	add,ev		%r0,$num,$num		; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
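	; (extru,= above nullifies the branch below when ap|np is 64-bit
	;  aligned, so only misaligned inputs take the abort path)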
	b		L\$abort
	nop
	nop					; alignment
	nop

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}		; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort		; are vectors long enough?
	ldi		0,%r28			; signal "unhandled"
	addl		$num,$num,$num		; I operate on 32-bit values

	fldws		4($n0),${fn0}		; only low part of n0
	fldws		4($bp),${fbi}		; bp[0] in flipped word order
___
$code.=<<___;
	fldds		0($ap),${fai}		; ap[0,1]
	fldds		0($np),${fni}		; np[0,1]

	sh2addl		$num,%r0,$arrsz
	ldi		31,$hi0
	ldo		36($arrsz),$hi1		; space for tp[num+1]
	andcm		$hi1,$hi0,$hi1		; align
	addl		$hi1,%sp,%sp
	$PUSH		$fp,-$SIZE_T(%sp)

	ldo		`$LOCALS+16`($fp),$xfer
	ldo		`$LOCALS+32+4`($fp),$tp

	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu		${fn0},${fab0}R,${fm0}

	addl		$arrsz,$ap,$ap		; point at the end
	addl		$arrsz,$np,$np
	subi		0,$arrsz,$idx		; j=0
	ldo		8($idx),$idx		; j++++

	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	fstds		${fab1},0($xfer)
	fstds		${fnm1},8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11		; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
$code.=<<___;				# PA-RISC 2.0 code-path
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	ldo		8($idx),$idx		; j++++
	addl		$ab0,$nm0,$nm0		; low part is discarded
	extrd,u		$nm0,31,32,$hi1

L\$1st
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	stw		$nm1,-4($tp)		; tp[j-1]
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$1st		; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	stw		$nm1,-4($tp)		; tp[j-1]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	ldo		-1($num),$num		; i--
	subi		0,$arrsz,$idx		; j=0
___
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[1]
___
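# The sub below is an editorial illustration only: it is hypothetical,
# is never called by this module, and merely restates in plain Perl the
# arithmetic the emitted code is expected to perform -- one word-serial
# Montgomery reduction step per 32-bit word of bp, followed by the final
# conditional subtraction handled by the subtract/copy tails further
# down.  $n0inv plays the role of the n0 word loaded above (the
# Montgomery constant -1/n mod 2^32).
sub mont_mul_ref {
	my ($xa,$xb,$xn,$n0inv,$nwords) = @_;	# $xa,$xb,$xn are Math::BigInt
	require Math::BigInt;
	my $t = Math::BigInt->bzero();
	for my $i (0..$nwords-1) {
		my $bi = ($xb >> (32*$i)) & 0xffffffff;	# bp[i]
		$t += $xa * $bi;			# tp[] += ap[]*bp[i]
		my $m = (($t & 0xffffffff) * $n0inv) & 0xffffffff;
		$t = ($t + $xn * $m) >> 32;		# tp[] = (tp[]+np[]*m)/2^32
	}
	$t -= $xn if ($t >= $xn);	# final conditional subtraction
	return $t;			# == a*b/2^(32*nwords) mod n
}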
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}		; bp[1] in flipped word order
___
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)		; tp[j-1]

	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2]
	flddx		$idx($np),${fni}	; np[2]
	ldo		8($idx),$idx		; j++++
	ldd		-16($xfer),$ab0		; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0		; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	extrd,u		$ab0,31,32,$ti0		; carry bit
	extrd,u		$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	addl		$ti0,$hi0,$hi0		; account carry bit
	fstds		${fnm1},8($xfer)
	addl		$ab0,$nm0,$nm0		; low part is discarded
	ldw		0($tp),$ti1		; tp[1]
	extrd,u		$nm0,31,32,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

L\$inner
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldw		4($tp),$ti0		; tp[j]
	stw		$nm1,-4($tp)		; tp[j-1]

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ti0,$ti0
	addl		$ti0,$ab0,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,31,32,$hi0
	extrd,u		$nm1,31,32,$hi1
	ldw		8($tp),$ti1		; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$inner		; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldw		4($tp),$ti0		; tp[j]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)		; tp[j-1]
	extrd,u		$ab0,31,32,$hi0
	ldw		8($tp),$ti1		; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)		; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx		; j=0
___
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}		; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0			; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)		; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0		; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)		; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0		; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
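	; (extru,= above nullifies the branch below when rp is 64-bit
	;  aligned, so the 64-bit L\$copy tail can be used; otherwise the
	;  word-by-word L\$sub_pa11/L\$copy_pa11 path is taken)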
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
___
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)		; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
___
$code.=<<___;
	ldo		`$LOCALS+32`($fp),$tp
	sub		$rp,$arrsz,$rp		; rewind rp
	subi		0,$arrsz,$idx
L\$copy
	ldd		0($tp),$ti0
	ldd		0($rp),$hi0
	std,ma		%r0,8($tp)
	comiclr,=	0,$hi1,%r0
	copy		$ti0,$hi0
	addib,<>	8,$idx,L\$copy
	std,ma		$hi0,8($rp)
___

if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";

$code.=<<___;
	b		L\$done
	nop

	.ALIGN	8
L\$parisc11
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-12($xfer),$ablo
	ldw		-16($xfer),$hi0
	ldw		-4($xfer),$nmlo0
	ldw		-8($xfer),$nmhi0
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

	ldo		8($idx),$idx		; j++++
	add		$ablo,$nmlo0,$nmlo0	; discarded
	addc		%r0,$nmhi0,$hi1
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi
	nop

L\$1st_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		12($xfer),$nmlo1
	addc		%r0,$abhi,$hi0
	ldw		8($xfer),$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fab1},0($xfer)
	addc		%r0,$nmhi1,$nmhi1
	fstds		${fnm1},8($xfer)
	add		$hi1,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$hi1
	ldw		-16($xfer),$abhi

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	ldw		-4($xfer),$nmlo0
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-8($xfer),$nmhi0
	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)		; tp[j-1]
	addc		%r0,$abhi,$hi0
	fstds		${fab0},-16($xfer)
	add		$ablo,$nmlo0,$nmlo0
	fstds		${fnm0},-8($xfer)
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	stws,ma		$nmlo0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$1st_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	ldw		8($xfer),$nmhi1
	ldw		12($xfer),$nmlo1
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	add		$hi0,$ablo,$ablo
	fstds		${fab1},0($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-8($xfer),$nmhi0
	addc		%r0,$nmhi1,$hi1
	ldw		-4($xfer),$nmlo0

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)		; tp[j-1]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)		; tp[j-1]

	ldo		-1($num),$num		; i--
	subi		0,$arrsz,$idx		; j=0

	fldws,ma	4($bp),${fbi}		; bp[1]
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$hi0
	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nmlo1,-4($tp)		; tp[j-1]

	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer_pa11
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
	ldw		-16($xfer),$abhi	; carry bit actually
	ldo		8($idx),$idx		; j++++
	ldw		-12($xfer),$ablo
	ldw		-8($xfer),$nmhi0
	ldw		-4($xfer),$nmlo0
	ldw		0($xfer),$hi0		; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	fstds		${fab1},0($xfer)
	addl		$abhi,$hi0,$hi0		; account carry bit
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo0,$nmlo0	; discarded
	ldw		0($tp),$ti1		; tp[1]
	addc		%r0,$nmhi0,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi

L\$inner_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0		; tp[j]
	addc		%r0,$abhi,$abhi
	ldw		12($xfer),$nmlo1
	add		$ti1,$ablo,$ablo
	ldw		8($xfer),$nmhi1
	addc		%r0,$abhi,$hi0
	fstds		${fab1},0($xfer)
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fnm1},8($xfer)
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	ldw		8($tp),$ti1		; tp[j]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-4($xfer),$nmlo0
	add		$hi0,$ablo,$ablo
	ldw		-8($xfer),$nmhi0
	addc		%r0,$abhi,$abhi
	stw		$nmlo1,-4($tp)		; tp[j-1]
	add		$ti0,$ablo,$ablo
	fstds		${fab0},-16($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm0},-8($xfer)
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	stws,ma		$nmlo0,8($tp)		; tp[j-1]
	addib,<>	8,$idx,L\$inner_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	ldw		12($xfer),$nmlo1
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldw		8($xfer),$nmhi1
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0		; tp[j]
	addc		%r0,$abhi,$abhi
	fstds		${fab1},0($xfer)
	add		$ti1,$ablo,$ablo
	fstds		${fnm1},8($xfer)
	addc		%r0,$abhi,$hi0
	ldw		-16($xfer),$abhi
	add		$ablo,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$nmhi1
	ldw		-8($xfer),$nmhi0
	add		$hi1,$nmlo1,$nmlo1
	ldw		-4($xfer),$nmlo0
	addc		%r0,$nmhi1,$hi1

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)		; tp[j-1]
	addc		%r0,$abhi,$abhi
	add		$ti0,$ablo,$ablo
	ldw		8($tp),$ti1		; tp[j]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)		; tp[j-1]

	addib,=		-1,$num,L\$outerdone_pa11; i--
	subi		0,$arrsz,$idx		; j=0

	fldws,ma	4($bp),${fbi}		; bp[i]
	flddx		$idx($ap),${fai}	; ap[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldo		8($idx),$idx		; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0		; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fcpy,sgl	%fr0,${fti}L		; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nmlo1,-4($tp)		; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer_pa11
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone_pa11
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldw		4($tp),$ti0		; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	stw		$nmlo1,-4($tp)		; tp[j-1]

	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32+4`($fp),$tp
	sub		%r0,%r0,%r0		; clear borrow
	ldw		-4($tp),$ti0
	addl		$tp,$arrsz,$tp
L\$sub_pa11
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub_pa11
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1

	ldo		`$LOCALS+32`($fp),$tp
	sub		$rp,$arrsz,$rp		; rewind rp
	subi		0,$arrsz,$idx
L\$copy_pa11
	ldw		0($tp),$ti0
	ldw		0($rp),$hi0
	stws,ma		%r0,4($tp)
	comiclr,=	0,$hi1,%r0
	copy		$ti0,$hi0
	addib,<>	4,$idx,L\$copy_pa11
	stws,ma		$hi0,4($rp)

	nop					; alignment
L\$done
___
}

$code.=<<___;
	ldi		1,%r28		; signal "handled"
	ldo		$FRAME($fp),%sp	; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this if the GNU assembler understood the
# .ALLOW 2.0 directive...
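#
# Each of the closures below is keyed by its mnemonic: assemble(),
# called from the output loop at the bottom of the file (only when
# $BN_SZ==4), looks the mnemonic up by name and, when an encoder
# exists, replaces the instruction with a raw "\t.WORD\t0x%08x" line
# carrying the original text as a trailing comment. Operand forms the
# encoders don't recognize, and mnemonics without an encoder, are
# passed through unchanged.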

my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

  if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
  elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
  else { "\t".$orig; }
};

my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

  if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)		# format 6
    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
  else { "\t".$orig; }
};

my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

  # I only have the ",u" completer; it's implicitly encoded...
  if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
	my $len=32-$3;
	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
  elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
	my $len=32-$2;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
  else { "\t".$orig; }
};

my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

  if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
	my $cpos=63-$3;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
  else { "\t".$orig; }
};

my $sub = sub {
  my ($mod,$args) = @_;
  my $orig = "sub$mod\t$args";

  if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
	$opcode|=(1<<10);	# e1
	$opcode|=(1<<8);	# e2
	$opcode|=(1<<5);	# d
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
  }
  else { "\t".$orig; }
};

sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $opcode = eval("\$$mnemonic");

  ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler/) {
    $gnuas = 1;
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
	s/\bbv\b/bve/			if ($SIZE_T==8);

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
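
# Editorial note, not part of the original module: when this generator
# is run by hand rather than through the OpenSSL build system, the
# argument handling at the top amounts to
#
#	perl <this-script> [flavour] [output.s]
#
# where a flavour string containing "64" selects the 2.0W (64-bit) code
# path and anything else the 32-bit build, and the output argument must
# end in an extension, otherwise the code is written to STDOUT.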