#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.

# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves a few percent for shorter keys and worsens a few
# percent for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below].
# It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)

# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.

# TODO:
# - modulo-schedule inner loop for better performance (on in-order
#   execution core such as UltraSPARC this shall result in further
#   noticeable(!) improvement);
# - dedicated squaring procedure[?];

######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.

# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Redirect STDOUT to the requested output file. Three-argument open
# avoids mode injection via the filename, and a failed open dies
# loudly instead of silently emitting the generated code to the
# terminal (the historical 2-arg form ignored errors).
if ($output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Name of the generated assembly routine.
$fname="bn_mul_mont_fpu";

# ABI-dependent constants resolved by sparc_arch.h at assembly time.
$frame="STACK_FRAME";
$bias="STACK_BIAS";
$locals=64;	# bytes of scratch space below the register save area

# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# Register assignments mirror the C prototype:
# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$tp="%l0";	# t[num]
$ap_l="%l1";	# a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2";	# to these four vectors as double-precision FP values.
$np_l="%l3";	# This way a bunch of fxtods are eliminated in second
$np_h="%l4";	# loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7";	# 16-bit mask, 0xffff

$n0="%g4";	# reassigned(!) to "64-bit" register
$carry="%i4";	# %i4 reused(!) for a carry bit

# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#    ..a
#   ..b
$ba="%f0";    $bb="%f2";    $bc="%f4";    $bd="%f6";
$na="%f8";    $nb="%f10";   $nc="%f12";   $nd="%f14";
$alo="%f16";  $alo_="%f17"; $ahi="%f18";  $ahi_="%f19";
$nlo="%f20";  $nlo_="%f21"; $nhi="%f22";  $nhi_="%f23";

$dota="%f24"; $dotb="%f26";

$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";

$ASI_FL16_P=0xD2;	# magic ASI value to engage 16-bit FP load

# Start of the generated assembly; Perl scalars above are interpolated
# into the heredoc below.
$code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

.global $fname
.align  32
$fname:
	save	%sp,-$frame-$locals,%sp

	cmp	$num,4
	bl,a,pn	%icc,.Lret
	clr	%i0
	andcc	$num,1,%g0		! $num has to be even...
	bnz,a,pn	%icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	$num,1,$num
	sethi	%hi(0xffff),$mask
	ld	[%i4+0],$n0		! $n0 reassigned, remember?
	or	$mask,%lo(0xffff),$mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
	sll	$num,3,$num		! num*=8

	add	%sp,$bias,%o0		! real top of stack
	sll	$num,2,%o1
	add	%o1,$num,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,$bias,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,$bias+$frame+$locals,$tp
	add	$tp,$num,$ap_l
	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
	add	$ap_l,$num,$ap_h
	add	$ap_h,$num,$np_l
	add	$np_l,$num,$np_h

	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads

	add	$rp,$num,$rp		! readjust input pointers to point
	add	$ap,$num,$ap		! at the ends too...
	add	$bp,$num,$bp
	add	$np,$num,$np

	stx	%o7,[%sp+$bias+$frame+48]	! save %asi

	sub	%g0,$num,$i		! i=-num
	sub	%g0,$num,$j		! j=-num

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! bp[0]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	add	$np,$j,%o5

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+$bias+$frame+0]

	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o3+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	fxtod	$alo,$alo
	ldda	[%o4+0]%asi,$bb
	fxtod	$ahi,$ahi
	ldda	[%o4+6]%asi,$bc
	fxtod	$nlo,$nlo
	ldda	[%o4+4]%asi,$bd
	fxtod	$nhi,$nhi

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd

	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fxtod	$na,$na
	std	$ahi,[$ap_h+$j]
	fxtod	$nb,$nb
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fxtod	$nc,$nc
	std	$nhi,[$np_h+$j]
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	add	$j,8,$j
	std	$nlob,[%sp+$bias+$frame+8]
	add	$ap,$j,%o4
	std	$nloc,[%sp+$bias+$frame+16]
	add	$np,$j,%o5
	std	$nlod,[%sp+$bias+$frame+24]

	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	!and	%o0,$mask,%o0
	!and	%o1,$mask,%o1
	!and	%o2,$mask,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
	fmuld	$ahi,$bb,$ahib

	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$dota,$nloa,$nloa
	faddd	$dotb,$nlob,$nlob
	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.L1stskip
	std	$nlod,[%sp+$bias+$frame+24]

.align	32			! incidentally already aligned !
.L1st:
	add	$ap,$j,%o4
	add	$np,$j,%o5
	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
	fzeros	$alo
	ld	[%o4+4],$ahi_
	fzeros	$ahi
	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
	fzeros	$nlo
	ld	[%o5+4],$nhi_
	fzeros	$nhi

	fxtod	$alo,$alo
	fxtod	$ahi,$ahi
	fxtod	$nlo,$nlo
	fxtod	$nhi,$nhi

	ldx	[%sp+$bias+$frame+0],%o0
	fmuld	$alo,$ba,$aloa
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$nlo,$na,$nloa
	ldx	[%sp+$bias+$frame+16],%o2
	fmuld	$alo,$bb,$alob
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$nlo,$nb,$nlob

	srlx	%o0,16,%o7
	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
	fmuld	$alo,$bc,$aloc
	add	%o7,%o1,%o1
	std	$ahi,[$ap_h+$j]
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	srlx	%o1,16,%o7
	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
	fmuld	$alo,$bd,$alod
	add	%o7,%o2,%o2
	std	$nhi,[$np_h+$j]
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	srlx	%o2,16,%o7
	fmuld	$ahi,$ba,$ahia
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	fmuld	$ahi,$bb,$ahib
	sllx	%o1,16,%o1
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	sllx	%o2,32,%o2
	fmuld	$ahi,$bc,$ahic
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	or	%o2,%o0,%o0
	fmuld	$ahi,$bd,$ahid
	or	%o7,%o0,%o0		! 64-bit result
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	addcc	%g1,%o0,%o0
	faddd	$dota,$nloa,$nloa
	srlx	%o3,16,%g1		! 34-bit carry
	faddd	$dotb,$nlob,$nlob
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid
	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	std	$nlod,[%sp+$bias+$frame+24]

	addcc	$j,8,$j
	bnz,pt	%icc,.L1st
	add	$tp,8,$tp

.L1stskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+$bias+$frame+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]=
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	mov	%g1,$carry
	stx	%o4,[$tp]		! tp[num-1]=

	ba	.Louter
	add	$i,8,$i
.align	32
.Louter:
	sub	%g0,$num,$j		! j=-num
	add	%sp,$bias+$frame+$locals,$tp

	add	$ap,$j,%o3
	add	$bp,$i,%o4

	ld	[%o3+4],%g1		! bp[i]
	ld	[%o3+0],%o0
	ld	[%o4+4],%g5		! ap[0]
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0
	or	%g5,%o1,%o1

	ldx	[$tp],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+$bias+$frame+0]

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,$ba
	ldda	[%o4+0]%asi,$bb
	ldda	[%o4+6]%asi,$bc
	ldda	[%o4+4]%asi,$bd

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+$bias+$frame+6]%asi,$na
	fxtod	$ba,$ba
	ldda	[%sp+$bias+$frame+4]%asi,$nb
	fxtod	$bb,$bb
	ldda	[%sp+$bias+$frame+2]%asi,$nc
	fxtod	$bc,$bc
	ldda	[%sp+$bias+$frame+0]%asi,$nd
	fxtod	$bd,$bd
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	fxtod	$na,$na
	ldd	[$ap_h+$j],$ahi
	fxtod	$nb,$nb
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	fxtod	$nc,$nc
	ldd	[$np_h+$j],$nhi
	fxtod	$nd,$nd

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	fmuld	$alo,$bd,$alod
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	fmuld	$ahi,$ba,$ahia
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	fmuld	$ahi,$bb,$ahib
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	fmuld	$ahi,$bc,$ahic
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	fmuld	$ahi,$bd,$ahid
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid

	faddd	$ahic,$nhic,$dota	! $nhic
	faddd	$ahid,$nhid,$dotb	! $nhid

	faddd	$nloc,$nhia,$nloc
	faddd	$nlod,$nhib,$nlod

	fdtox	$nloa,$nloa
	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	add	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]

	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! why?
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[$tp],%o7
	faddd	$nloc,$nhia,$nloc
	addcc	%o7,%o0,%o0
	! end-of-why?
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	fdtox	$nlob,$nlob
	fdtox	$nloc,$nloc
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	addcc	$j,8,$j
	std	$nloc,[%sp+$bias+$frame+16]
	bz,pn	%icc,.Linnerskip
	std	$nlod,[%sp+$bias+$frame+24]

	ba	.Linner
	nop
.align	32
.Linner:
	ldd	[$ap_l+$j],$alo		! load a[j] in double format
	ldd	[$ap_h+$j],$ahi
	ldd	[$np_l+$j],$nlo		! load n[j] in double format
	ldd	[$np_h+$j],$nhi

	fmuld	$alo,$ba,$aloa
	fmuld	$nlo,$na,$nloa
	fmuld	$alo,$bb,$alob
	fmuld	$nlo,$nb,$nlob
	fmuld	$alo,$bc,$aloc
	ldx	[%sp+$bias+$frame+0],%o0
	faddd	$aloa,$nloa,$nloa
	fmuld	$nlo,$nc,$nloc
	ldx	[%sp+$bias+$frame+8],%o1
	fmuld	$alo,$bd,$alod
	ldx	[%sp+$bias+$frame+16],%o2
	faddd	$alob,$nlob,$nlob
	fmuld	$nlo,$nd,$nlod
	ldx	[%sp+$bias+$frame+24],%o3
	fmuld	$ahi,$ba,$ahia

	srlx	%o0,16,%o7
	faddd	$aloc,$nloc,$nloc
	fmuld	$nhi,$na,$nhia
	add	%o7,%o1,%o1
	fmuld	$ahi,$bb,$ahib
	srlx	%o1,16,%o7
	faddd	$alod,$nlod,$nlod
	fmuld	$nhi,$nb,$nhib
	add	%o7,%o2,%o2
	fmuld	$ahi,$bc,$ahic
	srlx	%o2,16,%o7
	faddd	$ahia,$nhia,$nhia
	fmuld	$nhi,$nc,$nhic
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	fmuld	$ahi,$bd,$ahid
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	faddd	$ahib,$nhib,$nhib
	fmuld	$nhi,$nd,$nhid
	sllx	%o1,16,%o1
	faddd	$dota,$nloa,$nloa
	sllx	%o2,32,%o2
	faddd	$dotb,$nlob,$nlob
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	faddd	$ahic,$nhic,$dota	! $nhic
	or	%o2,%o0,%o0
	faddd	$ahid,$nhid,$dotb	! $nhid
	or	%o7,%o0,%o0		! 64-bit result
	faddd	$nloc,$nhia,$nloc
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	faddd	$nlod,$nhib,$nlod
	srlx	%o3,16,%g1		! 34-bit carry
	fdtox	$nloa,$nloa
	bcs,a	%xcc,.+8
	add	%g1,1,%g1
	fdtox	$nlob,$nlob
	addcc	%o7,%o0,%o0
	fdtox	$nloc,$nloc
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	fdtox	$nlod,$nlod

	std	$nloa,[%sp+$bias+$frame+0]
	std	$nlob,[%sp+$bias+$frame+8]
	std	$nloc,[%sp+$bias+$frame+16]
	addcc	$j,8,$j
	std	$nlod,[%sp+$bias+$frame+24]
	bnz,pt	%icc,.Linner
	add	$tp,8,$tp

.Linnerskip:
	fdtox	$dota,$dota
	fdtox	$dotb,$dotb

	ldx	[%sp+$bias+$frame+0],%o0
	ldx	[%sp+$bias+$frame+8],%o1
	ldx	[%sp+$bias+$frame+16],%o2
	ldx	[%sp+$bias+$frame+24],%o3

	srlx	%o0,16,%o7
	std	$dota,[%sp+$bias+$frame+32]
	add	%o7,%o1,%o1
	std	$dotb,[%sp+$bias+$frame+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,$mask,%o0
	and	%o1,$mask,%o1
	and	%o2,$mask,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+$bias+$frame+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+$bias+$frame+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[$tp+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[$tp]		! tp[j-1]
	add	$tp,8,$tp

	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,$mask,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	addcc	$carry,%o4,%o4
	stx	%o4,[$tp]		! tp[num-1]
	mov	%g1,$carry
	bcs,a	%xcc,.+8
	add	$carry,1,$carry

	addcc	$i,8,$i
	bnz	%icc,.Louter
	nop

	add	$tp,8,$tp		! adjust tp to point at the end
	orn	%g0,%g0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	32
.Lsub:
	ldx	[$tp+%o7],%o0
	add	$np,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	$rp,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]
	subc	$carry,0,%g4
	sub	%g0,$num,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	ldx	[$tp+%o7],%o0
	add	$rp,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[$tp+%o7]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]
	sub	%g0,$num,%o7		! n=-num

.Lzap:
	stx	%g0,[$ap_l+%o7]
	stx	%g0,[$ap_h+%o7]
	stx	%g0,[$np_l+%o7]
	stx	%g0,[$np_h+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+$bias+$frame+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0
.Lret:
	ret
	restore
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___

# Expand any `...` snippets embedded in the template by evaluating
# them as Perl (standard perlasm post-processing step).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Below substitution makes it possible to compile without demanding
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this, because VIS capability is detected at run-time now
# and this routine is not called on CPU not capable to execute it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded to %asi register,
# which assembler can't recognize as VIS specific...
$code =~ s/fzeros\s+%f([0-9]+)/
	sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
	/gem;

print $code;
# flush
close STDOUT or die "error closing STDOUT: $!";