#! /usr/bin/env perl
# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It is surely possible to
# improve it [by deploying the 'mpmul' instruction], maybe in the
# future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                     sign     verify    sign/s  verify/s
# rsa 1024 bits  0.000628s  0.000028s    1592.4   35434.4
# rsa 2048 bits  0.003282s  0.000106s     304.7    9438.3
# rsa 4096 bits  0.025866s  0.000340s      38.7    2940.9
# dsa 1024 bits  0.000301s  0.000332s    3323.7    3013.9
# dsa 2048 bits  0.001056s  0.001233s     946.9     810.8
#
# 64-bit process, this module:
#                     sign     verify    sign/s  verify/s
# rsa 1024 bits  0.000256s  0.000016s    3904.4   61411.9
# rsa 2048 bits  0.000946s  0.000029s    1056.8   34292.7
# rsa 4096 bits  0.005061s  0.000340s     197.6    2940.5
# dsa 1024 bits  0.000176s  0.000195s    5674.7    5130.5
# dsa 2048 bits  0.000296s  0.000354s    3383.2    2827.6
#
######################################################################
# 32-bit process, VIS3:
#                     sign     verify    sign/s  verify/s
# rsa 1024 bits  0.000665s  0.000028s    1504.8   35233.3
# rsa 2048 bits  0.003349s  0.000106s     298.6    9433.4
# rsa 4096 bits  0.025959s  0.000341s      38.5    2934.8
# dsa 1024 bits  0.000320s  0.000341s    3123.3    2929.6
# dsa 2048 bits  0.001101s  0.001260s     908.2     793.4
#
# 32-bit process, this module:
#                     sign     verify    sign/s  verify/s
# rsa 1024 bits  0.000301s  0.000017s    3317.1   60240.0
# rsa 2048 bits  0.001034s  0.000030s     966.9   33812.7
# rsa 4096 bits  0.005244s  0.000341s     190.7    2935.4
# dsa 1024 bits  0.000201s  0.000205s    4976.1    4879.2
# dsa 2048 bits  0.000328s  0.000360s    3051.1    2774.2
#
# 32-bit code is prone to performance degradation as the interrupt
# rate dispatched to the CPU executing the code grows. This is because
# standard interrupt handling in a 32-bit process context zeroes the
# upper halves of most integer registers used as input or output,
# which renders the result invalid, so the operation has to be re-run.
# If the CPU is "bothered" with timer interrupts only, the penalty is
# hardly measurable. To mitigate this problem at higher interrupt
# rates, contemporary Linux kernels recognize a biased stack even in a
# 32-bit process context and preserve full register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.

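######################################################################
# All routines below operate on vectors of 64-bit values and compute
# the Montgomery product rp = ap*bp*R^-1 mod np with R = 2^(64*NUM)
# (Montgomery squaring when ap and bp coincide); n0 is the usual
# Montgomery constant derived from np. The sub below is a minimal
# whole-number sketch of that operation, kept purely as an
# illustration and debugging aid; it is never called while generating
# the assembly.

sub mont_mul_reference {
	my ($a, $b, $n, $num) = @_;	# operands as decimal strings, $num = word count
	require Math::BigInt;
	($a, $b, $n) = map { Math::BigInt->new($_) } ($a, $b, $n);
	my $r    = Math::BigInt->new(1)->blsft(64*$num);	# R = 2^(64*NUM)
	my $rinv = $r->copy()->bmodinv($n);			# R^-1 mod n, n must be odd
	return ($a * $b * $rinv)->bmod($n);
}
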
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop and open STDOUT,">$output";

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5)));	@N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7)));	@B=(@B,@B,map("%o$_",(0..3)));

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

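# $sentinel implements the safety net for the 32-bit register-half
# problem described at the top of this file: on 32-bit builds it is set
# to -1<<32 and OR-ed into %fp, and after every batch of work the code
# ANDs %fp back into the sentinel. If an interrupt handler has zeroed
# the upper register halves, the sentinel collapses to zero and the
# routine aborts, returning 0 so that the caller knows the result is
# invalid and has to be recomputed. On 64-bit builds the sentinel is 0
# and the checks are compiled out.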
$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef __arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef __arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef __arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef __arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}

########################################################################
#
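# load_ccr() and load_b()/load_b_pair() below implement a
# cache-timing-neutral gather from the power table: the 5-bit index is
# decomposed so that the selected entry sits at
#
#	pwrtbl + 8*(pwr&3) + 32*((pwr>>2)&7)
#
# The low two bits are applied directly to the table pointer (the four
# candidate slots share one 32-byte group), while 1<<((pwr>>2)&7) is
# written to %ccr as a one-hot mask. load_b() then reads all eight
# candidates, 32 bytes apart, and picks the right one with conditional
# moves keyed off individual %icc/%xcc flag bits, so the load addresses
# never depend on the upper index bits.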
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr, 2, %o4
	and	$pwr, 3, %o5
	and	%o4, 7, %o4
	sll	%o5, 3, %o5	! offset within first cache line
	add	%o5, $ptbl, $ptbl ! of the pwrtbl
	or	%g0, 1, %o5
	sll	%o5, %o4, $ccr
___
$code.=<<___ if (!$skip_wr);
	wr	$ccr, %g0, %ccr
___
}
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32], $B0
	ldx	[$pwrtbl+8*32], $B1
	ldx	[$pwrtbl+1*32], %o4
	ldx	[$pwrtbl+9*32], %o5
	movvs	%icc, %o4, $B0
	ldx	[$pwrtbl+2*32], %o4
	movvs	%icc, %o5, $B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc, %o4, $B0
	ldx	[$pwrtbl+3*32], %o4
	move	%icc, %o5, $B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc, %o4, $B0
	ldx	[$pwrtbl+4*32], %o4
	movneg	%icc, %o5, $B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc, %o4, $B0
	ldx	[$pwrtbl+5*32],%o4
	movcs	%xcc, %o5, $B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc, %o4, $B0
	ldx	[$pwrtbl+6*32], %o4
	movvs	%xcc, %o5, $B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc, %o4, $B0
	ldx	[$pwrtbl+7*32], %o4
	move	%xcc, %o5, $B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc, %o4, $B0
	add	$pwrtbl,16*32, $pwrtbl
	movneg	%xcc, %o5, $B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32], $Bi
	ldx	[$pwrtbl+1*32], %o4
	ldx	[$pwrtbl+2*32], %o5
	movvs	%icc, %o4, $Bi
	ldx	[$pwrtbl+3*32], %o4
	move	%icc, %o5, $Bi
	ldx	[$pwrtbl+4*32], %o5
	movneg	%icc, %o4, $Bi
	ldx	[$pwrtbl+5*32], %o4
	movcs	%xcc, %o5, $Bi
	ldx	[$pwrtbl+6*32], %o5
	movvs	%xcc, %o4, $Bi
	ldx	[$pwrtbl+7*32], %o4
	move	%xcc, %o5, $Bi
	add	$pwrtbl,8*32, $pwrtbl
	movneg	%xcc, %o4, $Bi
___
}

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
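# Each iteration of the loop generated below performs five Montgomery
# squarings of tp[] followed by one Montgomery multiplication by an
# entry gathered from pwrtbl, i.e. one step of the "multi-op" schedule
# mentioned at the top of this file; pwr and stride determine which
# table entries are consumed on each pass.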
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef __arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef __arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr, 32, %o4		! unpack $pwr
	srl	$pwr, %g0, %o5
	sub	%o4, 5, %o4
	mov	$pwrtbl, %o7
	sllx	%o4, 32, $pwr		! re-pack $pwr
	or	%o5, $pwr, $pwr
	srl	%o5, %o4, %o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr, 32, %o4		! unpack $pwr
	srl	$pwr, %g0, %o5
	sub	%o4, 5, %o4
	mov	$pwrtbl, %i7
	sllx	%o4, 32, $pwr		! re-pack $pwr
	or	%o5, $pwr, $pwr
	srl	%o5, %o4, %o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef __arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4, %g0, %ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef __arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr, 32, %o4
#ifdef __arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef __arch64__
	restore
#else
	and	%fp,$sentinel,$sentinel
	restore
	and	$sentinel,1,%o7
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
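# In outline the routine below is the usual word-serial Montgomery
# multiplication, consuming one 64-bit word of bp per outer iteration:
#
#	for (i=0; i<num; i++) {			# .L1st is the i==0 pass,
#		tp[] += ap[]*bp[i];		# .Louter/.Linner the rest
#		m1    = tp[0]*n0 mod 2^64;
#		tp[]  = (tp[] + m1*np[]) >> 64;	# low word cancels exactly
#	}
#	if (tp[] >= np[]) tp[] -= np[];		# .Lsub + .Lcopy
#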
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp, STACK_BIAS, %g4	! real top of stack
	sll	$num, 3, $num		! size in bytes
	add	$num, 63, %g1
	andn	%g1, 63, %g1		! buffer size rounded up to 64 bytes
	sub	%g4, %g1, %g1
	andn	%g1, 63, %g1		! align at 64 byte
	sub	%g1, STACK_FRAME, %g1	! new top of stack
	sub	%g1, %g4, %g1

	save	%sp, %g1, %sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0], $t0	! pull n0[0..1] value
	ld	[$n0p+4], $t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0], $m0	! m0=bp[0]
	sllx	$t1, 32, $n0
	add	$bp, 8, $bp
	or	$t0, $n0, $n0

	ldx	[$ap+0], $aj	! ap[0]

	mulx	$aj, $m0, $lo0	! ap[0]*bp[0]
	umulxhi	$aj, $m0, $hi0

	ldx	[$ap+8], $aj	! ap[1]
	add	$ap, 16, $ap
	ldx	[$np+0], $nj	! np[0]

	mulx	$lo0, $n0, $m1	! "tp[0]"*n0

	mulx	$aj, $m0, $alo	! ap[1]*bp[0]
	umulxhi	$aj, $m0, $aj	! ahi=aj

	mulx	$nj, $m1, $lo1	! np[0]*m1
	umulxhi	$nj, $m1, $hi1

	ldx	[$np+8], $nj	! np[1]

	addcc	$lo0, $lo1, $lo1
	add	$np, 16, $np
	addxc	%g0, $hi1, $hi1

	mulx	$nj, $m1, $nlo	! np[1]*m1
	umulxhi	$nj, $m1, $nj	! nhi=nj

	ba	.L1st
	sub	$num, 24, $cnt	! cnt=num-3

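	! .L1st below is the i==0 pass of the outer loop: tp[] does not
	! exist yet, so nothing is loaded from it and the partial products
	! are stored out directly; .Louter/.Linner handle i>=1 and fold in
	! the previously stored tp[j].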
.align	16
.L1st:
	addcc	$alo, $hi0, $lo0
	addxc	$aj, %g0, $hi0

	ldx	[$ap+0], $aj	! ap[j]
	addcc	$nlo, $hi1, $lo1
	add	$ap, 8, $ap
	addxc	$nj, %g0, $hi1	! nhi=nj

	ldx	[$np+0], $nj	! np[j]
	mulx	$aj, $m0, $alo	! ap[j]*bp[0]
	add	$np, 8, $np
	umulxhi	$aj, $m0, $aj	! ahi=aj

	mulx	$nj, $m1, $nlo	! np[j]*m1
	addcc	$lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj, $m1, $nj	! nhi=nj
	addxc	%g0, $hi1, $hi1
	stxa	$lo1, [$tp]0xe2	! tp[j-1]
	add	$tp, 8, $tp	! tp++

	brnz,pt	$cnt, .L1st
	sub	$cnt, 8, $cnt	! j--
!.L1st
	addcc	$alo, $hi0, $lo0
	addxc	$aj, %g0, $hi0	! ahi=aj

	addcc	$nlo, $hi1, $lo1
	addxc	$nj, %g0, $hi1
	addcc	$lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0, $hi1, $hi1
	stxa	$lo1, [$tp]0xe2	! tp[j-1]
	add	$tp, 8, $tp

	addcc	$hi0, $hi1, $hi1
	addxc	%g0, %g0, $ovf	! upmost overflow bit
	stxa	$hi1, [$tp]0xe2
	add	$tp, 8, $tp

	ba	.Louter
	sub	$num, 16, $i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0], $m0	! m0=bp[i]
	add	$bp, 8, $bp

	sub	$ap, $num, $ap	! rewind
	sub	$np, $num, $np
	sub	$tp, $num, $tp

	ldx	[$ap+0], $aj	! ap[0]
	ldx	[$np+0], $nj	! np[0]

	mulx	$aj, $m0, $lo0	! ap[0]*bp[i]
	ldx	[$tp], $tj	! tp[0]
	umulxhi	$aj, $m0, $hi0
	ldx	[$ap+8], $aj	! ap[1]
	addcc	$lo0, $tj, $lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj, $m0, $alo	! ap[1]*bp[i]
	addxc	%g0, $hi0, $hi0
	mulx	$lo0, $n0, $m1	! tp[0]*n0
	umulxhi	$aj, $m0, $aj	! ahi=aj
	mulx	$nj, $m1, $lo1	! np[0]*m1
	add	$ap, 16, $ap
	umulxhi	$nj, $m1, $hi1
	ldx	[$np+8], $nj	! np[1]
	add	$np, 16, $np
	addcc	$lo1, $lo0, $lo1
	mulx	$nj, $m1, $nlo	! np[1]*m1
	addxc	%g0, $hi1, $hi1
	umulxhi	$nj, $m1, $nj	! nhi=nj

	ba	.Linner
	sub	$num, 24, $cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo, $hi0, $lo0
	ldx	[$tp+8], $tj	! tp[j]
	addxc	$aj, %g0, $hi0	! ahi=aj
	ldx	[$ap+0], $aj	! ap[j]
	add	$ap, 8, $ap
	addcc	$nlo, $hi1, $lo1
	mulx	$aj, $m0, $alo	! ap[j]*bp[i]
	addxc	$nj, %g0, $hi1	! nhi=nj
	ldx	[$np+0], $nj	! np[j]
	add	$np, 8, $np
	umulxhi	$aj, $m0, $aj	! ahi=aj
	addcc	$lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj, $m1, $nlo	! np[j]*m1
	addxc	%g0, $hi0, $hi0
	umulxhi	$nj, $m1, $nj	! nhi=nj
	addcc	$lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0, $hi1, $hi1
	stx	$lo1, [$tp]	! tp[j-1]
	add	$tp, 8, $tp
	brnz,pt	$cnt, .Linner
	sub	$cnt, 8, $cnt
!.Linner
	ldx	[$tp+8], $tj	! tp[j]
	addcc	$alo, $hi0, $lo0
	addxc	$aj, %g0, $hi0	! ahi=aj
	addcc	$lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0, $hi0, $hi0

	addcc	$nlo, $hi1, $lo1
	addxc	$nj, %g0, $hi1	! nhi=nj
	addcc	$lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0, $hi1, $hi1
	stx	$lo1, [$tp]	! tp[j-1]

	subcc	%g0, $ovf, %g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1, $hi0, $hi1
	addxc	%g0, %g0, $ovf
	stx	$hi1, [$tp+8]
	add	$tp, 16, $tp

	brnz,pt	$i, .Louter
	sub	$i, 8, $i

	sub	$ap, $num, $ap	! rewind
	sub	$np, $num, $np
	sub	$tp, $num, $tp
	ba	.Lsub
	subcc	$num, 8, $cnt	! cnt=num-1 and clear CCR.xcc

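	! Final conditional subtraction: .Lsub writes tp[]-np[] to rp[]
	! while the borrow propagates through %icc, then .Lcopy keeps
	! either that difference or the original tp[] (zeroing tp[] on the
	! way), selecting with conditional moves rather than a branch.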
.align	16
.Lsub:
	ldx	[$tp], $tj
	add	$tp, 8, $tp
	ldx	[$np+0], $nj
	add	$np, 8, $np
	subccc	$tj, $nj, $t2	! tp[j]-np[j]
	srlx	$tj, 32, $tj
	srlx	$nj, 32, $nj
	subccc	$tj, $nj, $t3
	add	$rp, 8, $rp
	st	$t2, [$rp-4]	! reverse order
	st	$t3, [$rp-8]
	brnz,pt	$cnt, .Lsub
	sub	$cnt, 8, $cnt

	sub	$np, $num, $np	! rewind
	sub	$tp, $num, $tp
	sub	$rp, $num, $rp

	subccc	$ovf, %g0, $ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num, 8, $cnt

.align	16
.Lcopy:				! conditional copy
	ldx	[$tp], $tj
	ldx	[$rp+0], $t2
	stx	%g0, [$tp]	! zap
	add	$tp, 8, $tp
	movcs	%icc, $tj, $t2
	stx	$t2, [$rp+0]
	add	$rp, 8, $rp
	brnz	$cnt, .Lcopy
	sub	$cnt, 8, $cnt

	mov	1, %o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp, STACK_BIAS, %g4	! real top of stack
	sll	$num, 3, $num		! size in bytes
	add	$num, 63, %g1
	andn	%g1, 63, %g1		! buffer size rounded up to 64 bytes
	sub	%g4, %g1, %g1
	andn	%g1, 63, %g1		! align at 64 byte
	sub	%g1, STACK_FRAME, %g1	! new top of stack
	sub	%g1, %g4, %g1
	LDPTR	[%sp+STACK_7thARG], %g4	! load power, 7th argument

	save	%sp, %g1, %sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");	# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0], $t0	! pull n0[0..1] value
	ld	[$n0p+4], $t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1, 32, $n0
	or	$t0, $n0, $n0

	ldx	[$ap+0], $aj	! ap[0]

	mulx	$aj, $m0, $lo0	! ap[0]*bp[0]
	umulxhi	$aj, $m0, $hi0

	ldx	[$ap+8], $aj	! ap[1]
	add	$ap, 16, $ap
	ldx	[$np+0], $nj	! np[0]

	mulx	$lo0, $n0, $m1	! "tp[0]"*n0

	mulx	$aj, $m0, $alo	! ap[1]*bp[0]
	umulxhi	$aj, $m0, $aj	! ahi=aj

	mulx	$nj, $m1, $lo1	! np[0]*m1
	umulxhi	$nj, $m1, $hi1

	ldx	[$np+8], $nj	! np[1]

	addcc	$lo0, $lo1, $lo1
	add	$np, 16, $np
	addxc	%g0, $hi1, $hi1

	mulx	$nj, $m1, $nlo	! np[1]*m1
	umulxhi	$nj, $m1, $nj	! nhi=nj

	ba	.L1st_g5
	sub	$num, 24, $cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo, $hi0, $lo0
	addxc	$aj, %g0, $hi0

	ldx	[$ap+0], $aj	! ap[j]
	addcc	$nlo, $hi1, $lo1
	add	$ap, 8, $ap
	addxc	$nj, %g0, $hi1	! nhi=nj

	ldx	[$np+0], $nj	! np[j]
	mulx	$aj, $m0, $alo	! ap[j]*bp[0]
	add	$np, 8, $np
	umulxhi	$aj, $m0, $aj	! ahi=aj

	mulx	$nj, $m1, $nlo	! np[j]*m1
	addcc	$lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj, $m1, $nj	! nhi=nj
	addxc	%g0, $hi1, $hi1
	stxa	$lo1, [$tp]0xe2	! tp[j-1]
	add	$tp, 8, $tp	! tp++

	brnz,pt	$cnt, .L1st_g5
	sub	$cnt, 8, $cnt	! j--
!.L1st_g5
	addcc	$alo, $hi0, $lo0
	addxc	$aj, %g0, $hi0	! ahi=aj

	addcc	$nlo, $hi1, $lo1
	addxc	$nj, %g0, $hi1
	addcc	$lo0, $lo1, $lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0, $hi1, $hi1
	stxa	$lo1, [$tp]0xe2	! tp[j-1]
	add	$tp, 8, $tp

	addcc	$hi0, $hi1, $hi1
	addxc	%g0, %g0, $ovf	! upmost overflow bit
	stxa	$hi1, [$tp]0xe2
	add	$tp, 8, $tp

	ba	.Louter_g5
	sub	$num, 16, $i	! i=num-2

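	! %ccr is reloaded at the top of every outer iteration because the
	! addcc/subcc arithmetic above clobbers the flag bits that the
	! gather code (see load_b) uses to select the next bp word from
	! the power table.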
.align	16
.Louter_g5:
	wr	$ccr, %g0, %ccr
___
	&load_b($bp,$m0);	# m0=bp[i]
$code.=<<___;
	sub	$ap, $num, $ap	! rewind
	sub	$np, $num, $np
	sub	$tp, $num, $tp

	ldx	[$ap+0], $aj	! ap[0]
	ldx	[$np+0], $nj	! np[0]

	mulx	$aj, $m0, $lo0	! ap[0]*bp[i]
	ldx	[$tp], $tj	! tp[0]
	umulxhi	$aj, $m0, $hi0
	ldx	[$ap+8], $aj	! ap[1]
	addcc	$lo0, $tj, $lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj, $m0, $alo	! ap[1]*bp[i]
	addxc	%g0, $hi0, $hi0
	mulx	$lo0, $n0, $m1	! tp[0]*n0
	umulxhi	$aj, $m0, $aj	! ahi=aj
	mulx	$nj, $m1, $lo1	! np[0]*m1
	add	$ap, 16, $ap
	umulxhi	$nj, $m1, $hi1
	ldx	[$np+8], $nj	! np[1]
	add	$np, 16, $np
	addcc	$lo1, $lo0, $lo1
	mulx	$nj, $m1, $nlo	! np[1]*m1
	addxc	%g0, $hi1, $hi1
	umulxhi	$nj, $m1, $nj	! nhi=nj

	ba	.Linner_g5
	sub	$num, 24, $cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo, $hi0, $lo0
	ldx	[$tp+8], $tj	! tp[j]
	addxc	$aj, %g0, $hi0	! ahi=aj
	ldx	[$ap+0], $aj	! ap[j]
	add	$ap, 8, $ap
	addcc	$nlo, $hi1, $lo1
	mulx	$aj, $m0, $alo	! ap[j]*bp[i]
	addxc	$nj, %g0, $hi1	! nhi=nj
	ldx	[$np+0], $nj	! np[j]
	add	$np, 8, $np
	umulxhi	$aj, $m0, $aj	! ahi=aj
	addcc	$lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj, $m1, $nlo	! np[j]*m1
	addxc	%g0, $hi0, $hi0
	umulxhi	$nj, $m1, $nj	! nhi=nj
	addcc	$lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0, $hi1, $hi1
	stx	$lo1, [$tp]	! tp[j-1]
	add	$tp, 8, $tp
	brnz,pt	$cnt, .Linner_g5
	sub	$cnt, 8, $cnt
!.Linner_g5
	ldx	[$tp+8], $tj	! tp[j]
	addcc	$alo, $hi0, $lo0
	addxc	$aj, %g0, $hi0	! ahi=aj
	addcc	$lo0, $tj, $lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0, $hi0, $hi0

	addcc	$nlo, $hi1, $lo1
	addxc	$nj, %g0, $hi1	! nhi=nj
	addcc	$lo1, $lo0, $lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0, $hi1, $hi1
	stx	$lo1, [$tp]	! tp[j-1]

	subcc	%g0, $ovf, %g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1, $hi0, $hi1
	addxc	%g0, %g0, $ovf
	stx	$hi1, [$tp+8]
	add	$tp, 16, $tp

	brnz,pt	$i, .Louter_g5
	sub	$i, 8, $i

	sub	$ap, $num, $ap	! rewind
	sub	$np, $num, $np
	sub	$tp, $num, $tp
	ba	.Lsub_g5
	subcc	$num, 8, $cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp], $tj
	add	$tp, 8, $tp
	ldx	[$np+0], $nj
	add	$np, 8, $np
	subccc	$tj, $nj, $t2	! tp[j]-np[j]
	srlx	$tj, 32, $tj
	srlx	$nj, 32, $nj
	subccc	$tj, $nj, $t3
	add	$rp, 8, $rp
	st	$t2, [$rp-4]	! reverse order
	st	$t3, [$rp-8]
	brnz,pt	$cnt, .Lsub_g5
	sub	$cnt, 8, $cnt

	sub	$np, $num, $np	! rewind
	sub	$tp, $num, $tp
	sub	$rp, $num, $rp

	subccc	$ovf, %g0, $ovf	! handle upmost overflow bit
	ba	.Lcopy_g5
	sub	$num, 8, $cnt

.align	16
.Lcopy_g5:			! conditional copy
	ldx	[$tp], $tj
	ldx	[$rp+0], $t2
	stx	%g0, [$tp]	! zap
	add	$tp, 8, $tp
	movcs	%icc, $tj, $t2
	stx	$t2, [$rp+0]
	add	$rp, 8, $rp
	brnz	$cnt, .Lcopy_g5
	sub	$cnt, 8, $cnt

	mov	1, %o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

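########################################################################
# The remaining helpers convert between formats and feed the gather
# logic above: bn_flip_t4 copies a vector while swapping the two 32-bit
# halves of every 64-bit word; bn_flip_n_scatter5_t4 performs the same
# flip and scatters the result into pwrtbl with a 32*8-byte stride, so
# that word j of all 32 powers ends up contiguous (the layout that
# load_ccr/load_b expect); bn_gather5_t4 reads one power back out in
# constant time.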
$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0], %o4
	sub	%o2, 1, %o2
	ld	[%o1+4], %o5
	add	%o1, 8, %o1
	st	%o5, [%o0+0]
	st	%o4, [%o0+4]
	brnz	%o2, .Loop_flip
	add	%o0, 8, %o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3, 3, %o3
	srl	%o1, 1, %o1
	add	%o3, %o2, %o2	! &pwrtbl[pwr]
	sub	%o1, 1, %o1
.Loop_flip_n_scatter5:
	ld	[%o0+0], %o4	! inp[i]
	ld	[%o0+4], %o5
	add	%o0, 8, %o0
	sllx	%o5, 32, %o5
	or	%o4, %o5, %o5
	stx	%o5, [%o2]
	add	%o2, 32*8, %o2
	brnz	%o1, .Loop_flip_n_scatter5
	sub	%o1, 1, %o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1, 1, %o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1, [%o0]
	add	%o0, 8, %o0
	brnz	%o1, .Loop_gather5
	sub	%o1, 1, %o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";