#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaking this effort are multiple. First of all, UltraSPARC is
# not the whole SPARCv9 universe, and other VIS-free implementations
# deserve optimized code just as much. Secondly, the newly introduced
# UltraSPARC T1, a.k.a. Niagara, has a shared FPU, and concurrent
# FPU-intensive paths, such as sparcv9a-mont, would simply sink it.
# Yes, T1 is equipped with several integrated RSA/DSA accelerator
# circuits accessible through a kernel driver [only(*)], but having a
# decent user-land software implementation is important too. Finally,
# there was the desire to experiment with a dedicated squaring
# procedure. Yes, this module implements one, because it was easiest
# to draft in SPARCv9 instructions...

# (*)	An engine accessing the driver in question is on my TODO list.
#	For reference, the accelerator is estimated to give a 6 to 10
#	times improvement on single-threaded RSA sign. It should be
#	noted that a 6-10x improvement coefficient does not actually
#	mean something extraordinary in terms of absolute
#	[single-threaded] performance, as the SPARCv9 instruction set
#	is by all means the least suitable for high-performance crypto
#	among the 64-bit platforms. The 6-10x factor simply places T1
#	in the same performance domain as, say, AMD64 and IA-64. The
#	improvement of RSA verify doesn't appear impressive at all,
#	but it's the sign operation which is far more
#	critical/interesting.

# You might notice that the inner loops are modulo-scheduled:-) This
# has essentially negligible impact on UltraSPARC performance; it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that the
# sparcv9a module still has hidden potential [see the TODO list
# there], which is estimated to be larger than 20%...

$output = pop and open STDOUT,">$output";

# int bn_mul_mont(
$rp="%i0";	# BN_ULONG *rp,
$ap="%i1";	# const BN_ULONG *ap,
$bp="%i2";	# const BN_ULONG *bp,
$np="%i3";	# const BN_ULONG *np,
$n0="%i4";	# const BN_ULONG *n0,
$num="%i5";	# int num);

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$car0="%o0";
$car1="%o1";
$car2="%o2";	# 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1";	# 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";

$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";

$fname="bn_mul_mont_int";

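# For orientation, here is a plain-Perl reference model of the word-by-word
# Montgomery multiplication the assembly below implements. It is an
# illustrative sketch only: the subroutine name and calling convention
# (little-endian arrays of 32-bit limbs; n0 passed as the 32-bit scalar
# -n^-1 mod 2^32, whereas the assembly loads it through a pointer) are
# hypothetical, nothing in this file calls it, and a perl with native
# 64-bit integers is assumed. It returns a*b*2^(-32*num) mod n.
sub mont_mul_ref {
	my ($ap,$bp,$np,$n0,$num) = @_;
	my @tp = (0) x ($num+1);	# t[], t[num] is the top carry word
	for (my $i=0; $i<$num; $i++) {
		my $mul0 = $bp->[$i];
		# Montgomery factor m, chosen so that t becomes divisible by 2^32
		my $mul1 = ($tp[0] + $ap->[0]*$mul0) & 0xffffffff;
		$mul1 = ($mul1*$n0) & 0xffffffff;
		my ($car0,$car1) = (0,0);
		for (my $j=0; $j<$num; $j++) {
			$car0 += $tp[$j] + $ap->[$j]*$mul0;	  # ap[j]*bp[i] column
			$car1 += ($car0 & 0xffffffff) + $np->[$j]*$mul1;
			$tp[$j-1] = $car1 & 0xffffffff if $j > 0; # exact shift by 2^32
			$car0 >>= 32;
			$car1 >>= 32;
		}
		$car1 += $car0 + $tp[$num];
		$tp[$num-1] = $car1 & 0xffffffff;
		$tp[$num]   = $car1 >> 32;	# at most 1
	}
	# conditional final subtraction, cf. .Ltail below: t is in [0,2n)
	my ($borrow,@rp) = (0);
	for (my $j=0; $j<$num; $j++) {
		my $d = $tp[$j] - $np->[$j] - $borrow;
		$borrow = $d < 0 ? 1 : 0;
		$rp[$j] = $d & 0xffffffff;
	}
	return ($tp[$num] || !$borrow) ? \@rp : [@tp[0..$num-1]];
}
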
$code=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

.section	".text",#alloc,#execinstr

.global	$fname
.align	32
$fname:
	cmp	%o5,4			! 128 bits minimum
	bge,pt	%icc,.Lenter
	sethi	%hi(0xffffffff),$mask
	retl
	clr	%o0
.align	32
.Lenter:
	save	%sp,-$frame,%sp
	sll	$num,2,$num		! num*=4
	or	$mask,%lo(0xffffffff),$mask
	ld	[$n0],$n0
	cmp	$ap,$bp
	and	$num,$mask,$num
	ld	[$bp],$mul0		! bp[0]
	nop

	add	%sp,$bias,%o7		! real top of stack
	ld	[$ap],$car0		! ap[0] ! redundant in squaring context
	sub	%o7,$num,%o7
	ld	[$ap+4],$apj		! ap[1]
	and	%o7,-1024,%o7
	ld	[$np],$car1		! np[0]
	sub	%o7,$bias,%sp		! alloca
	ld	[$np+4],$npj		! np[1]
	be,pt	SIZE_T_CC,.Lbn_sqr_mont
	mov	12,$j

	mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
	mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.L1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]
	cmp	$j,$num
	mov	$tmp1,$acc1
	srlx	$car1,32,$car1
	bl	%icc,.L1st
	add	$tp,4,$tp		! tp++
!.L1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	mov	4,$i			! i++
	ld	[$bp+4],$mul0		! bp[1]
.Louter:
	add	%sp,$bias+$frame,$tp
	ld	[$ap],$car0		! ap[0]
	ld	[$ap+4],$apj		! ap[1]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	ld	[$tp],$tmp1		! tp[0]
	ld	[$tp+4],$tpj		! tp[1]
	mov	12,$j

	mulx	$car0,$mul0,$car0
	mulx	$apj,$mul0,$tmp0	!prologue!
	add	$tmp1,$car0,$car0
	ld	[$ap+8],$apj		!prologue!
	and	$car0,$mask,$acc0

	mulx	$n0,$acc0,$mul1
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1	!prologue!
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	ld	[$np+8],$npj		!prologue!
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Linner:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	add	$acc0,$car0,$car0
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	and	$car0,$mask,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	add	$j,4,$j			! j++
	mov	$tmp0,$acc0
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	mov	$tmp1,$acc1
	cmp	$j,$num
	bl	%icc,.Linner
	add	$tp,4,$tp		! tp++
!.Linner

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$tpj,$car0,$car0
	add	$acc0,$car0,$car0
	ld	[$tp+8],$tpj		! tp[j]
	and	$car0,$mask,$acc0
	add	$acc1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$tpj,$car0,$car0
	add	$tmp0,$car0,$car0
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]		! tp[j-1]
	srlx	$car0,32,$car0
	add	$i,4,$i			! i++
	srlx	$car1,32,$car1

	add	$car0,$car1,$car1
	cmp	$i,$num
	add	$car2,$car1,$car1
	st	$car1,[$tp+8]

	srlx	$car1,32,$car2
	bl,a	%icc,.Louter
	ld	[$bp+$i],$mul0		! bp[i]
!.Louter

	add	$tp,12,$tp

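! At this point t[0..num-1] at $tp, plus the carry bit in $car2, hold
! the Montgomery-reduced product, which may still be >= n. The tail
! below computes t-n with borrow propagation [subccc], folds $car2
! into the final borrow, and uses movcs to select between t and t-n
! while copying out to rp[] and zapping the temporary vector.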
.Ltail:
	add	$np,$num,$np
	add	$rp,$num,$rp
	sub	%g0,$num,%o7		! k=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c
.align	16
.Lsub:
	ld	[$tp+%o7],%o0
	ld	[$np+%o7],%o1
	subccc	%o0,%o1,%o1		! tp[j]-np[j]
	add	$rp,%o7,$i
	add	%o7,4,%o7
	brnz	%o7,.Lsub
	st	%o1,[$i]
	subccc	$car2,0,$car2		! handle upmost overflow bit
	sub	%g0,$num,%o7

.Lcopy:
	ld	[$tp+%o7],%o1		! conditional copy
	ld	[$rp+%o7],%o0
	st	%g0,[$tp+%o7]		! zap tp
	movcs	%icc,%o1,%o0
	st	%o0,[$rp+%o7]
	add	%o7,4,%o7
	brnz	%o7,.Lcopy
	nop
	mov	1,%i0
	ret
	restore
___

########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without the following dedicated squaring procedure.
########
$sbit="%o5";

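# The dedicated squaring path exploits the symmetry of the partial
# products: for a = sum(a[i]*2^(32*i)),
#
#	a^2 = sum(a[i]^2 * 2^(64*i)) + 2*sum(a[i]*a[j]*2^(32*(i+j)), i<j)
#
# so each cross product is computed only once and doubled on the fly,
# saving nearly half of the multiplications of the generic path. $sbit
# carries the bit shifted out whenever a 32-bit partial sum is doubled.
# The high word of a[i]^2, which must not be doubled, is pre-halved in
# the prologue ["srlx $car0,1"] with its odd bit stashed in $sbit, so
# that the uniform doubling downstream restores it exactly; the last
# halved carry is re-doubled at the very end ["recover $car0"].
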
$code.=<<___;
.align	32
.Lbn_sqr_mont:
	mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
	mulx	$apj,$mul0,$tmp0	!prologue!
	and	$car0,$mask,$acc0
	add	%sp,$bias+$frame,$tp
	ld	[$ap+8],$apj		!prologue!

	mulx	$n0,$acc0,$mul1		! "t[0]"*n0
	srlx	$car0,32,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
	mulx	$npj,$mul1,$acc1	!prologue!
	and	$car0,1,$sbit
	ld	[$np+8],$npj		!prologue!
	srlx	$car0,1,$car0
	add	$acc0,$car1,$car1
	srlx	$car1,32,$car1
	mov	$tmp0,$acc0		!prologue!

.Lsqr_1st:
	mulx	$apj,$mul0,$tmp0
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	mov	$tmp1,$acc1
	srlx	$acc0,32,$sbit
	add	$j,4,$j			! j++
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	mov	$tmp0,$acc0
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_1st
	add	$tp,4,$tp		! tp++
!.Lsqr_1st

	mulx	$apj,$mul0,$tmp0	!epilogue!
	mulx	$npj,$mul1,$tmp1
	add	$acc0,$car0,$car0	! ap[j]*a0+c0
	add	$acc1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$tmp0,$car0,$car0	! ap[j]*a0+c0
	add	$tmp1,$car1,$car1
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	or	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	or	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	st	$car1,[$tp+8]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp0	! tp[0]
	ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
	ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
	ld	[$ap+4],$mul0		! ap[1]
	ld	[$ap+8],$apj		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp0,$mul1

	mulx	$mul0,$mul0,$car0
	and	$mul1,$mask,$mul1

	mulx	$car1,$mul1,$car1
	mulx	$npj,$mul1,$acc1
	add	$tmp0,$car1,$car1
	and	$car0,$mask,$acc0
	ld	[$np+8],$npj		! np[2]
	srlx	$car1,32,$car1
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	$acc0,$car1,$car1
	and	$car0,1,$sbit
	add	$acc1,$car1,$car1
	srlx	$car0,1,$car0
	mov	12,$j
	st	$car1,[%sp+$bias+$frame]	! tp[0]=
	srlx	$car1,32,$car1
	add	%sp,$bias+$frame+4,$tp

.Lsqr_2nd:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc0,$acc0,$acc0
	add	$j,4,$j			! j++
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_2nd
	add	$tp,4,$tp		! tp++
!.Lsqr_2nd

	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$acc0,$car0,$car0
	add	$tpj,$sbit,$sbit
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc1,$car1,$car1
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+8],$mul0		! ap[2]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	mov	8,$i

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0
	mov	4,$j

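! Outer loop for columns i>=2. Cross products ap[j]*ap[i] with j<i
! were already accumulated [and doubled] by the preceding iterations,
! so .Lsqr_inner1 merely propagates the Montgomery reduction np[j]*"m"
! through the lower words; the stretch in between folds in the low
! half of ap[i]*ap[i]; and .Lsqr_inner2 takes over above the diagonal,
! adding the new doubled cross products ap[j]*ap[i], j>i.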
.Lsqr_outer:
.Lsqr_inner1:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner1
	add	$tp,4,$tp
!.Lsqr_inner1

	add	$j,4,$j
	ld	[$ap+$j],$apj		! ap[j]
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	ld	[$np+$j],$npj		! np[j]
	srlx	$car1,32,$tmp0
	and	$car1,$mask,$car1
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	ld	[$tp+8],$tpj		! tp[j]
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$j,4,$j
	cmp	$j,$num
	be,pn	%icc,.Lsqr_no_inner2
	add	$tp,4,$tp

.Lsqr_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	ld	[$ap+$j],$apj		! ap[j]
	and	$car0,$mask,$acc0
	ld	[$np+$j],$npj		! np[j]
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	ld	[$tp+8],$tpj		! tp[j]
	add	$sbit,$acc0,$acc0
	add	$j,4,$j			! j++
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	cmp	$j,$num
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_inner2
	add	$tp,4,$tp		! tp++

.Lsqr_no_inner2:
	mulx	$apj,$mul0,$acc0
	mulx	$npj,$mul1,$acc1
	add	$tpj,$sbit,$sbit
	add	$acc0,$car0,$car0
	and	$car0,$mask,$acc0
	srlx	$car0,32,$car0
	add	$acc0,$acc0,$acc0
	add	$sbit,$acc0,$acc0
	srlx	$acc0,32,$sbit
	and	$acc0,$mask,$acc0
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]		! tp[j-1]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	add	$i,4,$i			! i++
	ld	[%sp+$bias+$frame],$tmp1	! tp[0]
	ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
	ld	[$ap+$i],$mul0		! ap[i]
	ld	[$np],$car1		! np[0]
	ld	[$np+4],$npj		! np[1]
	mulx	$n0,$tmp1,$mul1
	and	$mul1,$mask,$mul1
	add	$i,4,$tmp0

	mulx	$mul0,$mul0,$car0
	mulx	$car1,$mul1,$car1
	and	$car0,$mask,$acc0
	add	$tmp1,$car1,$car1
	srlx	$car0,32,$car0
	add	%sp,$bias+$frame,$tp
	srlx	$car1,32,$car1
	and	$car0,1,$sbit
	srlx	$car0,1,$car0

	cmp	$tmp0,$num		! i<num-1
	bl	%icc,.Lsqr_outer
	mov	4,$j

.Lsqr_last:
	mulx	$npj,$mul1,$acc1
	add	$tpj,$car1,$car1
	add	$j,4,$j
	ld	[$tp+8],$tpj
	cmp	$j,$i
	add	$acc1,$car1,$car1
	ld	[$np+$j],$npj
	st	$car1,[$tp]
	srlx	$car1,32,$car1
	bl	%icc,.Lsqr_last
	add	$tp,4,$tp
!.Lsqr_last

	mulx	$npj,$mul1,$acc1
	add	$tpj,$acc0,$acc0
	srlx	$acc0,32,$tmp0
	and	$acc0,$mask,$acc0
	add	$tmp0,$sbit,$sbit
	add	$acc0,$car1,$car1
	add	$acc1,$car1,$car1
	st	$car1,[$tp]
	srlx	$car1,32,$car1

	add	$car0,$car0,$car0	! recover $car0
	add	$sbit,$car0,$car0
	add	$car0,$car1,$car1
	add	$car2,$car1,$car1
	st	$car1,[$tp+4]
	srlx	$car1,32,$car2

	ba	.Ltail
	add	$tp,8,$tp
.type	$fname,#function
.size	$fname,(.-$fname)
.asciz	"Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";