1.ident "sparcv8plus.s, Version 1.4" 2.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@openssl.org>" 3 4/* 5 * ==================================================================== 6 * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. 7 * 8 * Licensed under the Apache License 2.0 (the "License"). You may not use 9 * this file except in compliance with the License. You can obtain a copy 10 * in the file LICENSE in the source distribution or at 11 * https://www.openssl.org/source/license.html 12 * ==================================================================== 13 */ 14 15/* 16 * This is my modest contribution to OpenSSL project (see 17 * http://www.openssl.org/ for more information about it) and is 18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c 19 * module. For updates see http://fy.chalmers.se/~appro/hpe/. 20 * 21 * Questions-n-answers. 22 * 23 * Q. How to compile? 24 * A. With SC4.x/SC5.x: 25 * 26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o 27 * 28 * and with gcc: 29 * 30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o 31 * 32 * or if above fails (it does if you have gas installed): 33 * 34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o 35 * 36 * Quick-n-dirty way to fuse the module into the library. 37 * Provided that the library is already configured and built 38 * (in 0.9.2 case with no-asm option): 39 * 40 * # cd crypto/bn 41 * # cp /some/place/bn_asm.sparc.v8plus.S . 42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o 43 * # make 44 * # cd ../.. 45 * # make; make test 46 * 47 * Quick-n-dirty way to get rid of it: 48 * 49 * # cd crypto/bn 50 * # touch bn_asm.c 51 * # make 52 * # cd ../.. 53 * # make; make test 54 * 55 * Q. V8plus architecture? What kind of beast is that? 56 * A. Well, it's rather a programming model than an architecture... 57 * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under 58 * special conditions, namely when kernel doesn't preserve upper 59 * 32 bits of otherwise 64-bit registers during a context switch. 60 * 61 * Q. Why just UltraSPARC? What about SuperSPARC? 62 * A. Original release did target UltraSPARC only. Now SuperSPARC 63 * version is provided along. Both version share bn_*comba[48] 64 * implementations (see comment later in code for explanation). 65 * But what's so special about this UltraSPARC implementation? 66 * Why didn't I let compiler do the job? Trouble is that most of 67 * available compilers (well, SC5.0 is the only exception) don't 68 * attempt to take advantage of UltraSPARC's 64-bitness under 69 * 32-bit kernels even though it's perfectly possible (see next 70 * question). 71 * 72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it 73 * doesn't work? 74 * A. You can't address *all* registers as 64-bit wide:-( The catch is 75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully 76 * preserved if you're in a leaf function, i.e. such never calling 77 * any other functions. All functions in this module are leaf and 78 * 10 registers is a handful. And as a matter of fact none-"comba" 79 * routines don't require even that much and I could even afford to 80 * not allocate own stack frame for 'em:-) 81 * 82 * Q. What about 64-bit kernels? 83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently 84 * under evaluation and development... 85 * 86 * Q. What about shared libraries? 87 * A. What about 'em? 

/*
 * Revision history.
 *
 * 1.0	- initial release;
 * 1.1	- new loop unrolling model(*);
 *	- some more fine tuning;
 * 1.2	- made gas friendly;
 *	- updates to documentation concerning v9;
 *	- new performance comparison matrix;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- native V9 bn_*_comba[48] implementation (15% more efficient)
 *	  resulting in slight overall performance kick;
 *	- some retunes;
 *	- support for GNU as added;
 *
 * (*)	Originally the unrolled loop looked like this:
 *	for (;;) {
 *	    op(p+0); if (--n==0) break;
 *	    op(p+1); if (--n==0) break;
 *	    op(p+2); if (--n==0) break;
 *	    op(p+3); if (--n==0) break;
 *	    p+=4;
 *	}
 *	I unroll according to the following:
 *	while (n&~3) {
 *	    op(p+0); op(p+1); op(p+2); op(p+3);
 *	    p+=4; n-=4;
 *	}
 *	if (n) {
 *	    op(p+0); if (--n==0) return;
 *	    op(p+1); if (--n==0) return;
 *	    op(p+2); return;
 *	}
 */

#if defined(__SUNPRO_C) && defined(__sparcv9)
  /* They've said -xarch=v9 at command line */
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME_SIZE	-192
#elif defined(__GNUC__) && defined(__arch64__)
  /* They've said -m64 at command line */
  .register	%g2,#scratch
  .register	%g3,#scratch
# define	FRAME_SIZE	-192
#else
# define	FRAME_SIZE	-96
#endif
/*
 * GNU assembler can't stand stuw:-(
 */
#define stuw st

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8plus.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
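/*
 * For reference only, a minimal C sketch of what this routine computes;
 * it is not part of the original code and assumes BN_ULONG is a 32-bit
 * word and BN_ULLONG a 64-bit type here:
 *
 *	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num,
 *	                          BN_ULONG w)
 *	{
 *		BN_ULLONG c = 0;			// running carry
 *		while (num-- > 0) {
 *			c += (BN_ULLONG)*rp + (BN_ULLONG)(*ap++) * w;
 *			*rp++ = (BN_ULONG)c;		// low 32 bits
 *			c >>= 32;			// carry to next word
 *		}
 *		return (BN_ULONG)c;			// final carry word
 *	}
 */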
bn_mul_add_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_add_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_add_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_add_words_tail
	clr	%o5

.L_bn_mul_add_words_loop:	! wow! 32 aligned!
	lduw	[%o0],%g1
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	nop
	add	%o4,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o0+4],%g1
	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	dec	4,%o2
	add	%o4,%g3,%o4
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o0+8],%g1
	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	inc	16,%o1
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	lduw	[%o0+12],%g1
	mulx	%o3,%g3,%g3
	add	%g1,%o5,%o4
	inc	16,%o0
	add	%o4,%g3,%o4
	andcc	%o2,-4,%g0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	bnz,a,pt	%icc,.L_bn_mul_add_words_loop
	lduw	[%o1],%g2

	brnz,a,pn	%o2,.L_bn_mul_add_words_tail
	lduw	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_add_words_tail:
	lduw	[%o0],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	lduw	[%o0+4],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	dec	%o2
	add	%o4,%g2,%o4
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_add_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	lduw	[%o0+8],%g1
	mulx	%o3,%g2,%g2
	add	%g1,%o5,%o4
	add	%o4,%g2,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)

.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 */
bn_mul_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_mul_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_mul_words_proceed:
	srl	%o3,%g0,%o3	! clruw %o3
	andcc	%o2,-4,%g0
	bz,pn	%icc,.L_bn_mul_words_tail
	clr	%o5

.L_bn_mul_words_loop:	! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	nop
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5

	lduw	[%o1+8],%g2
	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	dec	4,%o2
	stuw	%o4,[%o0+4]
	srlx	%o4,32,%o5

	lduw	[%o1+12],%g3
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	inc	16,%o1
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5

	mulx	%o3,%g3,%g3
	add	%g3,%o5,%o4
	inc	16,%o0
	stuw	%o4,[%o0-4]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g0
	bnz,a,pt	%icc,.L_bn_mul_words_loop
	lduw	[%o1],%g2
	nop
	nop

	brnz,a,pn	%o2,.L_bn_mul_words_tail
	lduw	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0

.L_bn_mul_words_tail:
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0]

	lduw	[%o1+4],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	dec	%o2
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_mul_words_return
	stuw	%o4,[%o0+4]

	lduw	[%o1+8],%g2
	mulx	%o3,%g2,%g2
	add	%g2,%o5,%o4
	stuw	%o4,[%o0+8]
	retl
	srlx	%o4,32,%o0

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)

.align	32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 */
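/*
 * For reference only, a minimal C sketch of the intended semantics; it
 * is not part of the original code (32-bit BN_ULONG and 64-bit
 * BN_ULLONG assumed):
 *
 *	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 *	{
 *		while (n-- > 0) {
 *			BN_ULLONG t = (BN_ULLONG)(*a) * (*a);
 *			a++;
 *			*r++ = (BN_ULONG)t;		// low word of a[i]^2
 *			*r++ = (BN_ULONG)(t >> 32);	// high word of a[i]^2
 *		}
 *	}
 */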
bn_sqr_words:
	sra	%o2,%g0,%o2	! signx %o2
	brgz,a	%o2,.L_bn_sqr_words_proceed
	lduw	[%o1],%g2
	retl
	clr	%o0
	nop
	nop
	nop

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0
	nop
	bz,pn	%icc,.L_bn_sqr_words_tail
	nop

.L_bn_sqr_words_loop:	! wow! 32 aligned!
	lduw	[%o1+4],%g3
	mulx	%g2,%g2,%o4
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+4]
	nop

	lduw	[%o1+8],%g2
	mulx	%g3,%g3,%o4
	dec	4,%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	stuw	%o5,[%o0+12]

	lduw	[%o1+12],%g3
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	inc	16,%o1
	stuw	%o5,[%o0+20]

	mulx	%g3,%g3,%o4
	inc	32,%o0
	stuw	%o4,[%o0-8]
	srlx	%o4,32,%o5
	andcc	%o2,-4,%g2
	stuw	%o5,[%o0-4]
	bnz,a,pt	%icc,.L_bn_sqr_words_loop
	lduw	[%o1],%g2
	nop

	brnz,a,pn	%o2,.L_bn_sqr_words_tail
	lduw	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+4],%g2
	mulx	%g2,%g2,%o4
	dec	%o2
	stuw	%o4,[%o0+8]
	srlx	%o4,32,%o5
	brz,pt	%o2,.L_bn_sqr_words_return
	stuw	%o5,[%o0+12]

	lduw	[%o1+8],%g2
	mulx	%g2,%g2,%o4
	srlx	%o4,32,%o5
	stuw	%o4,[%o0+16]
	stuw	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)

.align	32
.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 */
bn_div_words:
	sllx	%o0,32,%o0
	or	%o0,%o1,%o0
	udivx	%o0,%o2,%o0
	retl
	srl	%o0,%g0,%o0	! clruw %o0

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)

.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
bn_add_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_add_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_add_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_add_words_loop:	! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	addccc	%g1,%g2,%g1
	stuw	%g1,[%o0+4]

	inc	16,%o2
	addccc	%g3,%g4,%g3
	stuw	%g3,[%o0+8]

	inc	16,%o0
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_add_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_add_words_tail
	lduw	[%o1],%o4
.L_bn_add_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_add_words_tail:
	lduw	[%o2],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	addccc	%o5,%o4,%o5
	brz,pt	%o3,.L_bn_add_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	addccc	%o5,%o4,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 */
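/*
 * For reference only, a minimal C sketch of the intended semantics; it
 * is not part of the original code. bn_add_words above is the same
 * pattern with a carry instead of a borrow:
 *
 *	BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
 *	                      int n)
 *	{
 *		BN_ULONG borrow = 0;
 *		while (n-- > 0) {
 *			BN_ULLONG t = (BN_ULLONG)*ap++ - *bp++ - borrow;
 *			*rp++ = (BN_ULONG)t;			// low 32 bits
 *			borrow = (BN_ULONG)(t >> 32) & 1;	// 1 on underflow
 *		}
 *		return borrow;
 *	}
 */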
bn_sub_words:
	sra	%o3,%g0,%o3	! signx %o3
	brgz,a	%o3,.L_bn_sub_words_proceed
	lduw	[%o1],%o4
	retl
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0
	bz,pn	%icc,.L_bn_sub_words_tail
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_sub_words_loop:	! wow! 32 aligned!
	dec	4,%o3
	lduw	[%o2],%o5
	lduw	[%o1+4],%g1
	lduw	[%o2+4],%g2
	lduw	[%o1+8],%g3
	lduw	[%o2+8],%g4
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0]

	lduw	[%o1+12],%o4
	lduw	[%o2+12],%o5
	inc	16,%o1
	subccc	%g1,%g2,%g2
	stuw	%g2,[%o0+4]

	inc	16,%o2
	subccc	%g3,%g4,%g4
	stuw	%g4,[%o0+8]

	inc	16,%o0
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0-4]
	and	%o3,-4,%g1
	brnz,a,pt	%g1,.L_bn_sub_words_loop
	lduw	[%o1],%o4

	brnz,a,pn	%o3,.L_bn_sub_words_tail
	lduw	[%o1],%o4
.L_bn_sub_words_return:
	clr	%o0
	retl
	movcs	%icc,1,%o0
	nop

.L_bn_sub_words_tail:	! wow! 32 aligned!
	lduw	[%o2],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0]

	lduw	[%o1+4],%o4
	lduw	[%o2+4],%o5
	dec	%o3
	subccc	%o4,%o5,%o5
	brz,pt	%o3,.L_bn_sub_words_return
	stuw	%o5,[%o0+4]

	lduw	[%o1+8],%o4
	lduw	[%o2+8],%o5
	subccc	%o4,%o5,%o5
	stuw	%o5,[%o0+8]
	clr	%o0
	retl
	movcs	%icc,1,%o0

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)

/*
 * The code below depends on the fact that the upper parts of %l0-%l7
 * and %i0-%i7 are zeroed by the kernel after a context switch. In
 * previous versions this comment stated that "the trouble is that
 * it's not feasible to implement the mumbo-jumbo in fewer V9
 * instructions:-(" which apparently isn't true thanks to the
 * 'bcs,a %xcc,.+8; inc %rd' pair. But the performance improvement
 * results not from the shorter code, but from the elimination of
 * multicycle non-pairable 'rd %y,%rd' instructions.
 *
 * Andy.
 */

/*
 * Here is the register usage map for *all* routines below.
 */
#define t_1	%o0
#define	t_2	%o1
#define c_12	%o2
#define c_3	%o3

#define ap(I)	[%i1+4*I]
#define bp(I)	[%i2+4*I]
#define rp(I)	[%i0+4*I]

#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o4
#define	b_4	%o5
#define	b_5	%o7
#define	b_6	%g1
#define	b_7	%g4

.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
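/*
 * The !mul_add_c(a[i],b[j],c1,c2,c3) comments below follow the column
 * accumulation of the generic crypto/bn/bn_asm.c. As a rough C sketch
 * (not part of the original code), mul_add_c adds the 64-bit product
 * into the three-word running column sum (c1,c2,c3):
 *
 *	BN_ULLONG t  = (BN_ULLONG)a * b;		// 64-bit product
 *	BN_ULONG  lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 32);
 *	c1 += lo; if (c1 < lo) hi++;	// hi cannot overflow here
 *	c2 += hi; if (c2 < hi) c3++;
 *
 * In the V9 code below the pair (c1,c2) lives in the single 64-bit
 * register c_12; a carry out of the 64-bit addcc is recorded in c_3 in
 * units of 2^32 (the constant held in t_2), which is what each
 * 'bcs,a %xcc,.+8; add c_3,t_2,c_3' pair implements, and
 * 'or c_12,c_3,c_12' folds it back in at the end of each column.
 */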
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	bp(0),b_0	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_1,t_1	!=!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(4),b_4	!=
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(5),b_5
	mulx	a_0,b_4,t_1	!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_5,b_0,t_1	!=!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,b_2,t_1	!=!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_3,t_1	!=!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_4,t_1	!=!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(6),b_6	!=
	mulx	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(7),b_7
	mulx	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_4,t_1	!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_7,b_0,t_1	!=!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,b_1,t_1	!=!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_6,b_2,t_1	!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_2,b_6,t_1	!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,b_7,t_1	!=!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	mulx	a_3,b_6,t_1	!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_2,t_1	!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_5,t_1	!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_4,b_7,t_1	!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12	!=

	mulx	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	st	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12	!=

	mulx	a_7,b_7,t_1	!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0	!=

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)

.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	lduw	ap(0),a_0
	mov	1,t_2
	lduw	bp(0),b_0
	sllx	t_2,32,t_2	!=
	lduw	bp(1),b_1
	mulx	a_0,b_0,t_1	!mul_add_c(a[0],b[0],c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!=!r[0]=c1;

	lduw	ap(1),a_1
	mulx	a_0,b_1,t_1	!mul_add_c(a[0],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(2),a_2
	mulx	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_12,t_1,c_12	!=
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	bp(2),b_2	!=
	mulx	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3	!=
	lduw	bp(3),b_3
	mulx	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12	!=

	mulx	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	mulx	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8	!=
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!=!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_12,t_1,c_12	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!=!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3		!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_12,t_1,t_1	!=
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!=!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12	!=
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)

.align	32

.global bn_sqr_comba8
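/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Note (editorial, not part of the original code): the
 * !sqr_add_c2(a,i,j,...) comments below denote the doubled
 * cross-product column step, roughly the same as
 * mul_add_c(a[i],a[j],...) applied twice, which is why each such
 * product t_1 is added into c_12 two times.
 */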
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(4),a_4
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	st	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(5),a_5
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(6),a_6
	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(7),a_7
	mulx	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	or	c_12,c_3,c_12

	mulx	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(7)	!r[7]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(8)	!r[8]=c3;
	or	c_12,c_3,c_12

	mulx	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(9)	!r[9]=c1;
	or	c_12,c_3,c_12

	mulx	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(10)	!r[10]=c2;
	or	c_12,c_3,c_12

	mulx	a_4,a_7,t_1	!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_5,a_6,t_1	!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(11)	!r[11]=c3;
	or	c_12,c_3,c_12

	mulx	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(12)	!r[12]=c1;
	or	c_12,c_3,c_12

	mulx	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(13)	!r[13]=c2;
	or	c_12,c_3,c_12

	mulx	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(14)	!r[14]=c3;
	stuw	c_12,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)

.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	mov	1,t_2
	lduw	ap(0),a_0
	sllx	t_2,32,t_2
	lduw	ap(1),a_1
	mulx	a_0,a_0,t_1	!sqr_add_c(a,0,c1,c2,c3);
	srlx	t_1,32,c_12
	stuw	t_1,rp(0)	!r[0]=c1;

	lduw	ap(2),a_2
	mulx	a_0,a_1,t_1	!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(1)	!r[1]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	lduw	ap(3),a_3
	mulx	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(2)	!r[2]=c3;
	or	c_12,c_3,c_12

	mulx	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(3)	!r[3]=c1;
	or	c_12,c_3,c_12

	mulx	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,c_12
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	mulx	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(4)	!r[4]=c2;
	or	c_12,c_3,c_12

	mulx	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_12,t_1,c_12
	clr	c_3
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	addcc	c_12,t_1,t_1
	bcs,a	%xcc,.+8
	add	c_3,t_2,c_3
	srlx	t_1,32,c_12
	stuw	t_1,rp(5)	!r[5]=c3;
	or	c_12,c_3,c_12

	mulx	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_12,t_1,t_1
	srlx	t_1,32,c_12
	stuw	t_1,rp(6)	!r[6]=c1;
	stuw	c_12,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32