#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
#   stalls after ldf8, xma and getf.sig outside inner loop and
#   improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
#   once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
#   acute interest, because upcoming Tukwila's individual cores are
#   reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
#                   sign    verify    sign/s verify/s
# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.

use strict;
use warnings;

# $output is the last argument if it looks like a file (it has an extension).
# When it is taken, it is removed from @ARGV so that only flags remain there.
my $output = @ARGV && $ARGV[-1] =~ m|\.\w+$| ? pop @ARGV : undef;

# $ADDP is the mnemonic interpolated into the assembly below wherever an
# incoming pointer argument is copied into a working register.  It is "add"
# everywhere except on HP-UX, where it is "addp4" unless one of the remaining
# command-line flags is +DD64 or -mlp64.
#
# The flag test is an alternation on purpose: a bracketed character class
# here would match any single one of "+D|-mlp" followed by "64" (for example
# "-m64"), not just the two intended flags.
my $ADDP = "add";
if ($^O eq "hpux") {
    $ADDP = "addp4";
    for my $flag (@ARGV) {
        $ADDP = "add" if $flag =~ /(?:\+DD|-mlp)64/;
    }
}

# The whole module is one interpolated here-document; only $ADDP is
# substituted, literal "@" characters are escaped.
my $code=<<___;
.explicit
.text

// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
//		    const BN_ULONG *bp,const BN_ULONG *np,
//		    const BN_ULONG *n0p,int num);
.align	64
.global	bn_mul_mont#
.proc	bn_mul_mont#
bn_mul_mont:
	.prologue
	.body
{ .mmi;	cmp4.le		p6,p7=2,r37;;
(p6)	cmp4.lt.unc	p8,p9=8,r37
	mov		ret0=r0		};;
{ .bbb;
(p9)	br.cond.dptk.many	bn_mul_mont_8
(p8)	br.cond.dpnt.many	bn_mul_mont_general
(p7)	br.ret.spnt.many	b0	};;
.endp	bn_mul_mont#

prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;

rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
tptr=r16;	// &tp[0]
tp_1=r17;	// &tp[-1]
num=r18;	len=r19;	lc=r20;
topbit=r21;	// carry bit from tmp[num]

n0=f6;
m0=f7;
bi=f8;

.align	64
.local	bn_mul_mont_general#
.proc	bn_mul_mont_general#
bn_mul_mont_general:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,6,2,0,8
	$ADDP	aptr=0,in1
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc		}
{ .mmi;	.vframe	prevsp
	mov	prevsp=sp
	$ADDP	bptr=0,in2
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
	.rotr		a[3],n[3],t[2]

{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
	ldf8		alo[4]=[aptr],16	// ap[0]
	$ADDP		r30=8,in1	};;
{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
	ldf8		alo[2]=[aptr],16	// ap[2]
	$ADDP		in4=0,in4	};;
{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
	ldf8		n0=[in4]		// n0
	$ADDP		rptr=0,in0		}
{ .mmi;	$ADDP		nptr=0,in3
	mov		r31=16
	zxt4		num=in5		};;
{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
	shladd		len=num,3,r0
	shladd		r31=num,3,r31	};;
{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
	add		lc=-5,num
	sub		r31=sp,r31	};;
{ .mfb;	and		sp=-16,r31		// alloca
	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
	nop.b		0		}
{ .mfb;	nop.m		0
	xmpy.lu		alo[4]=alo[4],bi
	brp.loop.imp	.L1st_ctop,.L1st_cend-16
					};;
{ .mfi;	nop.m		0
	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
	add		tp_1=8,sp	}
{ .mfi;	nop.m		0
	xma.lu		alo[3]=alo[3],bi,ahi[2]
	mov		pr.rot=0x20001f<<16
			// ------^----- (p40) at first (p23)
			// ----------^^ p[16:20]=1
					};;
{ .mfi;	nop.m		0
	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
	mov		ar.lc=lc	}
{ .mfi;	nop.m		0
	fcvt.fxu.s1	nhi[1]=f0
	mov		ar.ec=8		};;

.align	32
.L1st_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
{ .mfi;	(p23)	st8		[tp_1]=n[2],8
	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
	(p16)	nop.m		0
	br.ctop.sptk	.L1st_ctop			};;
.L1st_cend:

{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
	getf.sig	n[0]=nhi[4]
	add		num=-1,num	};;	// num--
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		n[0]=n[0],a[0]
(p42)	add		n[0]=n[0],a[0],1
	sub		aptr=aptr,len	};;	// rewind
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	cmp.ltu		p41,p39=n[0],a[0]
(p42)	cmp.leu		p41,p39=n[0],a[0]
	sub		nptr=nptr,len	};;
{ .mmi;	.pred.rel	"mutex",p39,p41
(p39)	add		topbit=r0,r0
(p41)	add		topbit=r0,r0,1
	nop.i		0		}
{ .mmi;	st8		[tp_1]=n[0]
	add		tptr=16,sp
	add		tp_1=8,sp	};;

.Louter:
{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
	ldf8		ahi[3]=[tptr]		// tp[0]
	add		r30=8,aptr	};;
{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
	ldf8		alo[3]=[r30],16		// ap[1]
	add		r31=8,nptr	};;
{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
	brp.loop.imp	.Linner_ctop,.Linner_cend-16
					}
{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
	xma.lu		alo[4]=alo[4],bi,ahi[3]
	clrrrb.pr			};;
{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
	nop.i		0		}
{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
	xma.lu		alo[3]=alo[3],bi,ahi[2]
	mov		pr.rot=0x20101f<<16
			// ------^----- (p40) at first (p23)
			// --------^--- (p30) at first (p22)
			// ----------^^ p[16:20]=1
					};;
{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
	mov		ar.lc=lc	}
{ .mfi;
	fcvt.fxu.s1	nhi[1]=f0
	mov		ar.ec=8		};;

// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align	32
.Linner_ctop:
.pred.rel	"mutex",p40,p42
.pred.rel	"mutex",p30,p32
{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
{ .mfi;	(p16)	nop.m		0
	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
	(p16)	nop.f		0
	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
{ .mfi;	(p21)	ld8		t[0]=[tptr],8
	(p16)	nop.f		0
	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
{ .mfi;	(p16)	nop.m		0
	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
	(p16)	nop.m		0
	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
{ .mmb;	(p23)	st8		[tp_1]=n[2],8
	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
	br.ctop.sptk	.Linner_ctop			};;
.Linner_cend:

{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
	getf.sig	n[0]=nhi[4]
	nop.i		0		};;

{ .mmi;	.pred.rel	"mutex",p31,p33
(p31)	add		a[0]=a[0],topbit
(p33)	add		a[0]=a[0],topbit,1
	mov		topbit=r0	};;
{ .mfi; .pred.rel	"mutex",p31,p33
(p31)	cmp.ltu		p32,p30=a[0],topbit
(p33)	cmp.leu		p32,p30=a[0],topbit
					}
{ .mfi;	.pred.rel	"mutex",p40,p42
(p40)	add		n[0]=n[0],a[0]
(p42)	add		n[0]=n[0],a[0],1
					};;
{ .mmi;	.pred.rel	"mutex",p44,p46
(p40)	cmp.ltu		p41,p39=n[0],a[0]
(p42)	cmp.leu		p41,p39=n[0],a[0]
(p32)	add		topbit=r0,r0,1	}

{ .mmi;	st8		[tp_1]=n[0],8
	cmp4.ne		p6,p0=1,num
	sub		aptr=aptr,len	};;	// rewind
{ .mmi;	sub		nptr=nptr,len
(p41)	add		topbit=r0,r0,1
	add		tptr=16,sp	}
{ .mmb;	add		tp_1=8,sp
	add		num=-1,num		// num--
(p6)	br.cond.sptk.many	.Louter	};;

{ .mbb;	add		lc=4,lc
	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
	clrrrb.pr			};;
{ .mii;	nop.m		0
	mov		pr.rot=0x10001<<16
			// ------^---- (p33) at first (p17)
	mov		ar.lc=lc	}
{ .mii;	nop.m		0
	mov		ar.ec=3
	nop.i		0		};;

.Lsub_ctop:
.pred.rel	"mutex",p33,p35
{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
	(p16)	nop.f		0
	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
	(p16)	nop.f		0
	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
	(p18)	nop.b		0			}
{ .mib;	(p18)	nop.m		0
	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
	br.ctop.sptk	.Lsub_ctop			};;
.Lsub_cend:

{ .mmb;	.pred.rel	"mutex",p34,p36
(p34)	sub	topbit=topbit,r0	// (p19)
(p36)	sub	topbit=topbit,r0,1
	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
					}
{ .mmb;	sub	rptr=rptr,len		// rewind
	sub	tptr=tptr,len
	clrrrb.pr			};;
{ .mmi;	mov	aptr=rptr
	mov	bptr=tptr
	mov	pr.rot=1<<16		};;
{ .mii;	cmp.eq	p0,p6=topbit,r0
	mov	ar.lc=lc
	mov	ar.ec=2			};;

.Lcopy_ctop:
{ .mmi;	(p16)	ld8	a[0]=[aptr],8
	(p16)	ld8	t[0]=[bptr],8
	(p6)	mov	a[1]=t[1]	};;	// (p17)
{ .mmb;	(p17)	st8	[rptr]=a[1],8
	(p17)	st8	[tptr]=r0,8
	br.ctop.sptk	.Lcopy_ctop	};;
.Lcopy_cend:

{ .mmi;	mov		ret0=1			// signal "handled"
	rum		1<<5			// clear um.mfh
	mov		ar.lc=prevlc	}
{ .mib;	.restore	sp
	mov		sp=prevsp
	mov		pr=prevpr,0x1ffff
	br.ret.sptk.many	b0	};;
.endp	bn_mul_mont_general#

a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
t0=r15;

ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;

.align	64
.skip	48		// aligns loop body
.local	bn_mul_mont_8#
.proc	bn_mul_mont_8#
bn_mul_mont_8:
	.prologue
{ .mmi;	.save		ar.pfs,prevfs
	alloc		prevfs=ar.pfs,6,2,0,8
	.vframe		prevsp
	mov		prevsp=sp
	.save		ar.lc,prevlc
	mov		prevlc=ar.lc	}
{ .mmi;	add		r17=-6*16,sp
	add		sp=-7*16,sp
	.save		pr,prevpr
	mov		prevpr=pr	};;

{ .mmi;	.save.gf	0,0x10
	stf.spill	[sp]=f16,-16
	.save.gf	0,0x20
	stf.spill	[r17]=f17,32
	add		r16=-5*16,prevsp};;
{ .mmi;	.save.gf	0,0x40
	stf.spill	[r16]=f18,32
	.save.gf	0,0x80
	stf.spill	[r17]=f19,32
	$ADDP		aptr=0,in1	};;
{ .mmi;	.save.gf	0,0x100
	stf.spill	[r16]=f20,32
	.save.gf	0,0x200
	stf.spill	[r17]=f21,32
	$ADDP		r29=8,in1	};;
{ .mmi;	.save.gf	0,0x400
	stf.spill	[r16]=f22
	.save.gf	0,0x800
	stf.spill	[r17]=f23
	$ADDP		rptr=0,in0	};;

	.body
	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
	.rotr		t[8]

// load input vectors padding them to 8 elements
{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
	ldf8		ai1=[r29],16		// ap[1]
	$ADDP		bptr=0,in2	}
{ .mmi;	$ADDP		r30=8,in2
	$ADDP		nptr=0,in3
	$ADDP		r31=8,in3	};;
{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
	ldf8		bj[6]=[r30],16		// bp[1]
	cmp4.le		p4,p5=3,in5	}
{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
	ldf8		ni1=[r31],16		// np[1]
	cmp4.le		p6,p7=4,in5	};;

{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
	(p5)fcvt.fxu	ai2=f0
	cmp4.le		p8,p9=5,in5	}
{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
	(p7)fcvt.fxu	ai3=f0
	cmp4.le		p10,p11=6,in5	}
{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
	(p5)fcvt.fxu	bj[5]=f0
	cmp4.le		p12,p13=7,in5	}
{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
	(p7)fcvt.fxu	bj[4]=f0
	cmp4.le		p14,p15=8,in5	}
{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
	(p5)fcvt.fxu	ni2=f0
	addp4		r28=-1,in5	}
{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
	(p7)fcvt.fxu	ni3=f0
	$ADDP		in4=0,in4	};;

{ .mfi;	ldf8		n0=[in4]
	fcvt.fxu	tf[1]=f0
	nop.i		0		}

{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
	(p9)fcvt.fxu	ai4=f0
	mov		t[0]=r0		}
{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
	(p11)fcvt.fxu	ai5=f0
	mov		t[1]=r0		}
{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
	(p9)fcvt.fxu	bj[3]=f0
	mov		t[2]=r0		}
{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
	(p11)fcvt.fxu	bj[2]=f0
	mov		t[3]=r0		}
{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
	(p9)fcvt.fxu	ni4=f0
	mov		t[4]=r0		}
{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
	(p11)fcvt.fxu	ni5=f0
	mov		t[5]=r0		};;

{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
	(p13)fcvt.fxu	ai6=f0
	mov		t[6]=r0		}
{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
	(p15)fcvt.fxu	ai7=f0
	mov		t[7]=r0		}
{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
	(p13)fcvt.fxu	bj[1]=f0
	mov		ar.lc=r28	}
{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
	(p15)fcvt.fxu	bj[0]=f0
	mov		ar.ec=1		}
{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
	(p13)fcvt.fxu	ni6=f0
	mov		pr.rot=1<<16	}
{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
	(p15)fcvt.fxu	ni7=f0
	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
					};;

// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 0:
	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
{ .mfi;	(p42)	add		a3=a3,n3,1
	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
	(p50)	add		t[6]=t[6],a3,1	};;
{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
	(p40)	cmp.ltu		p43,p41=a3,n3	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
	(p48)	cmp.ltu		p51,p49=t[6],a3
	(p50)	cmp.leu		p51,p49=t[6],a3	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mfi;	(p16)	nop.m		0			// 4:
	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
{ .mfi;	(p43)	add		a4=a4,n4,1
	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
	(p16)	nop.i		0		};;
{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
	(p51)	add		t[5]=t[5],a4,1	};;
{ .mfi;	(p16)	nop.m		0			// 6:
	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
	(p41)	cmp.ltu		p42,p40=a4,n4	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
	(p49)	cmp.ltu		p50,p48=t[5],a4
	(p51)	cmp.leu		p50,p48=t[5],a4	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 8:
	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
{ .mfi;	(p42)	add		a5=a5,n5,1
	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
	(p50)	add		t[4]=t[4],a5,1	};;
{ .mfi;	(p16)	nop.m		0			// 10:
	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
	(p40)	cmp.ltu		p43,p41=a5,n5	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
	(p48)	cmp.ltu		p51,p49=t[4],a5
	(p50)	cmp.leu		p51,p49=t[4],a5	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
{ .mfi;	(p43)	add		a6=a6,n6,1
	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
	(p51)	add		t[3]=t[3],a6,1	};;
{ .mfi;	(p16)	nop.m		0			// 14:
	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
	(p41)	cmp.ltu		p42,p40=a6,n6	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	nop.m		0			// 15:
	(p49)	cmp.ltu		p50,p48=t[3],a6
	(p51)	cmp.leu		p50,p48=t[3],a6	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 16:
	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
{ .mfi;	(p42)	add		a7=a7,n7,1
	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
	(p50)	add		t[2]=t[2],a7,1	};;
{ .mfi;	(p16)	nop.m		0			// 18:
	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
	(p40)	cmp.ltu		p43,p41=a7,n7	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
	(p48)	cmp.ltu		p51,p49=t[2],a7
	(p50)	cmp.leu		p51,p49=t[2],a7	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mfi;	(p16)	nop.m		0			// 20:
	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
{ .mfi;	(p43)	add		a8=a8,n8,1
	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
	(p51)	add		t[1]=t[1],a8,1	};;
{ .mfi;	(p16)	nop.m		0			// 22:
	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
	(p41)	cmp.ltu		p42,p40=a8,n8	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
	(p49)	cmp.ltu		p50,p48=t[1],a8
	(p51)	cmp.leu		p50,p48=t[1],a8	};;
{ .mfi;	(p16)	nop.m		0			// 24:
	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
{ .mfi;	(p16)	nop.m		0
	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
	(p17)	mov		t[0]=r0		};;
{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
	(p42)	add		t[0]=t[0],r0,1	};;
{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
	(p50)	add		t[0]=t[0],r0,1	}
{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
	(p16)	cmp.ltu.unc	p50,p48=t0,a1
	(p16)	nop.i		0		};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 28:
	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
{ .mfi;	(p42)	add		a2=a2,n2,1
	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
	(p50)	add		t[6]=t[6],a2,1	};;
{ .mfi;	(p16)	nop.m		0			// 30:
	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
	(p40)	cmp.ltu		p41,p39=a2,n2	}
{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
	(p16)	nop.i		0		};;
{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
	(p16)	nop.f		0
	(p48)	cmp.ltu		p49,p47=t[6],a2	}
{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
	(p16)	nop.f		0
	br.ctop.sptk.many	.Louter_8_ctop	};;
.Louter_8_cend:

// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
	(p42)	add		a3=a3,n3,1	};;
{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
	(p50)	add		t[6]=t[6],a3,1	};;
{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
	(p40)	cmp.ltu		p43,p41=a3,n3	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
	(p0)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
	(p48)	cmp.ltu		p51,p49=t[6],a3
	(p50)	cmp.leu		p51,p49=t[6],a3	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
	(p43)	add		a4=a4,n4,1	};;
{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
	(p0)	nop.f		0
	(p51)	add		t[5]=t[5],a4,1	};;
{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
	(p41)	cmp.ltu		p42,p40=a4,n4	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
	(p0)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
	(p49)	cmp.ltu		p50,p48=t[5],a4
	(p51)	cmp.leu		p50,p48=t[5],a4	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
	(p42)	add		a5=a5,n5,1	};;
{ .mii;	(p0)	nop.m		0			// 9:
	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
	(p50)	add		t[4]=t[4],a5,1	};;
{ .mii;	(p0)	nop.m		0			// 10:
	(p40)	cmp.ltu		p43,p41=a5,n5
	(p42)	cmp.leu		p43,p41=a5,n5	};;
{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
	(p48)	cmp.ltu		p51,p49=t[4],a5
	(p50)	cmp.leu		p51,p49=t[4],a5	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
	(p43)	add		a6=a6,n6,1	};;
{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
	(p51)	add		t[3]=t[3],a6,1	};;
{ .mii;	(p0)	nop.m		0			// 14:
	(p41)	cmp.ltu		p42,p40=a6,n6
	(p43)	cmp.leu		p42,p40=a6,n6	};;
{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
	(p49)	cmp.ltu		p50,p48=t[3],a6
	(p51)	cmp.leu		p50,p48=t[3],a6	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mii;	(p0)	nop.m		0			// 16:
	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
	(p42)	add		a7=a7,n7,1	};;
{ .mii;	(p0)	nop.m		0			// 17:
	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
	(p50)	add		t[2]=t[2],a7,1	};;
{ .mii;	(p0)	nop.m		0			// 18:
	(p40)	cmp.ltu		p43,p41=a7,n7
	(p42)	cmp.leu		p43,p41=a7,n7	};;
{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
	(p48)	cmp.ltu		p51,p49=t[2],a7
	(p50)	cmp.leu		p51,p49=t[2],a7	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mii;	(p0)	nop.m		0			// 20:
	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
	(p43)	add		a8=a8,n8,1	};;
{ .mmi;	(p0)	nop.m		0			// 21:
	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
	(p51)	add		t[1]=t[1],a8,1	}
{ .mmi;	(p17)	mov		t[0]=r0
	(p41)	cmp.ltu		p42,p40=a8,n8
	(p43)	cmp.leu		p42,p40=a8,n8	};;
{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
	(p49)	cmp.ltu		p50,p48=t[1],a8
	(p51)	cmp.leu		p50,p48=t[1],a8	}
{ .mmi;	(p42)	add		t[0]=t[0],r0,1
	(p0)	add		r16=-7*16,prevsp
	(p0)	add		r17=-6*16,prevsp	};;

// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi;	(p50)add	t[0]=t[0],r0,1
	add		r18=-5*16,prevsp
	sub		n1=t0,n1	};;
{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
	.pred.rel	"mutex",p32,p34
	(p32)sub	n2=t[7],n2
	(p34)sub	n2=t[7],n2,1	};;
{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
	(p34)cmp.geu	p35,p33=n2,t[7];;
	.pred.rel	"mutex",p33,p35
	(p33)sub	n3=t[6],n3	}
{ .mmi;	(p35)sub	n3=t[6],n3,1;;
	(p33)cmp.gtu	p34,p32=n3,t[6]
	(p35)cmp.geu	p34,p32=n3,t[6]	};;
	.pred.rel	"mutex",p32,p34
{ .mii;	(p32)sub	n4=t[5],n4
	(p34)sub	n4=t[5],n4,1;;
	(p32)cmp.gtu	p35,p33=n4,t[5]	}
{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
	.pred.rel	"mutex",p33,p35
	(p33)sub	n5=t[4],n5
	(p35)sub	n5=t[4],n5,1	};;
{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
	(p35)cmp.geu	p34,p32=n5,t[4];;
	.pred.rel	"mutex",p32,p34
	(p32)sub	n6=t[3],n6	}
{ .mmi;	(p34)sub	n6=t[3],n6,1;;
	(p32)cmp.gtu	p35,p33=n6,t[3]
	(p34)cmp.geu	p35,p33=n6,t[3]	};;
	.pred.rel	"mutex",p33,p35
{ .mii;	(p33)sub	n7=t[2],n7
	(p35)sub	n7=t[2],n7,1;;
	(p33)cmp.gtu	p34,p32=n7,t[2]	}
{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
	.pred.rel	"mutex",p32,p34
	(p32)sub	n8=t[1],n8
	(p34)sub	n8=t[1],n8,1	};;
{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
	(p34)cmp.geu	p35,p33=n8,t[1];;
	.pred.rel	"mutex",p33,p35
	(p33)sub	a8=t[0],r0	}
{ .mmi;	(p35)sub	a8=t[0],r0,1;;
	(p33)cmp.gtu	p34,p32=a8,t[0]
	(p35)cmp.geu	p34,p32=a8,t[0]	};;

// save the result, either tmp[num] or tmp[num]-np[num]
	.pred.rel	"mutex",p32,p34
{ .mmi;	(p32)st8	[rptr]=n1,8
	(p34)st8	[rptr]=t0,8
	add		r19=-4*16,prevsp};;
{ .mmb;	(p32)st8	[rptr]=n2,8
	(p34)st8	[rptr]=t[7],8
	(p5)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n3,8
	(p34)st8	[rptr]=t[6],8
	(p7)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n4,8
	(p34)st8	[rptr]=t[5],8
	(p9)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n5,8
	(p34)st8	[rptr]=t[4],8
	(p11)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n6,8
	(p34)st8	[rptr]=t[3],8
	(p13)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n7,8
	(p34)st8	[rptr]=t[2],8
	(p15)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n8,8
	(p34)st8	[rptr]=t[1],8
	nop.b		0		};;
.Ldone:						// epilogue
{ .mmi;	ldf.fill	f16=[r16],64
	ldf.fill	f17=[r17],64
	nop.i		0		}
{ .mmi;	ldf.fill	f18=[r18],64
	ldf.fill	f19=[r19],64
	mov		pr=prevpr,0x1ffff	};;
{ .mmi;	ldf.fill	f20=[r16]
	ldf.fill	f21=[r17]
	mov		ar.lc=prevlc	}
{ .mmi;	ldf.fill	f22=[r18]
	ldf.fill	f23=[r19]
	mov		ret0=1		}	// signal "handled"
{ .mib;	rum		1<<5
	.restore	sp
	mov		sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	bn_mul_mont_8#

.type	copyright#,\@object
copyright:
stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Redirect STDOUT to the requested output file, if one was given.  The
# three-argument form keeps the file name from being parsed as part of the
# open mode, and a failed open is fatal instead of silently writing the
# module to the original STDOUT.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface at close, so this check is required.
close STDOUT or die "error closing STDOUT: $!";