// ====================================================================
// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
// project.
// ====================================================================
//
// Poly1305 for Itanium.
//
// January 2019
//
// Performance was reported to be ~2.1 cycles per byte on Itanium 2.
// With exception for processors in 95xx family, which have higher
// floating-point instructions' latencies and deliver ~2.6 cpb.
// Comparison to compiler-generated code is not exactly fair, because
// of different radixes. But just for reference, it was observed to be
// >3x faster. Originally it was argued that floating-point base 2^32
// implementation would be optimal. Upon closer look estimate for below
// integer base 2^64 implementation turned to be approximately same on
// Itanium 2. But floating-point code would be larger, and have higher
// overhead, which would negatively affect small-block performance...
//
// --------------------------------------------------------------------
// Reader notes.
//
// Context layout as accessed by the three routines below (byte offset
// from the ctx pointer, every field is one 64-bit word):
//
//      +0   h0   \
//      +8   h1    > accumulator, base 2^64 (h2 holds the top bits)
//      +16  h2   /
//      +24  r0   \  clamped key
//      +32  r1   /
//      +40  s1      r1 + (r1>>2)
//
// Arguments arrive in the stacked input registers r32 and up, as
// declared by each alloc instruction.
// --------------------------------------------------------------------

// ADDP forms an address from a pointer argument. On HP-UX without
// _LP64 it is addp4 (presumably because pointers are 32-bit there --
// confirm against the HP-UX ABI), everywhere else a plain add.
// RUM/SUM reset/set user-mask bits. They are real instructions only on
// HP-UX, where the code uses them to switch data endianness around the
// input loads (see the "go little-endian" / "back to big-endian"
// comments in poly1305_blocks); elsewhere they assemble to nop.
#if defined(_HPUX_SOURCE)
# if !defined(_LP64)
#  define ADDP  addp4
# else
#  define ADDP  add
# endif
# define RUM    rum
# define SUM    sum
#else
# define ADDP   add
# define RUM    nop
# define SUM    nop
#endif

.text
.explicit

// --------------------------------------------------------------------
// poly1305_init
//
// In:  r32 = ctx
//      r33 = key, 16 bytes are read; may be NULL
// Out: r8  = 0 on both exits
//
// Always zeroes h0..h2. With a NULL key nothing else is done. Otherwise
// the key is read one byte at a time (so neither alignment nor host
// endianness matters), clamped, assembled into two little-endian
// 64-bit words r0/r1 and stored together with s1 = r1 + (r1>>2).
// --------------------------------------------------------------------
.global poly1305_init#
.proc   poly1305_init#
.align  64
poly1305_init:
        .prologue
        .save           ar.pfs,r2
{ .mmi; alloc           r2=ar.pfs,2,0,0,0
        cmp.eq          p6,p7=0,r33     }       // key == NULL?
{ .mmi; ADDP            r9=8,r32
        ADDP            r10=16,r32
        ADDP            r32=0,r32       };;
        .body
{ .mmi; st8             [r32]=r0,24             // ctx->h0 = 0, r32 -> ctx->r0
        st8             [r9]=r0                 // ctx->h1 = 0
(p7)    ADDP            r8=0,r33        }       // r8 = key
{ .mib; st8             [r10]=r0                // ctx->h2 = 0
(p6)    mov             r8=0
(p6)    br.ret.spnt     b0              };;     // NULL key: return 0

        // Four byte pointers key+0..key+3, each advancing by 4, so
        // that r16..r31 receive key[0]..key[15] in order.
{ .mmi; ADDP            r9=1,r33
        ADDP            r10=2,r33
        ADDP            r11=3,r33       };;
{ .mmi; ld1             r16=[r8],4              // load key, little-endian
        ld1             r17=[r9],4      }
{ .mmi; ld1             r18=[r10],4
        ld1             r19=[r11],4     };;
{ .mmi; ld1             r20=[r8],4
        ld1             r21=[r9],4      }
{ .mmi; ld1             r22=[r10],4
        ld1             r23=[r11],4
        and             r19=15,r19      };;     // clamp key[3]
{ .mmi; ld1             r24=[r8],4
        ld1             r25=[r9],4
        and             r20=-4,r20      }       // clamp key[4]
{ .mmi; ld1             r26=[r10],4
        ld1             r27=[r11],4
        and             r23=15,r23      };;     // clamp key[7]
{ .mmi; ld1             r28=[r8],4
        ld1             r29=[r9],4
        and             r24=-4,r24      }       // clamp key[8]
{ .mmi; ld1             r30=[r10],4
        ld1             r31=[r11],4
        and             r27=15,r27      };;     // clamp key[11]

        // Merge bytes into 16-bit pairs, then deposit the pairs at bit
        // positions 16, 32 and 48: r16 becomes r0, r24 becomes r1.
{ .mii; and             r28=-4,r28              // clamp key[12]
        dep             r16=r17,r16,8,8
        dep             r18=r19,r18,8,8 };;
{ .mii; and             r31=15,r31              // clamp key[15]
        dep             r16=r18,r16,16,16
        dep             r20=r21,r20,8,8 };;
{ .mii; dep             r16=r20,r16,32,16
        dep             r22=r23,r22,8,8 };;
{ .mii; dep             r16=r22,r16,48,16
        dep             r24=r25,r24,8,8 };;
{ .mii; dep             r26=r27,r26,8,8
        dep             r28=r29,r28,8,8 };;
{ .mii; dep             r24=r26,r24,16,16
        dep             r30=r31,r30,8,8 };;
{ .mii; st8             [r32]=r16,8             // ctx->r0
        dep             r24=r28,r24,32,16;;
        dep             r24=r30,r24,48,16 };;
{ .mii; st8             [r32]=r24,8             // ctx->r1
        shr.u           r25=r24,2;;
        add             r25=r25,r24     };;     // s1 = r1 + (r1>>2)
{ .mib; st8             [r32]=r25               // ctx->s1
        mov             r8=0
        br.ret.sptk     b0              };;
.endp   poly1305_init#

// Symbolic register names used by poly1305_blocks.
//   h0..h2   accumulator limbs
//   i0,i1    current 16-byte input block as two 64-bit words
//   HF0..HF2 accumulator limbs moved to floating-point registers
//   RF0,RF1,SF1  r0, r1 and s1 held in floating-point registers
h0=r17; h1=r18; h2=r19;
i0=r20; i1=r21;
HF0=f8; HF1=f9; HF2=f10;
RF0=f11; RF1=f12; SF1=f13;

// --------------------------------------------------------------------
// poly1305_blocks
//
// In:  r32 = ctx
//      r33 = input pointer, any byte alignment
//      r34 = length in bytes; only r34>>4 whole 16-byte blocks are used
//      r35 = value added to h2 for every block (presumably the pad
//            bit -- confirm against the caller)
//
// For each block: h += block, h2 += r35, then h = h * r with a partial
// reduction that leaves h2 small. The 64x64-bit products are computed
// with xmpy on the floating-point unit. The updated h0..h2 are written
// back to ctx on exit.
//
// Input is always fetched with aligned ld8. For a misaligned pointer
// the block is stitched together from neighbouring aligned words with
// shrp; predicates p8..p15 select the byte offset 0..7 (p8 = aligned).
//
// The loop is driven by br.ctop. ar.lc/ar.ec are set up so that p16 is
// true on every pass except the last one; p16 guards the load of the
// block that the following pass will consume.
//
// NOTE(review): when r34>>4 is 0 the setup is the same as for a single
// block, so the body still runs once. Callers presumably never pass
// fewer than 16 bytes -- confirm.
//
// r36 (the one local) keeps the caller predicates, r3 keeps ar.lc;
// both are restored before returning.
// --------------------------------------------------------------------
.global poly1305_blocks#
.proc   poly1305_blocks#
.align  64
poly1305_blocks:
        .prologue
        .save           ar.pfs,r2
{ .mii; alloc           r2=ar.pfs,4,1,0,0
        .save           ar.lc,r3
        mov             r3=ar.lc
        .save           pr,r36
        mov             r36=pr                  }

        .body
{ .mmi; ADDP            r8=0,r32
        ADDP            r9=8,r32
        and             r29=7,r33               };;     // r29 = inp & 7
{ .mmi; ld8             h0=[r8],16
        ld8             h1=[r9],16
        and             r33=-8,r33              };;     // inp rounded down to 8
{ .mmi; ld8             h2=[r8],16
        ldf8            RF0=[r9],16                     // ctx->r0
        shr.u           r34=r34,4               };;     // number of blocks
{ .mmi; ldf8            RF1=[r8],-32                    // ctx->r1, r8 back to ctx
        ldf8            SF1=[r9],-32                    // ctx->s1, r9 back to ctx+8
        cmp.ltu         p16,p17=1,r34           };;     // more than one block?
{ .mmi;
(p16)   add             r34=-2,r34
(p17)   mov             r34=0
        ADDP            r10=0,r33               }       // r10 -> aligned word 0
{ .mii; ADDP            r11=8,r33                       // r11 -> aligned word 1
(p16)   mov             ar.ec=2
(p17)   mov             ar.ec=1                 };;
{ .mib; RUM             1<<1                            // go little-endian
        mov             ar.lc=r34
        brp.loop.imp    .Loop,.Lcend-16         }

        // Decode the byte offset into one-hot predicates p8..p15;
        // p7 is set when the pointer is not 8-byte aligned.
{ .mmi; cmp.eq          p8,p7=0,r29
        cmp.eq          p9,p0=1,r29
        cmp.eq          p10,p0=2,r29            }
{ .mmi; cmp.eq          p11,p0=3,r29
        cmp.eq          p12,p0=4,r29
        cmp.eq          p13,p0=5,r29            }
{ .mmi; cmp.eq          p14,p0=6,r29
        cmp.eq          p15,p0=7,r29
        add             r16=16,r10              };;     // r16 -> aligned word 2

{ .mmb;
(p8)    ld8             i0=[r10],16                     // aligned input
(p8)    ld8             i1=[r11],16
(p8)    br.cond.sptk    .Loop                   };;

        // align first block
        // Three aligned words are fetched (r14, r15, r16) and the two
        // block words are extracted from the pairs r15:r14 and r16:r15
        // shifted right by 8 * offset bits. r16 is carried over in r14
        // as the low word for the next block.
        .pred.rel       "mutex",p8,p9,p10,p11,p12,p13,p14,p15
{ .mmi; (p7)    ld8             r14=[r10],24
        (p7)    ld8             r15=[r11],24            }

{ .mii; (p7)    ld8             r16=[r16]
                nop.i           0;;
        (p15)   shrp            i0=r15,r14,56           }
{ .mii; (p15)   shrp            i1=r16,r15,56
        (p14)   shrp            i0=r15,r14,48           }
{ .mii; (p14)   shrp            i1=r16,r15,48
        (p13)   shrp            i0=r15,r14,40           }
{ .mii; (p13)   shrp            i1=r16,r15,40
        (p12)   shrp            i0=r15,r14,32           }
{ .mii; (p12)   shrp            i1=r16,r15,32
        (p11)   shrp            i0=r15,r14,24           }
{ .mii; (p11)   shrp            i1=r16,r15,24
        (p10)   shrp            i0=r15,r14,16           }
{ .mii; (p10)   shrp            i1=r16,r15,16
        (p9)    shrp            i0=r15,r14,8            }
{ .mii; (p9)    shrp            i1=r16,r15,8
                mov             r14=r16                 };;

.Loop:
        .pred.rel       "mutex",p8,p9,p10,p11,p12,p13,p14,p15
        // h += input block, h2 += r35, with carry propagation
        // h0 -> h1 -> h2. Each limb is copied to its floating-point
        // twin as soon as it is final.
{ .mmi;         add             h0=h0,i0
                add             h1=h1,i1
                add             h2=h2,r35               };;
{ .mmi;         setf.sig        HF0=h0
                cmp.ltu         p6,p0=h0,i0             // carry out of h0
                cmp.ltu         p7,p0=h1,i1     };;     // carry out of h1
{ .mmi; (p6)    add             h1=1,h1;;
                setf.sig        HF1=h1
        (p6)    cmp.eq.or       p7,p0=0,h1      };;     // h1 wrapped to 0
{ .mmi; (p7)    add             h2=1,h2;;
                setf.sig        HF2=h2                  };;

        // Products, low (.lu) and high (.hu) 64-bit halves:
        //   f32:f33 = h0*r0   f36:f37 = h0*r1
        //   f34:f35 = h1*s1   f38:f39 = h1*r0
        //   f40 = low(h2*s1)  f41 = low(h2*r0)
        // While they are in flight the next block is loaded (p16) and
        // aligned; those shrp/mov instructions are spread over the
        // otherwise idle I slots down to the end of the loop.
{ .mfi; (p16)   ld8             r15=[r10],16
                xmpy.lu         f32=HF0,RF0             }
{ .mfi; (p16)   ld8             r16=[r11],16
                xmpy.hu         f33=HF0,RF0             }
{ .mfi;         xmpy.lu         f36=HF0,RF1             }
{ .mfi;         xmpy.hu         f37=HF0,RF1             };;
{ .mfi;         xmpy.lu         f34=HF1,SF1
        (p15)   shrp            i0=r15,r14,56           }
{ .mfi;         xmpy.hu         f35=HF1,SF1             }
{ .mfi;         xmpy.lu         f38=HF1,RF0
        (p15)   shrp            i1=r16,r15,56           }
{ .mfi;         xmpy.hu         f39=HF1,RF0             }
{ .mfi;         xmpy.lu         f40=HF2,SF1
        (p14)   shrp            i0=r15,r14,48           }
{ .mfi;         xmpy.lu         f41=HF2,RF0             };;

        // Move the product halves back to general registers.
{ .mmi;         getf.sig        r22=f32
                getf.sig        r23=f33
        (p14)   shrp            i1=r16,r15,48           }
{ .mmi;         getf.sig        r24=f34
                getf.sig        r25=f35
        (p13)   shrp            i0=r15,r14,40           }
{ .mmi;         getf.sig        r26=f36
                getf.sig        r27=f37
        (p13)   shrp            i1=r16,r15,40           }
{ .mmi;         getf.sig        r28=f38
                getf.sig        r29=f39
        (p12)   shrp            i0=r15,r14,32           }
{ .mmi;         getf.sig        r30=f40
                getf.sig        r31=f41                 };;

        // Column sums:
        //   h0      = low(h0*r0) + low(h1*s1)
        //   r23     = high(h0*r0) + high(h1*s1) + low(h2*s1) + carry
        //   h1      = low(h0*r1) + low(h1*r0) + r23
        //   h2      = low(h2*r0) + high(h0*r1) + high(h1*r0) + carries
{ .mmi;         add             h0=r22,r24
                add             r23=r23,r25
        (p12)   shrp            i1=r16,r15,32           }
{ .mmi;         add             h1=r26,r28
                add             r27=r27,r29
        (p11)   shrp            i0=r15,r14,24           };;
{ .mmi;         cmp.ltu         p6,p0=h0,r24
                cmp.ltu         p7,p0=h1,r28
                add             r23=r23,r30             };;
{ .mmi; (p6)    add             r23=1,r23
        (p7)    add             r27=1,r27
        (p11)   shrp            i1=r16,r15,24           };;
{ .mmi;         add             h1=h1,r23;;
                cmp.ltu         p6,p7=h1,r23
        (p10)   shrp            i0=r15,r14,16           };;
{ .mmi; (p6)    add             h2=r31,r27,1
        (p7)    add             h2=r31,r27
        (p10)   shrp            i1=r16,r15,16           };;

        // Partial reduction: the bits of h2 above bit 1 are folded
        // back into h0 as (h2 & -4) + (h2 >> 2), h2 keeps its low two
        // bits, and the carry ripples h0 -> h1 -> h2.
{ .mmi; (p8)    mov             i0=r15
                and             r22=-4,h2
                shr.u           r23=h2,2                };;
{ .mmi;         add             r22=r22,r23
                and             h2=3,h2
        (p9)    shrp            i0=r15,r14,8            };;

{ .mmi;         add             h0=h0,r22;;
                cmp.ltu         p6,p0=h0,r22
        (p9)    shrp            i1=r16,r15,8            };;
{ .mmi; (p8)    mov             i1=r16
        (p6)    cmp.eq.unc      p7,p0=-1,h1             // h1 about to wrap?
        (p6)    add             h1=1,h1                 };;
{ .mmb; (p7)    add             h2=1,h2
                mov             r14=r16                 // low word for next block
                br.ctop.sptk    .Loop                   };;
.Lcend:

{ .mii; SUM             1<<1                            // back to big-endian
        mov             ar.lc=r3                };;

{ .mmi; st8             [r8]=h0,16                      // ctx->h0
        st8             [r9]=h1                         // ctx->h1
        mov             pr=r36,0x1ffff          };;     // restore predicates
{ .mmb; st8             [r8]=h2                         // ctx->h2
        rum             1<<5                            // clear user-mask bit 5
                                                        // (presumably mfh, set by
                                                        // the f32..f41 writes)
        br.ret.sptk     b0                      };;
.endp   poly1305_blocks#

// --------------------------------------------------------------------
// poly1305_emit
//
// In:  r32 = ctx
//      r33 = mac, 16 bytes are written one byte at a time
//      r34 = nonce, read as four 32-bit words with ld4
//
// Computes g = h + 5 over the three limbs. If g reaches 2^130 (third
// limb of g is 4 or more) the low 128 bits of g are used, otherwise
// the low 128 bits of h. The nonce is then added modulo 2^128 and the
// result is stored in little-endian byte order.
// --------------------------------------------------------------------
.global poly1305_emit#
.proc   poly1305_emit#
.align  64
poly1305_emit:
        .prologue
        .save           ar.pfs,r2
{ .mmi; alloc           r2=ar.pfs,3,0,0,0
        ADDP            r8=0,r32
        ADDP            r9=8,r32        };;

        .body
{ .mmi; ld8             r16=[r8],16             // load hash
        ld8             r17=[r9]
        ADDP            r10=0,r34       };;
{ .mmi; ld8             r18=[r8]
        ld4             r24=[r10],8             // load nonce
        ADDP            r11=4,r34       };;

{ .mmi; ld4             r25=[r11],8
        ld4             r26=[r10]
        add             r20=5,r16       };;     // g0 = h0 + 5

{ .mmi; ld4             r27=[r11]
        cmp.ltu         p6,p7=r20,r16           // p6 = carry out of g0
        shl             r25=r25,32      };;
        // g1 = h1 + carry. The carry may continue into the third limb
        // only if h1 is all ones. The compare below runs when p6 is
        // set and, if h1 is NOT all ones, flips the pair to p6=0,p7=1.
        // The former cmp.eq.or.andcm p6,p7 could only write p6=1,p7=0,
        // which under qualifier p6 is what they already were, so the
        // carry out of g0 always reached the third limb and, with
        // h2 == 3, selected g although h + 5 was below 2^130.
        // The adds in this group still see the old predicates.
{ .mmi;
(p6)    add             r21=1,r17
(p7)    add             r21=0,r17
(p6)    cmp.ne.or.andcm p7,p6=-1,r17    };;
{ .mmi;
(p6)    add             r22=1,r18               // g2 = h2 + carry
(p7)    add             r22=0,r18
        shl             r27=r27,32      };;
{ .mmi; or              r24=r24,r25             // nonce, low 64 bits
        or              r26=r26,r27             // nonce, high 64 bits
        cmp.leu         p6,p7=4,r22     };;     // p6: g >= 2^130, use g
{ .mmi;
(p6)    add             r16=r20,r24
(p7)    add             r16=r16,r24
(p6)    add             r17=r21,r26     };;
{ .mii;
(p7)    add             r17=r17,r26
        cmp.ltu         p6,p7=r16,r24;;         // carry out of low word
(p6)    add             r17=1,r17       };;

        // Four output pointers mac+0, +4, +8, +12; r20/r21 hold the
        // upper halves of r16/r17. Each round stores one byte through
        // every pointer and shifts the sources right by 8.
{ .mmi; ADDP            r8=0,r33
        ADDP            r9=4,r33
        shr.u           r20=r16,32      }
{ .mmi; ADDP            r10=8,r33
        ADDP            r11=12,r33
        shr.u           r21=r17,32      };;

{ .mmi; st1             [r8]=r16,1              // write mac, little-endian
        st1             [r9]=r20,1
        shr.u           r16=r16,8       }
{ .mii; st1             [r10]=r17,1
        shr.u           r20=r20,8
        shr.u           r17=r17,8       }
{ .mmi; st1             [r11]=r21,1
        shr.u           r21=r21,8       };;

{ .mmi; st1             [r8]=r16,1
        st1             [r9]=r20,1
        shr.u           r16=r16,8       }
{ .mii; st1             [r10]=r17,1
        shr.u           r20=r20,8
        shr.u           r17=r17,8       }
{ .mmi; st1             [r11]=r21,1
        shr.u           r21=r21,8       };;

{ .mmi; st1             [r8]=r16,1
        st1             [r9]=r20,1
        shr.u           r16=r16,8       }
{ .mii; st1             [r10]=r17,1
        shr.u           r20=r20,8
        shr.u           r17=r17,8       }
{ .mmi; st1             [r11]=r21,1
        shr.u           r21=r21,8       };;

{ .mmi; st1             [r8]=r16
        st1             [r9]=r20        }
{ .mmb; st1             [r10]=r17
        st1             [r11]=r21
        br.ret.sptk     b0              };;
.endp   poly1305_emit#

stringz "Poly1305 for IA64, CRYPTOGAMS by \@dot-asm"