;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the Apache License 2.0 (the "License"). You may not use
;; this file except in compliance with the License. You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the License. Warranty of any kind is disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;
;; NOTE(review): TI TMS320C64x+ assembly. Loops use the hardware
;; software-pipelined loop buffer (SPLOOP/SPKERNEL, ILC/RILC control
;; registers); instruction scheduling and the SPKERNEL slot counts are
;; cycle-exact and must not be reordered.
;;====================================================================
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	;; Under EABI the symbols carry no leading underscore; alias the
	;; underscored local labels to the EABI names.
	.asg	bn_mul_add_words,_bn_mul_add_words
	.asg	bn_mul_words,_bn_mul_words
	.asg	bn_sqr_words,_bn_sqr_words
	.asg	bn_add_words,_bn_add_words
	.asg	bn_sub_words,_bn_sub_words
	.asg	bn_div_words,_bn_div_words
	.asg	bn_sqr_comba8,_bn_sqr_comba8
	.asg	bn_mul_comba8,_bn_mul_comba8
	.asg	bn_sqr_comba4,_bn_sqr_comba4
	.asg	bn_mul_comba4,_bn_mul_comba4
	.endif

	;; Symbolic aliases for the C6000 calling convention registers:
	;; B3 = return address, A4/B4/A6/B6/A8/B8 = first six arguments,
	;; A4 = return value.
	.asg	B3,RA
	.asg	A4,ARG0
	.asg	B4,ARG1
	.asg	A6,ARG2
	.asg	B6,ARG3
	.asg	A8,ARG4
	.asg	B8,ARG5
	.asg	A4,RET
	.asg	A15,FP
	.asg	B14,DP
	.asg	B15,SP

;;====================================================================
;; bn_mul_add_words(rp=ARG0, ap=ARG1, num=ARG2, w=ARG3)
;; Per iteration: {carry,rp[i]} = ap[i]*w + rp[i] + carry;
;; returns the final carry word in RET. num==0 returns 0 immediately.
;;====================================================================
	.global	_bn_mul_add_words
_bn_mul_add_words:
	.asmfunc
	MV	ARG2,B0			; B0 = num, drives ILC and predication
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC			; inner loop count = num
   [B0]	ZERO	A19			; high part of accumulator
|| [B0]	MV	ARG0,A2			; A2 = write copy of rp
|| [B0]	MV	ARG3,A3			; A3 = w
	NOP	3

	SPLOOP	2			; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7		; ap[i]
	NOP	3
	LDW	*ARG0++,A7		; rp[i]
	MPY32U	B7,A3,A17:A16		; 64-bit product ap[i]*w
	NOP	3			; [2,0] in epilogue
	ADDU	A16,A7,A21:A20		; lo(product) + rp[i]
	ADDU	A19,A21:A20,A19:A18	; + carry from previous word
||	MV.S	A17,A23
	SPKERNEL 2,1			; leave slot for "return value"
||	STW	A18,*A2++		; rp[i]
||	ADD	A19,A23,A19		; carry += hi(product)
;;====================================================================
	BNOP	RA,4
	MV	A19,RET			; return value
	.endasmfunc

;;====================================================================
;; bn_mul_words(rp=ARG0, ap=ARG1, num=ARG2, w=ARG3)
;; Per iteration: {carry,rp[i]} = ap[i]*w + carry;
;; returns the final carry word in RET. num==0 returns 0.
;;====================================================================
	.global	_bn_mul_words
_bn_mul_words:
	.asmfunc
	MV	ARG2,B0			; B0 = num
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A19			; high part of accumulator
	NOP	3

	SPLOOP	2			; 2*n+10
;;====================================================================
	LDW	*ARG1++,A7		; ap[i]
	NOP	4
	MPY32U	A7,ARG3,A17:A16		; 64-bit product ap[i]*w
	NOP	4			; [2,0] in epilogue
	ADDU	A19,A16,A19:A18		; lo(product) + carry
||	MV.S	A17,A21
	SPKERNEL 2,1			; leave slot for "return value"
||	STW	A18,*ARG0++		; rp[i]
||	ADD.L	A19,A21,A19		; carry += hi(product)
;;====================================================================
	BNOP	RA,4
	MV	A19,RET			; return value
	.endasmfunc

;;====================================================================
;; bn_sqr_words(rp=ARG0, ap=ARG1, num=ARG2)
;; Per iteration: {rp[2*i+1],rp[2*i]} = ap[i]*ap[i]. No return value
;; of interest when num>0; num==0 returns 0.
;;====================================================================
	.global	_bn_sqr_words
_bn_sqr_words:
	.asmfunc
	MV	ARG2,B0			; B0 = num
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	MV	ARG0,B2			; B2 -> rp[0], even words
|| [B0]	ADD	4,ARG0,ARG0		; ARG0 -> rp[1], odd words
	NOP	3

	SPLOOP	2			; 2*n+10
;;====================================================================
	LDW	*ARG1++,B7		; ap[i]
	NOP	4
	MPY32U	B7,B7,B1:B0		; ap[i]^2, 64-bit
	NOP	3			; [2,0] in epilogue
	STW	B0,*B2++(8)		; rp[2*i]
	MV	B1,A1
	SPKERNEL 2,0			; fully overlap BNOP RA,5
||	STW	A1,*ARG0++(8)		; rp[2*i+1]
;;====================================================================
	BNOP	RA,5
	.endasmfunc

;;====================================================================
;; bn_add_words(rp=ARG0, ap=ARG1, bp=ARG2, num=ARG3)
;; Per iteration: {carry,rp[i]} = ap[i] + bp[i] + carry;
;; returns the final carry (0 or 1) in RET. num==0 returns 0.
;;====================================================================
	.global	_bn_add_words
_bn_add_words:
	.asmfunc
	MV	ARG3,B0			; B0 = num
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A1			; carry flag
|| [B0]	MV	ARG0,A3			; A3 = write copy of rp
	NOP	3

	SPLOOP	2			; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7		; bp[i]
||	LDW	*ARG1++,B7		; ap[i]
	NOP	4
	ADDU	A7,B7,A9:A8		; ap[i]+bp[i] with carry-out in A9
	ADDU	A1,A9:A8,A1:A0		; + incoming carry
	SPKERNEL 0,0			; fully overlap BNOP RA,5
||	STW	A0,*A3++		; write result
||	MV	A1,RET			; keep carry flag in RET
;;====================================================================
	BNOP	RA,5
	.endasmfunc

;;====================================================================
;; bn_sub_words(rp=ARG0, ap=ARG1, bp=ARG2, num=ARG3)
;; Per iteration: {borrow,rp[i]} = ap[i] - bp[i] - borrow;
;; returns the final borrow (0 or 1) in RET. num==0 returns 0.
;;====================================================================
	.global	_bn_sub_words
_bn_sub_words:
	.asmfunc
	MV	ARG3,B0			; B0 = num
  [!B0]	BNOP	RA			; num==0: return 0
||[!B0]	MVK	0,RET
   [B0]	MVC	B0,ILC
   [B0]	ZERO	A2			; borrow flag
|| [B0]	MV	ARG0,A3			; A3 = write copy of rp
	NOP	3

	SPLOOP	2			; 2*n+6
;;====================================================================
	LDW	*ARG2++,A7		; bp[i]
||	LDW	*ARG1++,B7		; ap[i]
	NOP	4
	SUBU	B7,A7,A1:A0		; ap[i]-bp[i], borrow-out in A1
   [A2]	SUB	A1:A0,1,A1:A0		; propagate incoming borrow
	SPKERNEL 0,1			; leave slot for "return borrow flag"
||	STW	A0,*A3++		; write result
||	AND	1,A1,A2			; pass on borrow flag
;;====================================================================
	BNOP	RA,4
	AND	1,A1,RET		; return borrow flag
	.endasmfunc

;;====================================================================
;; bn_div_words(hi=ARG0/A4, lo=ARG1/B4, dv=ARG2/A6)
;; Returns hi:lo / dv via a normalized shift-subtract loop (32+lzc(dv)
;; iterations in ILC). If the quotient would not fit in 32 bits
;; (hi has fewer leading zeros than dv), returns (BN_ULONG)-1.
;;====================================================================
	.global	_bn_div_words
_bn_div_words:
	.asmfunc
	LMBD	1,A6,A0			; leading zero bits in dv
	LMBD	1,A4,A1			; leading zero bits in hi
||	MVK	32,B0
	CMPLTU	A1,A0,A2		; A2 = overflow predicate
||	ADD	A0,B0,B0		; loop count = 32 + lzc(dv)
   [ A2] BNOP	RA
||[ A2]	MVK	-1,A4			; return overflow
||[!A2]	MV	A4,A3			; reassign hi
   [!A2] MV	B4,A4			; reassign lo, will be quotient
||[!A2]	MVC	B0,ILC
   [!A2] SHL	A6,A0,A6		; normalize dv
||	MVK	1,A1

   [!A2] CMPLTU	A3,A6,A1		; hi<dv?
||[!A2]	SHL	A4,1,A5:A4		; lo<<1
   [!A1] SUB	A3,A6,A3		; hi-=dv
||[!A1]	OR	1,A4,A4
   [!A2] SHRU	A3,31,A1		; upper bit
||[!A2]	ADDAH	A5,A3,A3		; hi<<1|lo>>31

	SPLOOP	3
   [!A1] CMPLTU	A3,A6,A1		; hi<dv?
||[ A1]	ZERO	A1
||	SHL	A4,1,A5:A4		; lo<<1
   [!A1] SUB	A3,A6,A3		; hi-=dv
||[!A1]	OR	1,A4,A4			; quotient
	SHRU	A3,31,A1		; upper bit
||	ADDAH	A5,A3,A3		; hi<<1|lo>>31
	SPKERNEL

	BNOP	RA,5
	.endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
;; bn_mul_comba8(rp=ARG0, ap=ARG1, bp=ARG2): 8x8-word schoolbook
;; multiply, rp[0..15] = ap[0..7]*bp[0..7], via a nested SPLOOP built
;; from the bn_mul_add_words kernel. bn_sqr_comba8 is the bp=ap case.
;;====================================================================
	.global	_bn_sqr_comba8
	.global	_bn_mul_comba8
_bn_sqr_comba8:
	MV	ARG1,ARG2		; squaring: bp = ap, fall through
_bn_mul_comba8:
	.asmfunc
	MVK	8,B0			; N, RILC
||	MVK	8,A0			; M, outer loop counter
||	MV	ARG1,A5			; copy ap
||	MV	ARG0,B4			; copy rp
||	ZERO	B19			; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1			; N-2, initial ILC
||	SUB	B0,1,B2			; const B2=N-1
||	LDW	*A5++,B6		; ap[0]
||	MV	A0,A3			; const A3=M
sploopNxM?:				; for best performance arrange M<=N
   [A0]	SPLOOPD	2			; 2*n+10
||	MVC	B1,ILC
||	ADDAW	B4,B0,B5		; B5 -> rp[N], read pointer
||	ZERO	B7
||	LDW	*A5++,A9		; pre-fetch ap[1]
||	ZERO	A1			; A1=0: first pass skips rp read
||	SUB	A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
	LDW	*ARG2++,A7		; bp[i]
	NOP	3
   [A1]	LDW	*B5++,B7		; rp[i]
	MPY32U	A7,B6,B17:B16
	NOP	3
	ADDU	B16,B7,B21:B20
	ADDU	B19,B21:B20,B19:B18
||	MV.S	B17,B23
	SPKERNEL
||	STW	B18,*B4++		; rp[i]
||	ADD.S	B19,B23,B19
;;====================================================================
outer?:					; m*2*(n+1)+10
	SUBAW	ARG2,A3,ARG2		; rewind bp to bp[0]
	SPMASKR
||	CMPGT	A0,1,A2			; done pre-fetching ap[i+1]?
	MVD	A9,B6			; move through .M unit(*)
   [A2]	LDW	*A5++,A9		; pre-fetch ap[i+1]
	SUBAW	B5,B2,B5		; rewind rp to rp[1]
	MVK	1,A1			; subsequent passes do read rp[i]
   [A0]	BNOP.S1	outer?,4
|| [A0]	SUB.L	A0,1,A0
	STW	B19,*B4--[B2]		; rewind rp to rp[1]
||	ZERO.S	B19			; high part of accumulator
;; end of outer?
	BNOP	RA,5			; return
	.endasmfunc
;; (*)	It should be noted that B6 is used as input to MPY32U in
;;	chronologically next cycle in *preceding* SPLOOP iteration.
;;	Normally such arrangement would require DINT, but at this
;;	point SPLOOP is draining and interrupts are disabled
;;	implicitly.

;;====================================================================
;; bn_mul_comba4(rp=ARG0, ap=ARG1, bp=ARG2): fully unrolled 4x4-word
;; multiply, rp[0..7] = ap[0..3]*bp[0..3]. bn_sqr_comba4 is the bp=ap
;; case. Partial products are accumulated column by column with the
;; carry chain split across the A and B register files.
;;====================================================================
	.global	_bn_sqr_comba4
	.global	_bn_mul_comba4
_bn_sqr_comba4:
	MV	ARG1,ARG2		; squaring: bp = ap, fall through
_bn_mul_comba4:
	.asmfunc
	.if	0
	BNOP	sploopNxM?,3
	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
	;; because of low-counter effect, when prologue phase finishes
	;; before SPKERNEL instruction is reached. As result it's 25%
	;; slower than expected...
	MVK	4,B0			; N, RILC
||	MVK	4,A0			; M, outer loop counter
||	MV	ARG1,A5			; copy ap
||	MV	ARG0,B4			; copy rp
||	ZERO	B19			; high part of accumulator
	MVC	B0,RILC
||	SUB	B0,2,B1			; first ILC
||	SUB	B0,1,B2			; const B2=N-1
||	LDW	*A5++,B6		; ap[0]
||	MV	A0,A3			; const A3=M
	.else
	;; This alternative is an exercise in fully unrolled Comba
	;; algorithm implementation that operates at n*(n+1)+12, or
	;; as little as 32 cycles...
	LDW	*ARG1[0],B16		; a[0]
||	LDW	*ARG2[0],A16		; b[0]
	LDW	*ARG1[1],B17		; a[1]
||	LDW	*ARG2[1],A17		; b[1]
	LDW	*ARG1[2],B18		; a[2]
||	LDW	*ARG2[2],A18		; b[2]
	LDW	*ARG1[3],B19		; a[3]
||	LDW	*ARG2[3],A19		; b[3]
	NOP
	MPY32U	A16,B16,A1:A0		; a[0]*b[0]
	MPY32U	A17,B16,A23:A22		; a[0]*b[1]
	MPY32U	A16,B17,A25:A24		; a[1]*b[0]
	MPY32U	A16,B18,A27:A26		; a[2]*b[0]
	STW	A0,*ARG0[0]		; rp[0]
||	MPY32U	A17,B17,A29:A28		; a[1]*b[1]
	MPY32U	A18,B16,A31:A30		; a[0]*b[2]
||	ADDU	A22,A1,A1:A0		; column 1 accumulation begins
	MV	A23,B0
||	MPY32U	A19,B16,A21:A20		; a[3]*b[0]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B0,B1:B0
||	STW	A0,*ARG0[1]		; rp[1]
||	MPY32U	A18,B17,A23:A22		; a[2]*b[1]
||	ADDU	A26,A1,A9:A8		; column 2 accumulation begins
	ADDU	A27,B1,B9:B8
||	MPY32U	A17,B18,A25:A24		; a[1]*b[2]
||	ADDU	A28,A9:A8,A9:A8
	ADDU	A29,B9:B8,B9:B8
||	MPY32U	A16,B19,A27:A26		; a[0]*b[3]
||	ADDU	A30,A9:A8,A9:A8
	ADDU	A31,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8		; fold B-side carries into column 2
	STW	A8,*ARG0[2]		; rp[2]
||	ADDU	A20,A9,A1:A0		; column 3 accumulation begins
	ADDU	A21,B9,B1:B0
||	MPY32U	A19,B17,A21:A20		; a[3]*b[1]
||	ADDU	A22,A1:A0,A1:A0
	ADDU	A23,B1:B0,B1:B0
||	MPY32U	A18,B18,A23:A22		; a[2]*b[2]
||	ADDU	A24,A1:A0,A1:A0
	ADDU	A25,B1:B0,B1:B0
||	MPY32U	A17,B19,A25:A24		; a[1]*b[3]
||	ADDU	A26,A1:A0,A1:A0
	ADDU	A27,B1:B0,B1:B0
||	ADDU	B8,A1:A0,A1:A0		; fold B-side carries into column 3
	STW	A0,*ARG0[3]		; rp[3]
||	MPY32U	A19,B18,A27:A26		; a[3]*b[2]
||	ADDU	A20,A1,A9:A8		; column 4 accumulation begins
	ADDU	A21,B1,B9:B8
||	MPY32U	A18,B19,A29:A28		; a[2]*b[3]
||	ADDU	A22,A9:A8,A9:A8
	ADDU	A23,B9:B8,B9:B8
||	MPY32U	A19,B19,A31:A30		; a[3]*b[3]
||	ADDU	A24,A9:A8,A9:A8
	ADDU	A25,B9:B8,B9:B8
||	ADDU	B0,A9:A8,A9:A8		; fold B-side carries into column 4
	STW	A8,*ARG0[4]		; rp[4]
||	ADDU	A26,A9,A1:A0		; column 5 accumulation begins
	ADDU	A27,B9,B1:B0
||	ADDU	A28,A1:A0,A1:A0
	ADDU	A29,B1:B0,B1:B0
||	BNOP	RA			; begin delayed return
||	ADDU	B8,A1:A0,A1:A0
	STW	A0,*ARG0[5]		; rp[5]
||	ADDU	A30,A1,A9:A8		; column 6 accumulation
	ADD	A31,B1,B8
	ADDU	B0,A9:A8,A9:A8		; removed || to avoid cross-path stall below
	ADD	B8,A9,A9
||	STW	A8,*ARG0[6]		; rp[6]
	STW	A9,*ARG0[7]		; rp[7], top word
	.endif
	.endasmfunc