#! /usr/bin/env perl
# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
#
# This script is a code generator: it prints DEC Alpha assembly for
# bn_mul_mont() (Montgomery multiplication) to STDOUT, optionally
# redirected to the file named by the last command-line argument.

use strict;
use warnings;

# Last argument, if any, is the output file; otherwise emit to STDOUT.
my $output = pop @ARGV;
if ($output) {
	open STDOUT, '>', $output
	    or die "can't open $output: $!";
}

# Symbolic register names used inside the assembly template below.
# The heredoc interpolates these Perl scalars into the emitted code.
#
# int bn_mul_mont(
my $rp="a0";	# BN_ULONG *rp,
my $ap="a1";	# const BN_ULONG *ap,
my $bp="a2";	# const BN_ULONG *bp,
my $np="a3";	# const BN_ULONG *np,
my $n0="a4";	# const BN_ULONG *n0,
my $num="a5";	# int num);

# Temporaries (t0-t12) and callee-saved scratch (s3-s5).
my $lo0="t0";
my $hi0="t1";
my $lo1="t2";
my $hi1="t3";
my $aj="t4";
my $bi="t5";
my $nj="t6";
my $tp="t7";
my $alo="t8";
my $ahi="t9";
my $nlo="t10";
my $nhi="t11";
my $tj="t12";
my $i="s3";
my $j="s4";
my $m1="s5";

# The assembly template. NOTE: this heredoc is the program's output and
# must stay byte-identical — its '#' lines are assembler comments, not
# Perl comments.
my $code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num
	mov	0,v0
	cmplt	$num,4,AT
	bne	AT,.Lexit

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT
	ldq	$aj,8($ap)
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)
	and	sp,AT,sp

	mulq	$hi0,$bi,$lo0
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0
	ldq	$nj,8($np)

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)
	addl	$j,1,$j
	ldq	$nj,0($nj)
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)

	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)

	mov	1,$i
.align	4
.Louter:
	s8addq	$i,$bp,$bi
	ldq	$hi0,0($ap)
	ldq	$aj,8($ap)
	ldq	$bi,0($bi)
	ldq	$hi1,0($np)
	ldq	$nj,8($np)
	ldq	$tj,0(sp)

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)
	addq	$lo1,$lo0,$j
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)
	bne	$tj,.Louter

	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	sp,$ap
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT
	subq	$lo1,$hi0,$lo0
	cmpult	$lo1,$lo0,$hi0
	or	$hi0,AT,$hi0
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Emit the generated assembly and make sure buffered output actually
# reached the destination (write errors surface at close).
print $code;
close STDOUT or die "error closing STDOUT: $!";