#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.

######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
# supported flavours are o32,n32,64,nubi32,nubi64, default is o32
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";

# Pointer arithmetic and register save/restore mnemonics, plus the size
# in bytes of one saved register, for the selected flavour.
if ($flavour =~ /64|n32/i) {
	$PTR_ADD="daddu";	# incidentally works even on n32
	$PTR_SUB="dsubu";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$SZREG=8;
} else {
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$REG_S="sw";
	$REG_L="lw";
	$SZREG=4;
}
# Bits 16-23 cover $s4-$s11; NUBI flavours additionally save $s0-$s3
# (registers 12-15), hence the wider mask.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################

# Redirect STDOUT to the requested output file. Use the three-argument
# form so the file name is never interpreted as a mode or a pipe, and
# fail immediately, naming the file, if it cannot be created.
if ($output) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Load/store/arithmetic mnemonics and the size in bytes of one BN_ULONG
# limb for the selected flavour.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$BNSZ=8;
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$ADDU="addu";
	$SUBU="subu";
	$BNSZ=4;
}

# int bn_mul_mont(
$rp=$a0;	# BN_ULONG *rp,
$ap=$a1;	# const BN_ULONG *ap,
$bp=$a2;	# const BN_ULONG *bp,
$np=$a3;	# const BN_ULONG *np,
$n0=$a4;	# const BN_ULONG *n0,
$num=$a5;	# int num);

# Working registers. Note that $tp is deliberately re-bound here from the
# NUBI "thread pointer" register to $s3, which the code uses as a pointer
# into the temporary vector on the stack.
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;

# Stack frame size of bn_mul_mont_internal, in $SZREG-sized slots.
$FRAMESIZE=14;

# Public entry point: on o32 the 5th and 6th arguments are fetched from
# the stack; the routine returns 0 when num is below 4 or is 17 or more,
# otherwise it branches to bn_mul_mont_internal.
$code=<<___;
#include "mips_arch.h"

.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

	$MULTU	($aj,$bi)
	$LD	$ahi,$BNSZ($ap)
	$LD	$nhi,$BNSZ($np)
	mflo	($lo0,$aj,$bi)
	mfhi	($hi0,$aj,$bi)
	$MULTU	($lo0,$n0)
	mflo	($m1,$lo0,$n0)

	$MULTU	($ahi,$bi)
	mflo	($alo,$ahi,$bi)
	mfhi	($ahi,$ahi,$bi)

	$MULTU	($nj,$m1)
	mflo	($lo1,$nj,$m1)
	mfhi	($hi1,$nj,$m1)
	$MULTU	($nhi,$m1)
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	($nlo,$nhi,$m1)
	mfhi	($nhi,$nhi,$m1)

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	($aj,$bi)
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	($alo,$aj,$bi)
	mfhi	($ahi,$aj,$bi)

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	($nj,$m1)
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	($nlo,$nj,$m1)
	mfhi	($nhi,$nj,$m1)

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$ahi,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	($aj,$bi)
	$LD	$nj,($np)
	$LD	$nhi,$BNSZ($np)
	mflo	($lo0,$aj,$bi)
	mfhi	($hi0,$aj,$bi)
	$ADDU	$lo0,$tj
	$MULTU	($lo0,$n0)
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	($m1,$lo0,$n0)

	$MULTU	($ahi,$bi)
	mflo	($alo,$ahi,$bi)
	mfhi	($ahi,$ahi,$bi)

	$MULTU	($nj,$m1)
	mflo	($lo1,$nj,$m1)
	mfhi	($hi1,$nj,$m1)

	$MULTU	($nhi,$m1)
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	($nlo,$nhi,$m1)
	mfhi	($nhi,$nhi,$m1)

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	($aj,$bi)
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	($alo,$aj,$bi)
	mfhi	($ahi,$aj,$bi)

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	($nj,$m1)
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	($nlo,$nj,$m1)
	mfhi	($nhi,$nj,$m1)
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

.Lcopy:	$LD	$nj,($tp)	# conditional move
	$LD	$aj,($rp)
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	and	$nj,$hi0
	and	$aj,$hi1
	or	$aj,$nj
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every backtick-quoted Perl expression in the template with its
# value (here: the shift count log($BNSZ)/log(2) used to scale num).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";