xref: /openssl/crypto/bn/asm/mips-mont.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# This module doesn't present direct interest for OpenSSL, because it
18# doesn't provide better performance for longer keys, at least not on
19# in-order-execution cores. While 512-bit RSA sign operations can be
20# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
21# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
22# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
23# verify:-( All comparisons are against bn_mul_mont-free assembler.
24# The module might be of interest to embedded system developers, as
25# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
26# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
27# code.
28
29######################################################################
30# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
31# widely used. Then there is a new contender: NUBI. It appears that if
32# one picks the latter, it's possible to arrange code in an ABI-neutral
33# manner. Therefore let's stick to the NUBI register layout:
34#
# Bind each NUBI register name to the numeric "$N" spelling that the
# assembly templates further down interpolate.
($zero, $at, $t0, $t1, $t2) =
	map { sprintf('$%d', $_) } 0 .. 2, 24, 25;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) =
	map { sprintf('$%d', $_) } 4 .. 11;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
	map { sprintf('$%d', $_) } 12 .. 23;
($gp, $tp, $sp, $fp, $ra) =
	map { sprintf('$%d', $_) } 3, 28 .. 31;
39#
40# The return value is placed in $a0. The following coding rules facilitate
41# interoperability:
42#
43# - never ever touch $tp, "thread pointer", former $gp;
44# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
45#   old code];
46# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
47#
48# For reference here is register layout for N32/64 MIPS ABIs:
49#
50# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
51# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
52# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
53# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
54# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
55
# Command line: [flavour] [output]
#  - the last argument is taken as $output when it ends in an extension;
#  - the first remaining argument is taken as $flavour when it contains no
#    dot. Supported flavours are o32, n32, 64, nubi32 and nubi64; o32 is the
#    default.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : "o32";
61
# Pointer arithmetic and register save/restore mnemonics, plus the size in
# bytes of one saved register. Start from the 32-bit set and switch to the
# 64-bit one for any flavour matching 64 or n32 (per the original note,
# daddu/dsubu incidentally work even on n32).
($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG) = ("addu", "subu", "sw", "lw", 4);
($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG) = ("daddu", "dsubu", "sd", "ld", 8)
	if $flavour =~ /64|n32/i;

# Bit mask of saved registers used in the .mask directive: bits 16..23 by
# default, widened to bits 12..23 for nubi flavours.
$SAVED_REGS_MASK = 0x00ff0000;
$SAVED_REGS_MASK = 0x00fff000 if $flavour =~ /nubi/i;
76#
77# <appro@openssl.org>
78#
79######################################################################
80
# Redirect STDOUT to the requested output file, if one was given.
# The three-argument form keeps the file name from being parsed as part of
# the open mode, and a failed open is fatal instead of silently leaving the
# generated assembly on the original STDOUT.
if ($output) {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}
82
# Mnemonics used on big-number words (load, store, unsigned multiply, add,
# subtract) and the size in bytes of one such word: the 64-bit set for any
# flavour matching 64 or n32, the 32-bit set otherwise.
($LD, $ST, $MULTU, $ADDU, $SUBU, $BNSZ) =
	$flavour =~ /64|n32/i
		? ("ld", "sd", "dmultu", "daddu", "dsubu", 8)
		: ("lw", "sw", "multu",  "addu",  "subu",  4);
98
# C prototype implemented by the generated code:
#
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
#
# The six arguments are bound, in order, to $a0..$a5.
($rp, $ap, $bp, $np, $n0, $num) = ($a0, $a1, $a2, $a3, $a4, $a5);

# Working registers used by the assembly templates. Note that $tp is rebound
# here to $s3; from this point on it no longer names the thread pointer.
($lo0, $hi0, $lo1, $hi1) = ($a6, $a7, $t1, $t2);
($aj,  $bi,  $nj,  $tp)  = ($s0, $s1, $s2, $s3);
($alo, $ahi, $nlo, $nhi) = ($s4, $s5, $s6, $s7);
($tj,  $i,   $j,   $m1)  = ($s8, $s9, $s10, $s11);

# Size of bn_mul_mont_internal's register save frame, counted in
# register-sized slots (the templates multiply it by $SZREG).
$FRAMESIZE = 14;
125
# Start the generated assembly: include mips_arch.h, select .text with the
# "noat" and "noreorder" assembler modes, and emit the global entry label
# bn_mul_mont. Everything between the ___ markers is output text, not Perl.
126$code=<<___;
127#include "mips_arch.h"
128
129.text
130
131.set	noat
132.set	noreorder
133
134.align	5
135.globl	bn_mul_mont
136.ent	bn_mul_mont
137bn_mul_mont:
138___
# o32 flavours only: fetch n0 and num from the caller's stack, at 16($sp) and
# 20($sp), into the registers the rest of the code uses for them.
139$code.=<<___ if ($flavour =~ /o32/i);
140	lw	$n0,16($sp)
141	lw	$num,20($sp)
142___
# Entry stub and start of bn_mul_mont_internal.
# The stub returns 0 (in both $t0 and $a0) when num < 4 or num >= 17, and
# branches to bn_mul_mont_internal otherwise. "noreorder" is in effect, so the
# instruction written after each branch or jump occupies its delay slot.
# bn_mul_mont_internal then lowers $sp by $FRAMESIZE*$SZREG and stores $fp
# and $s11..$s4 in the topmost slots of that frame.
143$code.=<<___;
144	slt	$at,$num,4
145	bnez	$at,1f
146	li	$t0,0
147	slt	$at,$num,17	# on in-order CPU
148	bnez	$at,bn_mul_mont_internal
149	nop
1501:	jr	$ra
151	li	$a0,0
152.end	bn_mul_mont
153
154.align	5
155.ent	bn_mul_mont_internal
156bn_mul_mont_internal:
157	.frame	$fp,$FRAMESIZE*$SZREG,$ra
158	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
159	$PTR_SUB $sp,$FRAMESIZE*$SZREG
160	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
161	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
162	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
163	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
164	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
165	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
166	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
167	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
168	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
169___
# nubi flavours only: also save $s3..$s0 in the next four frame slots; this
# goes with the wider $SAVED_REGS_MASK selected for nubi.
170$code.=<<___ if ($flavour =~ /nubi/i);
171	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
172	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
173	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
174	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
175___
# Main body of bn_mul_mont_internal. In order, the template:
#  - sets $fp to the frame base and loads the word n0 points at, together
#    with bp[0], ap[0] and np[0];
#  - reserves a scratch vector on the stack: two extra words plus num words
#    (the sll turns num into a byte count), then rounds $sp down with the
#    -4096 mask;
#  - derives $m1 from the low word of ap[0]*bp[0] multiplied by n0, and runs
#    .L1st, which stores the first pass into the scratch vector walked by $tp;
#  - runs .Louter/.Linner for each further word of bp ($i advances by $BNSZ
#    until it reaches num), folding the existing scratch contents ($tj) in;
#  - runs .Lsub, which stores tp[i]-np[i] into rp with the borrow kept in $hi0;
#  - runs .Lcopy, which writes (tp[i] & $hi0) | (rp[i] & $hi1) to rp, $hi1
#    being the complement of $hi0, and zeroes tp[i] as it goes;
#  - places 1 in $a0 and $t0, restores $sp from $fp and reloads $fp and
#    $s11..$s4.
# The parenthesised $MULTU/mflo/mfhi operand forms are presumably macros
# supplied by mips_arch.h; they are not defined in this file.
# The backquoted expression in the sll line is replaced by its value by the
# substitution applied to $code at the end of the script.
176$code.=<<___;
177	move	$fp,$sp
178
179	.set	reorder
180	$LD	$n0,0($n0)
181	$LD	$bi,0($bp)	# bp[0]
182	$LD	$aj,0($ap)	# ap[0]
183	$LD	$nj,0($np)	# np[0]
184
185	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
186	sll	$num,`log($BNSZ)/log(2)`
187	li	$at,-4096
188	$PTR_SUB $sp,$num
189	and	$sp,$at
190
191	$MULTU	($aj,$bi)
192	$LD	$ahi,$BNSZ($ap)
193	$LD	$nhi,$BNSZ($np)
194	mflo	($lo0,$aj,$bi)
195	mfhi	($hi0,$aj,$bi)
196	$MULTU	($lo0,$n0)
197	mflo	($m1,$lo0,$n0)
198
199	$MULTU	($ahi,$bi)
200	mflo	($alo,$ahi,$bi)
201	mfhi	($ahi,$ahi,$bi)
202
203	$MULTU	($nj,$m1)
204	mflo	($lo1,$nj,$m1)
205	mfhi	($hi1,$nj,$m1)
206	$MULTU	($nhi,$m1)
207	$ADDU	$lo1,$lo0
208	sltu	$at,$lo1,$lo0
209	$ADDU	$hi1,$at
210	mflo	($nlo,$nhi,$m1)
211	mfhi	($nhi,$nhi,$m1)
212
213	move	$tp,$sp
214	li	$j,2*$BNSZ
215.align	4
216.L1st:
217	.set	noreorder
218	$PTR_ADD $aj,$ap,$j
219	$PTR_ADD $nj,$np,$j
220	$LD	$aj,($aj)
221	$LD	$nj,($nj)
222
223	$MULTU	($aj,$bi)
224	$ADDU	$lo0,$alo,$hi0
225	$ADDU	$lo1,$nlo,$hi1
226	sltu	$at,$lo0,$hi0
227	sltu	$t0,$lo1,$hi1
228	$ADDU	$hi0,$ahi,$at
229	$ADDU	$hi1,$nhi,$t0
230	mflo	($alo,$aj,$bi)
231	mfhi	($ahi,$aj,$bi)
232
233	$ADDU	$lo1,$lo0
234	sltu	$at,$lo1,$lo0
235	$MULTU	($nj,$m1)
236	$ADDU	$hi1,$at
237	addu	$j,$BNSZ
238	$ST	$lo1,($tp)
239	sltu	$t0,$j,$num
240	mflo	($nlo,$nj,$m1)
241	mfhi	($nhi,$nj,$m1)
242
243	bnez	$t0,.L1st
244	$PTR_ADD $tp,$BNSZ
245	.set	reorder
246
247	$ADDU	$lo0,$alo,$hi0
248	sltu	$at,$lo0,$hi0
249	$ADDU	$hi0,$ahi,$at
250
251	$ADDU	$lo1,$nlo,$hi1
252	sltu	$t0,$lo1,$hi1
253	$ADDU	$hi1,$nhi,$t0
254	$ADDU	$lo1,$lo0
255	sltu	$at,$lo1,$lo0
256	$ADDU	$hi1,$at
257
258	$ST	$lo1,($tp)
259
260	$ADDU	$hi1,$hi0
261	sltu	$at,$hi1,$hi0
262	$ST	$hi1,$BNSZ($tp)
263	$ST	$at,2*$BNSZ($tp)
264
265	li	$i,$BNSZ
266.align	4
267.Louter:
268	$PTR_ADD $bi,$bp,$i
269	$LD	$bi,($bi)
270	$LD	$aj,($ap)
271	$LD	$ahi,$BNSZ($ap)
272	$LD	$tj,($sp)
273
274	$MULTU	($aj,$bi)
275	$LD	$nj,($np)
276	$LD	$nhi,$BNSZ($np)
277	mflo	($lo0,$aj,$bi)
278	mfhi	($hi0,$aj,$bi)
279	$ADDU	$lo0,$tj
280	$MULTU	($lo0,$n0)
281	sltu	$at,$lo0,$tj
282	$ADDU	$hi0,$at
283	mflo	($m1,$lo0,$n0)
284
285	$MULTU	($ahi,$bi)
286	mflo	($alo,$ahi,$bi)
287	mfhi	($ahi,$ahi,$bi)
288
289	$MULTU	($nj,$m1)
290	mflo	($lo1,$nj,$m1)
291	mfhi	($hi1,$nj,$m1)
292
293	$MULTU	($nhi,$m1)
294	$ADDU	$lo1,$lo0
295	sltu	$at,$lo1,$lo0
296	$ADDU	$hi1,$at
297	mflo	($nlo,$nhi,$m1)
298	mfhi	($nhi,$nhi,$m1)
299
300	move	$tp,$sp
301	li	$j,2*$BNSZ
302	$LD	$tj,$BNSZ($tp)
303.align	4
304.Linner:
305	.set	noreorder
306	$PTR_ADD $aj,$ap,$j
307	$PTR_ADD $nj,$np,$j
308	$LD	$aj,($aj)
309	$LD	$nj,($nj)
310
311	$MULTU	($aj,$bi)
312	$ADDU	$lo0,$alo,$hi0
313	$ADDU	$lo1,$nlo,$hi1
314	sltu	$at,$lo0,$hi0
315	sltu	$t0,$lo1,$hi1
316	$ADDU	$hi0,$ahi,$at
317	$ADDU	$hi1,$nhi,$t0
318	mflo	($alo,$aj,$bi)
319	mfhi	($ahi,$aj,$bi)
320
321	$ADDU	$lo0,$tj
322	addu	$j,$BNSZ
323	$MULTU	($nj,$m1)
324	sltu	$at,$lo0,$tj
325	$ADDU	$lo1,$lo0
326	$ADDU	$hi0,$at
327	sltu	$t0,$lo1,$lo0
328	$LD	$tj,2*$BNSZ($tp)
329	$ADDU	$hi1,$t0
330	sltu	$at,$j,$num
331	mflo	($nlo,$nj,$m1)
332	mfhi	($nhi,$nj,$m1)
333	$ST	$lo1,($tp)
334	bnez	$at,.Linner
335	$PTR_ADD $tp,$BNSZ
336	.set	reorder
337
338	$ADDU	$lo0,$alo,$hi0
339	sltu	$at,$lo0,$hi0
340	$ADDU	$hi0,$ahi,$at
341	$ADDU	$lo0,$tj
342	sltu	$t0,$lo0,$tj
343	$ADDU	$hi0,$t0
344
345	$LD	$tj,2*$BNSZ($tp)
346	$ADDU	$lo1,$nlo,$hi1
347	sltu	$at,$lo1,$hi1
348	$ADDU	$hi1,$nhi,$at
349	$ADDU	$lo1,$lo0
350	sltu	$t0,$lo1,$lo0
351	$ADDU	$hi1,$t0
352	$ST	$lo1,($tp)
353
354	$ADDU	$lo1,$hi1,$hi0
355	sltu	$hi1,$lo1,$hi0
356	$ADDU	$lo1,$tj
357	sltu	$at,$lo1,$tj
358	$ADDU	$hi1,$at
359	$ST	$lo1,$BNSZ($tp)
360	$ST	$hi1,2*$BNSZ($tp)
361
362	addu	$i,$BNSZ
363	sltu	$t0,$i,$num
364	bnez	$t0,.Louter
365
366	.set	noreorder
367	$PTR_ADD $tj,$sp,$num	# &tp[num]
368	move	$tp,$sp
369	move	$ap,$sp
370	li	$hi0,0		# clear borrow bit
371
372.align	4
373.Lsub:	$LD	$lo0,($tp)
374	$LD	$lo1,($np)
375	$PTR_ADD $tp,$BNSZ
376	$PTR_ADD $np,$BNSZ
377	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
378	sgtu	$at,$lo1,$lo0
379	$SUBU	$lo0,$lo1,$hi0
380	sgtu	$hi0,$lo0,$lo1
381	$ST	$lo0,($rp)
382	or	$hi0,$at
383	sltu	$at,$tp,$tj
384	bnez	$at,.Lsub
385	$PTR_ADD $rp,$BNSZ
386
387	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
388	move	$tp,$sp
389	$PTR_SUB $rp,$num	# restore rp
390	not	$hi1,$hi0
391
392.Lcopy:	$LD	$nj,($tp)	# conditional move
393	$LD	$aj,($rp)
394	$ST	$zero,($tp)
395	$PTR_ADD $tp,$BNSZ
396	and	$nj,$hi0
397	and	$aj,$hi1
398	or	$aj,$nj
399	sltu	$at,$tp,$tj
400	$ST	$aj,($rp)
401	bnez	$at,.Lcopy
402	$PTR_ADD $rp,$BNSZ
403
404	li	$a0,1
405	li	$t0,1
406
407	.set	noreorder
408	move	$sp,$fp
409	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
410	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
411	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
412	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
413	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
414	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
415	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
416	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
417	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
418___
# nubi flavours only: reload $s3..$s0 from the same four frame slots the
# nubi-only part of the prologue stored them in.
419$code.=<<___ if ($flavour =~ /nubi/i);
420	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
421	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
422	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
423	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
424___
# Return to the caller. "noreorder" is still in effect, so the $PTR_ADD that
# releases the $FRAMESIZE*$SZREG frame sits in the delay slot of the jr.
# The .rdata section carries an identification string for the module.
425$code.=<<___;
426	jr	$ra
427	$PTR_ADD $sp,$FRAMESIZE*$SZREG
428.end	bn_mul_mont_internal
429.rdata
430.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
431___
432
# Replace every backquoted span in the template with the value of the Perl
# expression it contains (for example the shift count derived from $BNSZ).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# Write the finished assembly out. A failed close is fatal so that buffered
# write errors are not lost.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
437