xref: /openssl/crypto/bn/asm/mips.pl (revision 7ed6de99)
1#! /usr/bin/env perl
2# Copyright 2010-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project.
13#
14# Rights for redistribution and usage in source and binary forms are
15# granted according to the License. Warranty of any kind is disclaimed.
16# ====================================================================
17
18
19# July 1999
20#
21# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
22#
23# The module is designed to work with either of the "new" MIPS ABI(5),
24# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
25# IRIX 5.x not only because it doesn't support new ABIs but also
26# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
27# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
28# cause illegal instruction exception:-(
29#
30# In addition the code depends on preprocessor flags set up by MIPSpro
31# compiler driver (either as or cc) and therefore (probably?) can't be
32# compiled by the GNU assembler. GNU C driver manages fine though...
33# I mean as long as -mmips-as is specified or is the default option,
34# because then it simply invokes /usr/bin/as which in turn takes
35# perfect care of the preprocessor definitions. Another neat feature
36# offered by the MIPSpro assembler is an optimization pass. This gave
37# me the opportunity to have the code looking more regular as all those
38# architecture dependent instruction rescheduling details were left to
39# the assembler. Cool, huh?
40#
41# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
42# goes way over 3 times faster!
43#
44#					<appro@openssl.org>
45
46# October 2010
47#
48# Adapt the module even for 32-bit ABIs and other OSes. The former was
49# achieved by mechanical replacement of 64-bit arithmetic instructions
50# such as dmultu, daddu, etc. with their 32-bit counterparts and
51# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
52# >3x performance improvement naturally does not apply to 32-bit code
53# [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark and
55# key length, more for longer keys.
56
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";

# Pick instruction mnemonics and sizes for the requested ABI: the 64-bit
# flavours (n32/n64) use the doubleword instructions and 8-byte BN_ULONG,
# everything else uses the word instructions and 4-byte BN_ULONG.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;		# sizeof(BN_ULONG)
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;		# size of one register save slot
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;		# sizeof(BN_ULONG)
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;		# size of one register save slot
	$REG_S="sw";
	$REG_L="lw";
	# 32-bit code asks the assembler for at least the MIPS II ISA unless
	# the target is a release-6 ISA (guarded in the emitted preprocessor
	# conditional).
	$code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set     mips2\n#endif\n";
}

# Redirect the generated assembly to the output file, if one was given.
# Use the checked 3-argument form of open instead of the original
# unchecked 2-argument 'open STDOUT,">$output"'.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
96
97# Below is N32/64 register layout used in the original module.
98#
# N32/N64 register layout: map symbolic register names to their hardware
# register numbers.
($zero, $at, $v0, $v1)                   = map { "\$$_" } 0 .. 3;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { "\$$_" } 4 .. 11;
($t0, $t1, $t2, $t3, $t8, $t9)           = map { "\$$_" } 12 .. 15, 24, 25;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7) = map { "\$$_" } 16 .. 23;
($gp, $sp, $fp, $ra)                     = map { "\$$_" } 28 .. 31;
($ta0, $ta1, $ta2, $ta3) = ($a4, $a5, $a6, $a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

if ($flavour =~ /nubi/i) { $gp = $v1; }

$minus4 = $v1;
112
# bn_mul_add_words: rp in $a0, ap in $a1, word count in $a2, multiplier
# word in $a3.  Computes rp[i] += ap[i]*w, carrying in $v0, and returns the
# final carry.  This heredoc also emits the file preamble: mfqt/mfrm macros
# that hide the R6 vs pre-R6 divide/multiply result conventions.
$code.=<<___;
#include "mips_arch.h"

#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt)	divu	rd,rs,rt
# define mfrm(rd,rs,rt)	modu	rd,rs,rt
#else
# define $DIVU(rs,rt)	$DIVU	$zero,rs,rt
# define mfqt(rd,rs,rt)	mflo	rd
# define mfrm(rd,rs,rt)	mfhi	rd
#endif

.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI ABI: $v1(=$gp here) and $t0..$t3 are callee-saved, so spill them.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4 words per iteration while at least 4 remain, then a tail
# that handles the leftover 0..3 words one at a time.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	 $MULTU	($t2,$a3)
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	 $MULTU	($ta0,$a3)
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	 $MULTU	($ta2,$a3)
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the callee-saved registers spilled above.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return with the final carry in $v0 (also copied to $a0); the entry stub
# of bn_mul_words follows in the same heredoc.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
# bn_mul_words: rp in $a0, ap in $a1, word count in $a2, multiplier word in
# $a3.  Computes rp[i] = ap[i]*w + carry, carrying in $v0, and returns the
# final carry.  NUBI flavours first spill the callee-saved $t0..$t3/$gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4 words per iteration, then a 0..3 word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	 $MULTU	($t2,$a3)
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	 $MULTU	($ta0,$a3)
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	 $MULTU	($ta2,$a3)
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the callee-saved registers spilled above.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0/$a0; the bn_sqr_words entry stub follows.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
# bn_sqr_words: rp in $a0, ap in $a1, word count in $a2.  For each input
# word stores the double-width square: rp[2i] = low(ap[i]^2),
# rp[2i+1] = high(ap[i]^2).  No carry chain is needed, so the output
# pointer simply advances twice as fast as the input pointer.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4 squarings per iteration, then a 0..3 word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	($t2,$t2)
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($t3,$t2,$t2)
	mfhi	($t2,$t2,$t2)
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	($ta0,$ta0)
	mflo	($ta1,$ta0,$ta0)
	mfhi	($ta0,$ta0,$ta0)
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	($ta2,$ta2)
	and	$ta0,$a2,$minus4
	mflo	($ta3,$ta2,$ta2)
	mfhi	($ta2,$ta2,$ta2)
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$t0)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the callee-saved registers spilled above.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Function return; the bn_add_words entry stub follows (note: word count
# for add/sub is in $a3, because $a2 is the second source array).
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
# bn_add_words: rp in $a0, ap in $a1, bp in $a2, word count in $a3.
# Computes rp[i] = ap[i] + bp[i] + carry and returns the final carry in
# $v0.  Each word needs two sltu checks: one for a+b overflow and one for
# adding the incoming carry.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4 words per iteration, then a 0..3 word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the callee-saved registers spilled above.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return carry in $v0/$a0; the bn_sub_words entry stub follows.
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
# bn_sub_words: rp in $a0, ap in $a1, bp in $a2, word count in $a3.
# Computes rp[i] = ap[i] - bp[i] - borrow and returns the final borrow
# (0 or 1) in $v0.  sltu detects a-b underflow, sgtu detects underflow
# from subtracting the incoming borrow.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Main body: 4 words per iteration, then a 0..3 word tail.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI epilogue: restore the callee-saved registers spilled above.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# bn_sub_words return, followed by the start of bn_div_3_words.  Note that
# the bn_div_3_words code is emitted inside '#if 0' (closed further below),
# so it is never assembled; it is kept as a historical reference.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

#if 0
/*
 * The bn_div_3_words entry point is reused for constant-time interface.
 * Implementation is retained as historical reference.
 */
.align 5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body of the disabled bn_div_3_words (still inside the '#if 0' opened in
# the previous heredoc): estimate the quotient via bn_div_words_internal,
# then correct it in an inner loop.
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	($ta2,$v0)
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	($t1,$ta2,$v0)
	mflo	($t0,$ta2,$v0)
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# End of the '#if 0' block, then the live bn_div_words entry: h in $a0,
# l in $a1, divisor d in $a2; returns -1 (rather than trapping) when the
# divisor is zero.
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal
#endif

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_div_words_internal prologue: count in $t9 how far the divisor must be
# shifted left for its top bit to be set (the bgtz/.-4 loop), signal
# overflow with 'break 6' if normalizing would shift significant dividend
# bits out of $a0, then normalize h:l ($a0:$a1) by the same amount.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Symbolic register names used by the division body: quotient digit,
# high half of the running dividend, high half of the divisor.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
# bn_div_words body: schoolbook division producing the quotient one
# half-register "digit" at a time (div1 then div2).  Each estimated digit
# is corrected downwards in an inner loop; the two digits are combined in
# $v0, and the denormalized remainder ends up in $a1/$v1.
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div1:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div2:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9		# restore $a2

	.set	noreorder
	move	$a1,$v1
___
# NUBI epilogue: restore the callee-saved registers spilled in the
# prologue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# Return the quotient in $v0 (copied to $a0).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
# The division scratch aliases are not used past this point.
undef $HH; undef $QT; undef $DH;
1037
# Register aliases for the comba (column-wise) multiplication/squaring
# routines that follow: a[0..7] and b[0..7] operand words, two product
# temporaries and the three column accumulators.
($a_0, $a_1, $a_2, $a_3) = ($t0, $t1, $t2, $t3);
($b_0, $b_1, $b_2, $b_3) = ($ta0, $ta1, $ta2, $ta3);

($a_4, $a_5, $a_6, $a_7) = ($s0, $s2, $s4, $a1);	# once we load a[7], no use for $a1
($b_4, $b_5, $b_6, $b_7) = ($s1, $s3, $s5, $a2);	# once we load b[7], no use for $a2

($t_1, $t_2, $c_1, $c_2, $c_3) = ($t8, $t9, $v0, $v1, $a3);
1045
# bn_mul_comba8 entry.  The comba code keeps all 16 operand words in
# registers, so the prologue must save $s0..$s5; NUBI flavours save
# $t0..$t3, $gp and $ra on top of that (see the .mask values).
$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
1082$code.=<<___;
1083
1084	.set	reorder
1085	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
1086				# R5000 box assembler barks on this
1087				# 1ine with "should not have mult/div
1088				# as last instruction in bb (R10K
1089				# bug)" warning. If anybody out there
1090				# has a clue about how to circumvent
1091				# this do send me a note.
1092				#		<appro\@fy.chalmers.se>
1093
1094	$LD	$b_0,0($a2)
1095	$LD	$a_1,$BNSZ($a1)
1096	$LD	$a_2,2*$BNSZ($a1)
1097	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
1098	$LD	$a_3,3*$BNSZ($a1)
1099	$LD	$b_1,$BNSZ($a2)
1100	$LD	$b_2,2*$BNSZ($a2)
1101	$LD	$b_3,3*$BNSZ($a2)
1102	mflo	($c_1,$a_0,$b_0)
1103	mfhi	($c_2,$a_0,$b_0)
1104
1105	$LD	$a_4,4*$BNSZ($a1)
1106	$LD	$a_5,5*$BNSZ($a1)
1107	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
1108	$LD	$a_6,6*$BNSZ($a1)
1109	$LD	$a_7,7*$BNSZ($a1)
1110	$LD	$b_4,4*$BNSZ($a2)
1111	$LD	$b_5,5*$BNSZ($a2)
1112	mflo	($t_1,$a_0,$b_1)
1113	mfhi	($t_2,$a_0,$b_1)
1114	$ADDU	$c_2,$t_1
1115	sltu	$at,$c_2,$t_1
1116	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
1117	$ADDU	$c_3,$t_2,$at
1118	$LD	$b_6,6*$BNSZ($a2)
1119	$LD	$b_7,7*$BNSZ($a2)
1120	$ST	$c_1,0($a0)	# r[0]=c1;
1121	mflo	($t_1,$a_1,$b_0)
1122	mfhi	($t_2,$a_1,$b_0)
1123	$ADDU	$c_2,$t_1
1124	sltu	$at,$c_2,$t_1
1125	 $MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
1126	$ADDU	$t_2,$at
1127	$ADDU	$c_3,$t_2
1128	sltu	$c_1,$c_3,$t_2
1129	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
1130
1131	mflo	($t_1,$a_2,$b_0)
1132	mfhi	($t_2,$a_2,$b_0)
1133	$ADDU	$c_3,$t_1
1134	sltu	$at,$c_3,$t_1
1135	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
1136	$ADDU	$t_2,$at
1137	$ADDU	$c_1,$t_2
1138	mflo	($t_1,$a_1,$b_1)
1139	mfhi	($t_2,$a_1,$b_1)
1140	$ADDU	$c_3,$t_1
1141	sltu	$at,$c_3,$t_1
1142	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
1143	$ADDU	$t_2,$at
1144	$ADDU	$c_1,$t_2
1145	sltu	$c_2,$c_1,$t_2
1146	mflo	($t_1,$a_0,$b_2)
1147	mfhi	($t_2,$a_0,$b_2)
1148	$ADDU	$c_3,$t_1
1149	sltu	$at,$c_3,$t_1
1150	 $MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
1151	$ADDU	$t_2,$at
1152	$ADDU	$c_1,$t_2
1153	sltu	$at,$c_1,$t_2
1154	$ADDU	$c_2,$at
1155	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
1156
1157	mflo	($t_1,$a_0,$b_3)
1158	mfhi	($t_2,$a_0,$b_3)
1159	$ADDU	$c_1,$t_1
1160	sltu	$at,$c_1,$t_1
1161	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
1162	$ADDU	$t_2,$at
1163	$ADDU	$c_2,$t_2
1164	sltu	$c_3,$c_2,$t_2
1165	mflo	($t_1,$a_1,$b_2)
1166	mfhi	($t_2,$a_1,$b_2)
1167	$ADDU	$c_1,$t_1
1168	sltu	$at,$c_1,$t_1
1169	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
1170	$ADDU	$t_2,$at
1171	$ADDU	$c_2,$t_2
1172	sltu	$at,$c_2,$t_2
1173	$ADDU	$c_3,$at
1174	mflo	($t_1,$a_2,$b_1)
1175	mfhi	($t_2,$a_2,$b_1)
1176	$ADDU	$c_1,$t_1
1177	sltu	$at,$c_1,$t_1
1178	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
1179	$ADDU	$t_2,$at
1180	$ADDU	$c_2,$t_2
1181	sltu	$at,$c_2,$t_2
1182	$ADDU	$c_3,$at
1183	mflo	($t_1,$a_3,$b_0)
1184	mfhi	($t_2,$a_3,$b_0)
1185	$ADDU	$c_1,$t_1
1186	sltu	$at,$c_1,$t_1
1187	 $MULTU	($a_4,$b_0)		# mul_add_c(a[4],b[0],c2,c3,c1);
1188	$ADDU	$t_2,$at
1189	$ADDU	$c_2,$t_2
1190	sltu	$at,$c_2,$t_2
1191	$ADDU	$c_3,$at
1192	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
1193
1194	mflo	($t_1,$a_4,$b_0)
1195	mfhi	($t_2,$a_4,$b_0)
1196	$ADDU	$c_2,$t_1
1197	sltu	$at,$c_2,$t_1
1198	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
1199	$ADDU	$t_2,$at
1200	$ADDU	$c_3,$t_2
1201	sltu	$c_1,$c_3,$t_2
1202	mflo	($t_1,$a_3,$b_1)
1203	mfhi	($t_2,$a_3,$b_1)
1204	$ADDU	$c_2,$t_1
1205	sltu	$at,$c_2,$t_1
1206	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
1207	$ADDU	$t_2,$at
1208	$ADDU	$c_3,$t_2
1209	sltu	$at,$c_3,$t_2
1210	$ADDU	$c_1,$at
1211	mflo	($t_1,$a_2,$b_2)
1212	mfhi	($t_2,$a_2,$b_2)
1213	$ADDU	$c_2,$t_1
1214	sltu	$at,$c_2,$t_1
1215	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
1216	$ADDU	$t_2,$at
1217	$ADDU	$c_3,$t_2
1218	sltu	$at,$c_3,$t_2
1219	$ADDU	$c_1,$at
1220	mflo	($t_1,$a_1,$b_3)
1221	mfhi	($t_2,$a_1,$b_3)
1222	$ADDU	$c_2,$t_1
1223	sltu	$at,$c_2,$t_1
1224	$MULTU	($a_0,$b_4)		# mul_add_c(a[0],b[4],c2,c3,c1);
1225	$ADDU	$t_2,$at
1226	$ADDU	$c_3,$t_2
1227	sltu	$at,$c_3,$t_2
1228	$ADDU	$c_1,$at
1229	mflo	($t_1,$a_0,$b_4)
1230	mfhi	($t_2,$a_0,$b_4)
1231	$ADDU	$c_2,$t_1
1232	sltu	$at,$c_2,$t_1
1233	 $MULTU	($a_0,$b_5)		# mul_add_c(a[0],b[5],c3,c1,c2);
1234	$ADDU	$t_2,$at
1235	$ADDU	$c_3,$t_2
1236	sltu	$at,$c_3,$t_2
1237	$ADDU	$c_1,$at
1238	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
1239
1240	mflo	($t_1,$a_0,$b_5)
1241	mfhi	($t_2,$a_0,$b_5)
1242	$ADDU	$c_3,$t_1
1243	sltu	$at,$c_3,$t_1
1244	$MULTU	($a_1,$b_4)		# mul_add_c(a[1],b[4],c3,c1,c2);
1245	$ADDU	$t_2,$at
1246	$ADDU	$c_1,$t_2
1247	sltu	$c_2,$c_1,$t_2
1248	mflo	($t_1,$a_1,$b_4)
1249	mfhi	($t_2,$a_1,$b_4)
1250	$ADDU	$c_3,$t_1
1251	sltu	$at,$c_3,$t_1
1252	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
1253	$ADDU	$t_2,$at
1254	$ADDU	$c_1,$t_2
1255	sltu	$at,$c_1,$t_2
1256	$ADDU	$c_2,$at
1257	mflo	($t_1,$a_2,$b_3)
1258	mfhi	($t_2,$a_2,$b_3)
1259	$ADDU	$c_3,$t_1
1260	sltu	$at,$c_3,$t_1
1261	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
1262	$ADDU	$t_2,$at
1263	$ADDU	$c_1,$t_2
1264	sltu	$at,$c_1,$t_2
1265	$ADDU	$c_2,$at
1266	mflo	($t_1,$a_3,$b_2)
1267	mfhi	($t_2,$a_3,$b_2)
1268	$ADDU	$c_3,$t_1
1269	sltu	$at,$c_3,$t_1
1270	$MULTU	($a_4,$b_1)		# mul_add_c(a[4],b[1],c3,c1,c2);
1271	$ADDU	$t_2,$at
1272	$ADDU	$c_1,$t_2
1273	sltu	$at,$c_1,$t_2
1274	$ADDU	$c_2,$at
1275	mflo	($t_1,$a_4,$b_1)
1276	mfhi	($t_2,$a_4,$b_1)
1277	$ADDU	$c_3,$t_1
1278	sltu	$at,$c_3,$t_1
1279	$MULTU	($a_5,$b_0)		# mul_add_c(a[5],b[0],c3,c1,c2);
1280	$ADDU	$t_2,$at
1281	$ADDU	$c_1,$t_2
1282	sltu	$at,$c_1,$t_2
1283	$ADDU	$c_2,$at
1284	mflo	($t_1,$a_5,$b_0)
1285	mfhi	($t_2,$a_5,$b_0)
1286	$ADDU	$c_3,$t_1
1287	sltu	$at,$c_3,$t_1
1288	 $MULTU	($a_6,$b_0)		# mul_add_c(a[6],b[0],c1,c2,c3);
1289	$ADDU	$t_2,$at
1290	$ADDU	$c_1,$t_2
1291	sltu	$at,$c_1,$t_2
1292	$ADDU	$c_2,$at
1293	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
1294
1295	mflo	($t_1,$a_6,$b_0)
1296	mfhi	($t_2,$a_6,$b_0)
1297	$ADDU	$c_1,$t_1
1298	sltu	$at,$c_1,$t_1
1299	$MULTU	($a_5,$b_1)		# mul_add_c(a[5],b[1],c1,c2,c3);
1300	$ADDU	$t_2,$at
1301	$ADDU	$c_2,$t_2
1302	sltu	$c_3,$c_2,$t_2
1303	mflo	($t_1,$a_5,$b_1)
1304	mfhi	($t_2,$a_5,$b_1)
1305	$ADDU	$c_1,$t_1
1306	sltu	$at,$c_1,$t_1
1307	$MULTU	($a_4,$b_2)		# mul_add_c(a[4],b[2],c1,c2,c3);
1308	$ADDU	$t_2,$at
1309	$ADDU	$c_2,$t_2
1310	sltu	$at,$c_2,$t_2
1311	$ADDU	$c_3,$at
1312	mflo	($t_1,$a_4,$b_2)
1313	mfhi	($t_2,$a_4,$b_2)
1314	$ADDU	$c_1,$t_1
1315	sltu	$at,$c_1,$t_1
1316	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
1317	$ADDU	$t_2,$at
1318	$ADDU	$c_2,$t_2
1319	sltu	$at,$c_2,$t_2
1320	$ADDU	$c_3,$at
1321	mflo	($t_1,$a_3,$b_3)
1322	mfhi	($t_2,$a_3,$b_3)
1323	$ADDU	$c_1,$t_1
1324	sltu	$at,$c_1,$t_1
1325	$MULTU	($a_2,$b_4)		# mul_add_c(a[2],b[4],c1,c2,c3);
1326	$ADDU	$t_2,$at
1327	$ADDU	$c_2,$t_2
1328	sltu	$at,$c_2,$t_2
1329	$ADDU	$c_3,$at
1330	mflo	($t_1,$a_2,$b_4)
1331	mfhi	($t_2,$a_2,$b_4)
1332	$ADDU	$c_1,$t_1
1333	sltu	$at,$c_1,$t_1
1334	$MULTU	($a_1,$b_5)		# mul_add_c(a[1],b[5],c1,c2,c3);
1335	$ADDU	$t_2,$at
1336	$ADDU	$c_2,$t_2
1337	sltu	$at,$c_2,$t_2
1338	$ADDU	$c_3,$at
1339	mflo	($t_1,$a_1,$b_5)
1340	mfhi	($t_2,$a_1,$b_5)
1341	$ADDU	$c_1,$t_1
1342	sltu	$at,$c_1,$t_1
1343	$MULTU	($a_0,$b_6)		# mul_add_c(a[0],b[6],c1,c2,c3);
1344	$ADDU	$t_2,$at
1345	$ADDU	$c_2,$t_2
1346	sltu	$at,$c_2,$t_2
1347	$ADDU	$c_3,$at
1348	mflo	($t_1,$a_0,$b_6)
1349	mfhi	($t_2,$a_0,$b_6)
1350	$ADDU	$c_1,$t_1
1351	sltu	$at,$c_1,$t_1
1352	 $MULTU	($a_0,$b_7)		# mul_add_c(a[0],b[7],c2,c3,c1);
1353	$ADDU	$t_2,$at
1354	$ADDU	$c_2,$t_2
1355	sltu	$at,$c_2,$t_2
1356	$ADDU	$c_3,$at
1357	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
1358
1359	mflo	($t_1,$a_0,$b_7)
1360	mfhi	($t_2,$a_0,$b_7)
1361	$ADDU	$c_2,$t_1
1362	sltu	$at,$c_2,$t_1
1363	$MULTU	($a_1,$b_6)		# mul_add_c(a[1],b[6],c2,c3,c1);
1364	$ADDU	$t_2,$at
1365	$ADDU	$c_3,$t_2
1366	sltu	$c_1,$c_3,$t_2
1367	mflo	($t_1,$a_1,$b_6)
1368	mfhi	($t_2,$a_1,$b_6)
1369	$ADDU	$c_2,$t_1
1370	sltu	$at,$c_2,$t_1
1371	$MULTU	($a_2,$b_5)		# mul_add_c(a[2],b[5],c2,c3,c1);
1372	$ADDU	$t_2,$at
1373	$ADDU	$c_3,$t_2
1374	sltu	$at,$c_3,$t_2
1375	$ADDU	$c_1,$at
1376	mflo	($t_1,$a_2,$b_5)
1377	mfhi	($t_2,$a_2,$b_5)
1378	$ADDU	$c_2,$t_1
1379	sltu	$at,$c_2,$t_1
1380	$MULTU	($a_3,$b_4)		# mul_add_c(a[3],b[4],c2,c3,c1);
1381	$ADDU	$t_2,$at
1382	$ADDU	$c_3,$t_2
1383	sltu	$at,$c_3,$t_2
1384	$ADDU	$c_1,$at
1385	mflo	($t_1,$a_3,$b_4)
1386	mfhi	($t_2,$a_3,$b_4)
1387	$ADDU	$c_2,$t_1
1388	sltu	$at,$c_2,$t_1
1389	$MULTU	($a_4,$b_3)		# mul_add_c(a[4],b[3],c2,c3,c1);
1390	$ADDU	$t_2,$at
1391	$ADDU	$c_3,$t_2
1392	sltu	$at,$c_3,$t_2
1393	$ADDU	$c_1,$at
1394	mflo	($t_1,$a_4,$b_3)
1395	mfhi	($t_2,$a_4,$b_3)
1396	$ADDU	$c_2,$t_1
1397	sltu	$at,$c_2,$t_1
1398	$MULTU	($a_5,$b_2)		# mul_add_c(a[5],b[2],c2,c3,c1);
1399	$ADDU	$t_2,$at
1400	$ADDU	$c_3,$t_2
1401	sltu	$at,$c_3,$t_2
1402	$ADDU	$c_1,$at
1403	mflo	($t_1,$a_5,$b_2)
1404	mfhi	($t_2,$a_5,$b_2)
1405	$ADDU	$c_2,$t_1
1406	sltu	$at,$c_2,$t_1
1407	$MULTU	($a_6,$b_1)		# mul_add_c(a[6],b[1],c2,c3,c1);
1408	$ADDU	$t_2,$at
1409	$ADDU	$c_3,$t_2
1410	sltu	$at,$c_3,$t_2
1411	$ADDU	$c_1,$at
1412	mflo	($t_1,$a_6,$b_1)
1413	mfhi	($t_2,$a_6,$b_1)
1414	$ADDU	$c_2,$t_1
1415	sltu	$at,$c_2,$t_1
1416	$MULTU	($a_7,$b_0)		# mul_add_c(a[7],b[0],c2,c3,c1);
1417	$ADDU	$t_2,$at
1418	$ADDU	$c_3,$t_2
1419	sltu	$at,$c_3,$t_2
1420	$ADDU	$c_1,$at
1421	mflo	($t_1,$a_7,$b_0)
1422	mfhi	($t_2,$a_7,$b_0)
1423	$ADDU	$c_2,$t_1
1424	sltu	$at,$c_2,$t_1
1425	 $MULTU	($a_7,$b_1)		# mul_add_c(a[7],b[1],c3,c1,c2);
1426	$ADDU	$t_2,$at
1427	$ADDU	$c_3,$t_2
1428	sltu	$at,$c_3,$t_2
1429	$ADDU	$c_1,$at
1430	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
1431
1432	mflo	($t_1,$a_7,$b_1)
1433	mfhi	($t_2,$a_7,$b_1)
1434	$ADDU	$c_3,$t_1
1435	sltu	$at,$c_3,$t_1
1436	$MULTU	($a_6,$b_2)		# mul_add_c(a[6],b[2],c3,c1,c2);
1437	$ADDU	$t_2,$at
1438	$ADDU	$c_1,$t_2
1439	sltu	$c_2,$c_1,$t_2
1440	mflo	($t_1,$a_6,$b_2)
1441	mfhi	($t_2,$a_6,$b_2)
1442	$ADDU	$c_3,$t_1
1443	sltu	$at,$c_3,$t_1
1444	$MULTU	($a_5,$b_3)		# mul_add_c(a[5],b[3],c3,c1,c2);
1445	$ADDU	$t_2,$at
1446	$ADDU	$c_1,$t_2
1447	sltu	$at,$c_1,$t_2
1448	$ADDU	$c_2,$at
1449	mflo	($t_1,$a_5,$b_3)
1450	mfhi	($t_2,$a_5,$b_3)
1451	$ADDU	$c_3,$t_1
1452	sltu	$at,$c_3,$t_1
1453	$MULTU	($a_4,$b_4)		# mul_add_c(a[4],b[4],c3,c1,c2);
1454	$ADDU	$t_2,$at
1455	$ADDU	$c_1,$t_2
1456	sltu	$at,$c_1,$t_2
1457	$ADDU	$c_2,$at
1458	mflo	($t_1,$a_4,$b_4)
1459	mfhi	($t_2,$a_4,$b_4)
1460	$ADDU	$c_3,$t_1
1461	sltu	$at,$c_3,$t_1
1462	$MULTU	($a_3,$b_5)		# mul_add_c(a[3],b[5],c3,c1,c2);
1463	$ADDU	$t_2,$at
1464	$ADDU	$c_1,$t_2
1465	sltu	$at,$c_1,$t_2
1466	$ADDU	$c_2,$at
1467	mflo	($t_1,$a_3,$b_5)
1468	mfhi	($t_2,$a_3,$b_5)
1469	$ADDU	$c_3,$t_1
1470	sltu	$at,$c_3,$t_1
1471	$MULTU	($a_2,$b_6)		# mul_add_c(a[2],b[6],c3,c1,c2);
1472	$ADDU	$t_2,$at
1473	$ADDU	$c_1,$t_2
1474	sltu	$at,$c_1,$t_2
1475	$ADDU	$c_2,$at
1476	mflo	($t_1,$a_2,$b_6)
1477	mfhi	($t_2,$a_2,$b_6)
1478	$ADDU	$c_3,$t_1
1479	sltu	$at,$c_3,$t_1
1480	$MULTU	($a_1,$b_7)		# mul_add_c(a[1],b[7],c3,c1,c2);
1481	$ADDU	$t_2,$at
1482	$ADDU	$c_1,$t_2
1483	sltu	$at,$c_1,$t_2
1484	$ADDU	$c_2,$at
1485	mflo	($t_1,$a_1,$b_7)
1486	mfhi	($t_2,$a_1,$b_7)
1487	$ADDU	$c_3,$t_1
1488	sltu	$at,$c_3,$t_1
1489	 $MULTU	($a_2,$b_7)		# mul_add_c(a[2],b[7],c1,c2,c3);
1490	$ADDU	$t_2,$at
1491	$ADDU	$c_1,$t_2
1492	sltu	$at,$c_1,$t_2
1493	$ADDU	$c_2,$at
1494	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
1495
1496	mflo	($t_1,$a_2,$b_7)
1497	mfhi	($t_2,$a_2,$b_7)
1498	$ADDU	$c_1,$t_1
1499	sltu	$at,$c_1,$t_1
1500	$MULTU	($a_3,$b_6)		# mul_add_c(a[3],b[6],c1,c2,c3);
1501	$ADDU	$t_2,$at
1502	$ADDU	$c_2,$t_2
1503	sltu	$c_3,$c_2,$t_2
1504	mflo	($t_1,$a_3,$b_6)
1505	mfhi	($t_2,$a_3,$b_6)
1506	$ADDU	$c_1,$t_1
1507	sltu	$at,$c_1,$t_1
1508	$MULTU	($a_4,$b_5)		# mul_add_c(a[4],b[5],c1,c2,c3);
1509	$ADDU	$t_2,$at
1510	$ADDU	$c_2,$t_2
1511	sltu	$at,$c_2,$t_2
1512	$ADDU	$c_3,$at
1513	mflo	($t_1,$a_4,$b_5)
1514	mfhi	($t_2,$a_4,$b_5)
1515	$ADDU	$c_1,$t_1
1516	sltu	$at,$c_1,$t_1
1517	$MULTU	($a_5,$b_4)		# mul_add_c(a[5],b[4],c1,c2,c3);
1518	$ADDU	$t_2,$at
1519	$ADDU	$c_2,$t_2
1520	sltu	$at,$c_2,$t_2
1521	$ADDU	$c_3,$at
1522	mflo	($t_1,$a_5,$b_4)
1523	mfhi	($t_2,$a_5,$b_4)
1524	$ADDU	$c_1,$t_1
1525	sltu	$at,$c_1,$t_1
1526	$MULTU	($a_6,$b_3)		# mul_add_c(a[6],b[3],c1,c2,c3);
1527	$ADDU	$t_2,$at
1528	$ADDU	$c_2,$t_2
1529	sltu	$at,$c_2,$t_2
1530	$ADDU	$c_3,$at
1531	mflo	($t_1,$a_6,$b_3)
1532	mfhi	($t_2,$a_6,$b_3)
1533	$ADDU	$c_1,$t_1
1534	sltu	$at,$c_1,$t_1
1535	$MULTU	($a_7,$b_2)		# mul_add_c(a[7],b[2],c1,c2,c3);
1536	$ADDU	$t_2,$at
1537	$ADDU	$c_2,$t_2
1538	sltu	$at,$c_2,$t_2
1539	$ADDU	$c_3,$at
1540	mflo	($t_1,$a_7,$b_2)
1541	mfhi	($t_2,$a_7,$b_2)
1542	$ADDU	$c_1,$t_1
1543	sltu	$at,$c_1,$t_1
1544	 $MULTU	($a_7,$b_3)		# mul_add_c(a[7],b[3],c2,c3,c1);
1545	$ADDU	$t_2,$at
1546	$ADDU	$c_2,$t_2
1547	sltu	$at,$c_2,$t_2
1548	$ADDU	$c_3,$at
1549	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
1550
1551	mflo	($t_1,$a_7,$b_3)
1552	mfhi	($t_2,$a_7,$b_3)
1553	$ADDU	$c_2,$t_1
1554	sltu	$at,$c_2,$t_1
1555	$MULTU	($a_6,$b_4)		# mul_add_c(a[6],b[4],c2,c3,c1);
1556	$ADDU	$t_2,$at
1557	$ADDU	$c_3,$t_2
1558	sltu	$c_1,$c_3,$t_2
1559	mflo	($t_1,$a_6,$b_4)
1560	mfhi	($t_2,$a_6,$b_4)
1561	$ADDU	$c_2,$t_1
1562	sltu	$at,$c_2,$t_1
1563	$MULTU	($a_5,$b_5)		# mul_add_c(a[5],b[5],c2,c3,c1);
1564	$ADDU	$t_2,$at
1565	$ADDU	$c_3,$t_2
1566	sltu	$at,$c_3,$t_2
1567	$ADDU	$c_1,$at
1568	mflo	($t_1,$a_5,$b_5)
1569	mfhi	($t_2,$a_5,$b_5)
1570	$ADDU	$c_2,$t_1
1571	sltu	$at,$c_2,$t_1
1572	$MULTU	($a_4,$b_6)		# mul_add_c(a[4],b[6],c2,c3,c1);
1573	$ADDU	$t_2,$at
1574	$ADDU	$c_3,$t_2
1575	sltu	$at,$c_3,$t_2
1576	$ADDU	$c_1,$at
1577	mflo	($t_1,$a_4,$b_6)
1578	mfhi	($t_2,$a_4,$b_6)
1579	$ADDU	$c_2,$t_1
1580	sltu	$at,$c_2,$t_1
1581	$MULTU	($a_3,$b_7)		# mul_add_c(a[3],b[7],c2,c3,c1);
1582	$ADDU	$t_2,$at
1583	$ADDU	$c_3,$t_2
1584	sltu	$at,$c_3,$t_2
1585	$ADDU	$c_1,$at
1586	mflo	($t_1,$a_3,$b_7)
1587	mfhi	($t_2,$a_3,$b_7)
1588	$ADDU	$c_2,$t_1
1589	sltu	$at,$c_2,$t_1
1590	$MULTU	($a_4,$b_7)		# mul_add_c(a[4],b[7],c3,c1,c2);
1591	$ADDU	$t_2,$at
1592	$ADDU	$c_3,$t_2
1593	sltu	$at,$c_3,$t_2
1594	$ADDU	$c_1,$at
1595	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
1596
1597	mflo	($t_1,$a_4,$b_7)
1598	mfhi	($t_2,$a_4,$b_7)
1599	$ADDU	$c_3,$t_1
1600	sltu	$at,$c_3,$t_1
1601	$MULTU	($a_5,$b_6)		# mul_add_c(a[5],b[6],c3,c1,c2);
1602	$ADDU	$t_2,$at
1603	$ADDU	$c_1,$t_2
1604	sltu	$c_2,$c_1,$t_2
1605	mflo	($t_1,$a_5,$b_6)
1606	mfhi	($t_2,$a_5,$b_6)
1607	$ADDU	$c_3,$t_1
1608	sltu	$at,$c_3,$t_1
1609	$MULTU	($a_6,$b_5)		# mul_add_c(a[6],b[5],c3,c1,c2);
1610	$ADDU	$t_2,$at
1611	$ADDU	$c_1,$t_2
1612	sltu	$at,$c_1,$t_2
1613	$ADDU	$c_2,$at
1614	mflo	($t_1,$a_6,$b_5)
1615	mfhi	($t_2,$a_6,$b_5)
1616	$ADDU	$c_3,$t_1
1617	sltu	$at,$c_3,$t_1
1618	$MULTU	($a_7,$b_4)		# mul_add_c(a[7],b[4],c3,c1,c2);
1619	$ADDU	$t_2,$at
1620	$ADDU	$c_1,$t_2
1621	sltu	$at,$c_1,$t_2
1622	$ADDU	$c_2,$at
1623	mflo	($t_1,$a_7,$b_4)
1624	mfhi	($t_2,$a_7,$b_4)
1625	$ADDU	$c_3,$t_1
1626	sltu	$at,$c_3,$t_1
1627	 $MULTU	($a_7,$b_5)		# mul_add_c(a[7],b[5],c1,c2,c3);
1628	$ADDU	$t_2,$at
1629	$ADDU	$c_1,$t_2
1630	sltu	$at,$c_1,$t_2
1631	$ADDU	$c_2,$at
1632	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
1633
1634	mflo	($t_1,$a_7,$b_5)
1635	mfhi	($t_2,$a_7,$b_5)
1636	$ADDU	$c_1,$t_1
1637	sltu	$at,$c_1,$t_1
1638	$MULTU	($a_6,$b_6)		# mul_add_c(a[6],b[6],c1,c2,c3);
1639	$ADDU	$t_2,$at
1640	$ADDU	$c_2,$t_2
1641	sltu	$c_3,$c_2,$t_2
1642	mflo	($t_1,$a_6,$b_6)
1643	mfhi	($t_2,$a_6,$b_6)
1644	$ADDU	$c_1,$t_1
1645	sltu	$at,$c_1,$t_1
1646	$MULTU	($a_5,$b_7)		# mul_add_c(a[5],b[7],c1,c2,c3);
1647	$ADDU	$t_2,$at
1648	$ADDU	$c_2,$t_2
1649	sltu	$at,$c_2,$t_2
1650	$ADDU	$c_3,$at
1651	mflo	($t_1,$a_5,$b_7)
1652	mfhi	($t_2,$a_5,$b_7)
1653	$ADDU	$c_1,$t_1
1654	sltu	$at,$c_1,$t_1
1655	 $MULTU	($a_6,$b_7)		# mul_add_c(a[6],b[7],c2,c3,c1);
1656	$ADDU	$t_2,$at
1657	$ADDU	$c_2,$t_2
1658	sltu	$at,$c_2,$t_2
1659	$ADDU	$c_3,$at
1660	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
1661
1662	mflo	($t_1,$a_6,$b_7)
1663	mfhi	($t_2,$a_6,$b_7)
1664	$ADDU	$c_2,$t_1
1665	sltu	$at,$c_2,$t_1
1666	$MULTU	($a_7,$b_6)		# mul_add_c(a[7],b[6],c2,c3,c1);
1667	$ADDU	$t_2,$at
1668	$ADDU	$c_3,$t_2
1669	sltu	$c_1,$c_3,$t_2
1670	mflo	($t_1,$a_7,$b_6)
1671	mfhi	($t_2,$a_7,$b_6)
1672	$ADDU	$c_2,$t_1
1673	sltu	$at,$c_2,$t_1
1674	$MULTU	($a_7,$b_7)		# mul_add_c(a[7],b[7],c3,c1,c2);
1675	$ADDU	$t_2,$at
1676	$ADDU	$c_3,$t_2
1677	sltu	$at,$c_3,$t_2
1678	$ADDU	$c_1,$at
1679	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
1680
1681	mflo	($t_1,$a_7,$b_7)
1682	mfhi	($t_2,$a_7,$b_7)
1683	$ADDU	$c_3,$t_1
1684	sltu	$at,$c_3,$t_1
1685	$ADDU	$t_2,$at
1686	$ADDU	$c_1,$t_2
1687	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
1688	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
1689
1690	.set	noreorder
1691___
# bn_mul_comba8 epilogue (nubi flavour): the nubi ABI additionally saved
# $t0-$t3 and $gp, so eleven registers come back off the stack; the
# frame is released in the branch delay slot of "jr".
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
# bn_mul_comba8 epilogue (default flavours): only $s0-$s5 were saved.
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# bn_mul_comba4: r[0..7] = a[0..3] * b[0..3], fully-unrolled comba
# (column-wise) multiplication with the same mul_add_c() carry pattern
# as bn_mul_comba8 above.
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
# nubi flavour must preserve $t0-$t3 and $gp on top of the return
# address, hence the explicit 6-register frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Body: each $MULTU is issued as early as possible (the space-indented
# ones one step ahead of schedule) so the multiplier runs while the
# preceding column's carries are still being folded in.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# nubi epilogue: restore the extra caller-owned registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___
1903
# Squaring needs all eight a[] words but no separate b[] operand, so
# alias the upper half of a[] onto the registers that held b[] in the
# multiplication routines above.
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# Emit one mul_add_c2() step of the comba squaring: fold the previous
# product doubled, 2*($hi:$lo), into the running carry chain
# ($c0,$c1,$c2) -- the doubling is done by adding $lo (and $hi) twice
# rather than shifting -- while the "forward" multiplication $an*$bn
# for the *next* step runs in the multiplier.  On exit the emitted code
# leaves that next product in $lo/$hi via mflo/mfhi.
#
# No prototype here: the sub takes eight positional arguments and is
# always invoked as &add_c2(...); the former empty () prototype was
# inaccurate (prototypes don't validate arguments) and would have made
# any prototype-checked call a compile-time error.
sub add_c2 {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes first call with specific sequence of
                # $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are arguments for multiplication which
                # result is used in *next* step [which is why it's
                # commented as "forward multiplication" below];
    )=@_;
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___	if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___	if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}
1940
# bn_sqr_comba8: r[0..15] = a[0..7]^2.  Squaring halves the number of
# multiplications: cross products a[i]*a[j] (i!=j) are computed once
# and doubled (the mul_add_c2 steps, mostly emitted by &add_c2), while
# squares a[i]*a[i] are added in once (plain mul_add_c steps).
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
# nubi flavour prologue: save $ra, $t0-$t3 and $gp per the nubi ABI.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Leading columns are emitted inline; 2*a[0]*a[1] is formed by shifting
# each product half left by one, with "slt reg,x,$zero" capturing the
# top bit shifted out of the low half (and recording overflow of the
# high half).
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_5)
	mfhi	($t_2,$a_0,$a_5)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_7)
	mfhi	($t_2,$a_0,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_7)
	mfhi	($t_2,$a_2,$a_7)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
	mflo	($t_1,$a_4,$a_7)
	mfhi	($t_2,$a_4,$a_7)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
	mflo	($t_1,$a_6,$a_7)
	mfhi	($t_2,$a_6,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# nubi epilogue: restore $t0-$t3 and $gp before returning.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# bn_sqr_comba4: r[0..7] = a[0..3]^2, same scheme as bn_sqr_comba8.
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
# nubi flavour prologue for bn_sqr_comba4.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Leading columns inline, as in bn_sqr_comba8 above; subsequent doubled
# cross products are emitted via &add_c2.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_3)
	mfhi	($t_2,$a_2,$a_3)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# nubi epilogue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
# Emit the accumulated assembly.  Checking close() catches buffered
# write errors that an unchecked print() would silently drop.
print $code;
close STDOUT or die "error closing STDOUT: $!";
2270