/* xref: /openssl/crypto/bn/asm/sparcv8.S (revision 1287dabd) */
.ident	"sparcv8.s, Version 1.4"
.ident	"SPARC v8 ISA artwork by Andy Polyakov <appro@openssl.org>"

/*
 * ====================================================================
 * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 * ====================================================================
 */

/*
 * This is my modest contribution to OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * See bn_asm.sparc.v8plus.S for more details.
 */

/*
 * Revision history.
 *
 * 1.1	- new loop unrolling model(*);
 * 1.2	- made gas friendly;
 * 1.3	- fixed problem with /usr/ccs/lib/cpp;
 * 1.4	- some retunes;
 *
 * (*)	see bn_asm.sparc.v8plus.S for details
 */

.section	".text",#alloc,#execinstr
.file		"bn_asm.sparc.v8.S"

.align	32

.global bn_mul_add_words
/*
 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * rp[i] += ap[i]*w for 0<=i<num; returns the final carry word.
 * In:  %o0=rp, %o1=ap, %o2=num, %o3=w.  Out: %o0=carry.
 * Clobbers %o4,%o5,%g1-%g3,%y and the integer condition codes.
 * Main loop is unrolled x4; %g2 always holds the next ap[] word,
 * preloaded in an annulled branch delay slot.
 */
bn_mul_add_words:
	cmp	%o2,0
	bg,a	.L_bn_mul_add_words_proceed
	ld	[%o1],%g2		! annulled slot: prefetch ap[0]
	retl				! num<=0: return 0
	clr	%o0

.L_bn_mul_add_words_proceed:
	andcc	%o2,-4,%g0		! at least 4 words to do?
	bz	.L_bn_mul_add_words_tail
	clr	%o5			! %o5 = running carry word

.L_bn_mul_add_words_loop:
	ld	[%o0],%o4
	ld	[%o1+4],%g3
	umul	%o3,%g2,%g2		! 64-bit product in %y:%g2
	rd	%y,%g1
	addcc	%o4,%o5,%o4		! rp[0] + carry
	addx	%g1,0,%g1
	addcc	%o4,%g2,%o4		! ... + low(product)
	st	%o4,[%o0]
	addx	%g1,0,%o5		! carry = high(product) + add carries

	ld	[%o0+4],%o4
	ld	[%o1+8],%g2
	umul	%o3,%g3,%g3
	dec	4,%o2
	rd	%y,%g1
	addcc	%o4,%o5,%o4
	addx	%g1,0,%g1
	addcc	%o4,%g3,%o4
	st	%o4,[%o0+4]
	addx	%g1,0,%o5

	ld	[%o0+8],%o4
	ld	[%o1+12],%g3
	umul	%o3,%g2,%g2
	inc	16,%o1
	rd	%y,%g1
	addcc	%o4,%o5,%o4
	addx	%g1,0,%g1
	addcc	%o4,%g2,%o4
	st	%o4,[%o0+8]
	addx	%g1,0,%o5

	ld	[%o0+12],%o4
	umul	%o3,%g3,%g3
	inc	16,%o0
	rd	%y,%g1
	addcc	%o4,%o5,%o4
	addx	%g1,0,%g1
	addcc	%o4,%g3,%o4
	st	%o4,[%o0-4]
	addx	%g1,0,%o5
	andcc	%o2,-4,%g0		! 4 or more words still to do?
	bnz,a	.L_bn_mul_add_words_loop
	ld	[%o1],%g2		! annulled slot: reload next ap[0]

	tst	%o2			! 0-3 words left
	bnz,a	.L_bn_mul_add_words_tail
	ld	[%o1],%g2
.L_bn_mul_add_words_return:
	retl
	mov	%o5,%o0			! return carry
	nop

.L_bn_mul_add_words_tail:		! handle remaining 1-3 words
	ld	[%o0],%o4
	umul	%o3,%g2,%g2
	addcc	%o4,%o5,%o4
	rd	%y,%g1
	addx	%g1,0,%g1
	addcc	%o4,%g2,%o4
	addx	%g1,0,%o5
	deccc	%o2
	bz	.L_bn_mul_add_words_return
	st	%o4,[%o0]

	ld	[%o1+4],%g2
	ld	[%o0+4],%o4
	umul	%o3,%g2,%g2
	rd	%y,%g1
	addcc	%o4,%o5,%o4
	addx	%g1,0,%g1
	addcc	%o4,%g2,%o4
	addx	%g1,0,%o5
	deccc	%o2
	bz	.L_bn_mul_add_words_return
	st	%o4,[%o0+4]

	ld	[%o1+8],%g2
	ld	[%o0+8],%o4
	umul	%o3,%g2,%g2
	rd	%y,%g1
	addcc	%o4,%o5,%o4
	addx	%g1,0,%g1
	addcc	%o4,%g2,%o4
	st	%o4,[%o0+8]
	retl
	addx	%g1,0,%o0		! delay slot: return carry

.type	bn_mul_add_words,#function
.size	bn_mul_add_words,(.-bn_mul_add_words)
150
.align	32

.global bn_mul_words
/*
 * BN_ULONG bn_mul_words(rp,ap,num,w)
 * BN_ULONG *rp,*ap;
 * int num;
 * BN_ULONG w;
 *
 * rp[i] = ap[i]*w for 0<=i<num; returns the final carry word.
 * In:  %o0=rp, %o1=ap, %o2=num, %o3=w.  Out: %o0=carry.
 * Clobbers %o5,%g1-%g3,%y and the integer condition codes.
 * Main loop is unrolled x4; next ap[] word is preloaded in an
 * annulled branch delay slot.
 */
bn_mul_words:
	cmp	%o2,0
	bg,a	.L_bn_mul_words_proceed
	ld	[%o1],%g2		! annulled slot: prefetch ap[0]
	retl				! num<=0: return 0
	clr	%o0

.L_bn_mul_words_proceed:
	andcc	%o2,-4,%g0		! at least 4 words to do?
	bz	.L_bn_mul_words_tail
	clr	%o5			! %o5 = running carry word

.L_bn_mul_words_loop:
	ld	[%o1+4],%g3
	umul	%o3,%g2,%g2		! 64-bit product in %y:%g2
	addcc	%g2,%o5,%g2		! low(product) + carry
	rd	%y,%g1
	addx	%g1,0,%o5		! carry = high(product) + add carry
	st	%g2,[%o0]

	ld	[%o1+8],%g2
	umul	%o3,%g3,%g3
	addcc	%g3,%o5,%g3
	rd	%y,%g1
	dec	4,%o2
	addx	%g1,0,%o5
	st	%g3,[%o0+4]

	ld	[%o1+12],%g3
	umul	%o3,%g2,%g2
	addcc	%g2,%o5,%g2
	rd	%y,%g1
	inc	16,%o1
	st	%g2,[%o0+8]
	addx	%g1,0,%o5

	umul	%o3,%g3,%g3
	addcc	%g3,%o5,%g3
	rd	%y,%g1
	inc	16,%o0
	addx	%g1,0,%o5
	st	%g3,[%o0-4]
	andcc	%o2,-4,%g0		! 4 or more words still to do?
	nop
	bnz,a	.L_bn_mul_words_loop
	ld	[%o1],%g2		! annulled slot: reload next ap[0]

	tst	%o2			! 0-3 words left
	bnz,a	.L_bn_mul_words_tail
	ld	[%o1],%g2
.L_bn_mul_words_return:
	retl
	mov	%o5,%o0			! return carry
	nop

.L_bn_mul_words_tail:			! handle remaining 1-3 words
	umul	%o3,%g2,%g2
	addcc	%g2,%o5,%g2
	rd	%y,%g1
	addx	%g1,0,%o5
	deccc	%o2
	bz	.L_bn_mul_words_return
	st	%g2,[%o0]
	nop

	ld	[%o1+4],%g2
	umul	%o3,%g2,%g2
	addcc	%g2,%o5,%g2
	rd	%y,%g1
	addx	%g1,0,%o5
	deccc	%o2
	bz	.L_bn_mul_words_return
	st	%g2,[%o0+4]

	ld	[%o1+8],%g2
	umul	%o3,%g2,%g2
	addcc	%g2,%o5,%g2
	rd	%y,%g1
	st	%g2,[%o0+8]
	retl
	addx	%g1,0,%o0		! delay slot: return carry

.type	bn_mul_words,#function
.size	bn_mul_words,(.-bn_mul_words)
244
.align  32
.global	bn_sqr_words
/*
 * void bn_sqr_words(r,a,n)
 * BN_ULONG *r,*a;
 * int n;
 *
 * r[2*i] = low(a[i]^2), r[2*i+1] = high(a[i]^2) for 0<=i<n.
 * In:  %o0=r, %o1=a, %o2=n.  Returns 0 in %o0.
 * Clobbers %o4,%o5,%g2,%g3,%y and the integer condition codes.
 * Main loop is unrolled x4; next a[] word is preloaded in an
 * annulled branch delay slot.
 */
bn_sqr_words:
	cmp	%o2,0
	bg,a	.L_bn_sqr_words_proceed
	ld	[%o1],%g2		! annulled slot: prefetch a[0]
	retl				! n<=0: nothing to do
	clr	%o0

.L_bn_sqr_words_proceed:
	andcc	%o2,-4,%g0		! at least 4 words to do?
	bz	.L_bn_sqr_words_tail
	clr	%o5

.L_bn_sqr_words_loop:
	ld	[%o1+4],%g3
	umul	%g2,%g2,%o4		! square in %y:%o4
	st	%o4,[%o0]
	rd	%y,%o5
	st	%o5,[%o0+4]

	ld	[%o1+8],%g2
	umul	%g3,%g3,%o4
	dec	4,%o2
	st	%o4,[%o0+8]
	rd	%y,%o5
	st	%o5,[%o0+12]
	nop

	ld	[%o1+12],%g3
	umul	%g2,%g2,%o4
	st	%o4,[%o0+16]
	rd	%y,%o5
	inc	16,%o1
	st	%o5,[%o0+20]

	umul	%g3,%g3,%o4
	inc	32,%o0			! r advances two words per a word
	st	%o4,[%o0-8]
	rd	%y,%o5
	st	%o5,[%o0-4]
	andcc	%o2,-4,%g2		! %g2 is reloaded in the annulled
					! delay slots below before any use
	bnz,a	.L_bn_sqr_words_loop
	ld	[%o1],%g2		! annulled slot: reload next a[0]

	tst	%o2			! 0-3 words left
	nop
	bnz,a	.L_bn_sqr_words_tail
	ld	[%o1],%g2
.L_bn_sqr_words_return:
	retl
	clr	%o0

.L_bn_sqr_words_tail:			! handle remaining 1-3 words
	umul	%g2,%g2,%o4
	st	%o4,[%o0]
	deccc	%o2
	rd	%y,%o5
	bz	.L_bn_sqr_words_return
	st	%o5,[%o0+4]

	ld	[%o1+4],%g2
	umul	%g2,%g2,%o4
	st	%o4,[%o0+8]
	deccc	%o2
	rd	%y,%o5
	nop
	bz	.L_bn_sqr_words_return
	st	%o5,[%o0+12]

	ld	[%o1+8],%g2
	umul	%g2,%g2,%o4
	st	%o4,[%o0+16]
	rd	%y,%o5
	st	%o5,[%o0+20]
	retl
	clr	%o0

.type	bn_sqr_words,#function
.size	bn_sqr_words,(.-bn_sqr_words)
330
.align	32

.global bn_div_words
/*
 * BN_ULONG bn_div_words(h,l,d)
 * BN_ULONG h,l,d;
 *
 * Returns the 32-bit quotient of the 64-bit value h:l divided by d.
 * In:  %o0=h, %o1=l, %o2=d.  Out: %o0=quotient.
 * Clobbers %y.
 */
bn_div_words:
	wr	%o0,%y		! %y:%o1 forms the 64-bit dividend
	udiv	%o1,%o2,%o0	! %o0 = (h:l)/d
	retl
	nop

.type	bn_div_words,#function
.size	bn_div_words,(.-bn_div_words)
346
.align	32

.global bn_add_words
/*
 * BN_ULONG bn_add_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * rp[i] = ap[i] + bp[i] with carry propagation; returns final carry.
 * In:  %o0=rp, %o1=ap, %o2=bp, %o3=n.  Out: %o0=carry (0 or 1).
 * Clobbers %o4,%o5,%g1,%g3,%g4 and the integer condition codes.
 * The carry flag cannot survive the loop-control andcc, so the carry
 * is parked in %g1 (addx %g0,0,%g1) and re-armed with addcc %g1,-1,%g0.
 */
bn_add_words:
	cmp	%o3,0
	bg,a	.L_bn_add_words_proceed
	ld	[%o1],%o4		! annulled slot: prefetch ap[0]
	retl				! n<=0: return 0
	clr	%o0

.L_bn_add_words_proceed:
	andcc	%o3,-4,%g0		! at least 4 words to do?
	bz	.L_bn_add_words_tail
	clr	%g1			! %g1 = parked carry
	ba	.L_bn_add_words_warm_loop	! (label renamed from "warn_loop"
						! for consistency with bn_sub_words)
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_add_words_loop:
	ld	[%o1],%o4
.L_bn_add_words_warm_loop:
	ld	[%o2],%o5
	ld	[%o1+4],%g3
	ld	[%o2+4],%g4
	dec	4,%o3
	addxcc	%o5,%o4,%o5
	st	%o5,[%o0]

	ld	[%o1+8],%o4
	ld	[%o2+8],%o5
	inc	16,%o1
	addxcc	%g3,%g4,%g3
	st	%g3,[%o0+4]

	ld	[%o1-4],%g3
	ld	[%o2+12],%g4
	inc	16,%o2
	addxcc	%o5,%o4,%o5
	st	%o5,[%o0+8]

	inc	16,%o0
	addxcc	%g3,%g4,%g3
	st	%g3,[%o0-4]
	addx	%g0,0,%g1		! park carry in %g1
	andcc	%o3,-4,%g0		! (clobbers the carry flag)
	bnz,a	.L_bn_add_words_loop
	addcc	%g1,-1,%g0		! annulled slot: re-arm carry flag

	tst	%o3			! 0-3 words left
	bnz,a	.L_bn_add_words_tail
	ld	[%o1],%o4
.L_bn_add_words_return:
	retl
	mov	%g1,%o0			! return parked carry

.L_bn_add_words_tail:			! handle remaining 1-3 words
	addcc	%g1,-1,%g0		! re-arm carry flag
	ld	[%o2],%o5
	addxcc	%o5,%o4,%o5
	addx	%g0,0,%g1
	deccc	%o3
	bz	.L_bn_add_words_return
	st	%o5,[%o0]

	ld	[%o1+4],%o4
	addcc	%g1,-1,%g0
	ld	[%o2+4],%o5
	addxcc	%o5,%o4,%o5
	addx	%g0,0,%g1
	deccc	%o3
	bz	.L_bn_add_words_return
	st	%o5,[%o0+4]

	ld	[%o1+8],%o4
	addcc	%g1,-1,%g0
	ld	[%o2+8],%o5
	addxcc	%o5,%o4,%o5
	st	%o5,[%o0+8]
	retl
	addx	%g0,0,%o0		! delay slot: return carry

.type	bn_add_words,#function
.size	bn_add_words,(.-bn_add_words)
434
.align	32

.global bn_sub_words
/*
 * BN_ULONG bn_sub_words(rp,ap,bp,n)
 * BN_ULONG *rp,*ap,*bp;
 * int n;
 *
 * rp[i] = ap[i] - bp[i] with borrow propagation; returns final borrow.
 * In:  %o0=rp, %o1=ap, %o2=bp, %o3=n.  Out: %o0=borrow (0 or 1).
 * Clobbers %o4,%o5,%g1,%g3,%g4 and the integer condition codes.
 * Same carry-parking scheme as bn_add_words: the borrow is saved in
 * %g1 across the loop-control andcc and re-armed with addcc %g1,-1,%g0.
 */
bn_sub_words:
	cmp	%o3,0
	bg,a	.L_bn_sub_words_proceed
	ld	[%o1],%o4		! annulled slot: prefetch ap[0]
	retl				! n<=0: return 0
	clr	%o0

.L_bn_sub_words_proceed:
	andcc	%o3,-4,%g0		! at least 4 words to do?
	bz	.L_bn_sub_words_tail
	clr	%g1			! %g1 = parked borrow
	ba	.L_bn_sub_words_warm_loop
	addcc	%g0,0,%g0	! clear carry flag

.L_bn_sub_words_loop:
	ld	[%o1],%o4
.L_bn_sub_words_warm_loop:
	ld	[%o2],%o5
	ld	[%o1+4],%g3
	ld	[%o2+4],%g4
	dec	4,%o3
	subxcc	%o4,%o5,%o5
	st	%o5,[%o0]

	ld	[%o1+8],%o4
	ld	[%o2+8],%o5
	inc	16,%o1
	subxcc	%g3,%g4,%g4
	st	%g4,[%o0+4]

	ld	[%o1-4],%g3
	ld	[%o2+12],%g4
	inc	16,%o2
	subxcc	%o4,%o5,%o5
	st	%o5,[%o0+8]

	inc	16,%o0
	subxcc	%g3,%g4,%g4
	st	%g4,[%o0-4]
	addx	%g0,0,%g1		! park borrow in %g1
	andcc	%o3,-4,%g0		! (clobbers the carry flag)
	bnz,a	.L_bn_sub_words_loop
	addcc	%g1,-1,%g0		! annulled slot: re-arm borrow flag

	tst	%o3			! 0-3 words left
	nop
	bnz,a	.L_bn_sub_words_tail
	ld	[%o1],%o4
.L_bn_sub_words_return:
	retl
	mov	%g1,%o0			! return parked borrow

.L_bn_sub_words_tail:			! handle remaining 1-3 words
	addcc	%g1,-1,%g0		! re-arm borrow flag
	ld	[%o2],%o5
	subxcc	%o4,%o5,%o5
	addx	%g0,0,%g1
	deccc	%o3
	bz	.L_bn_sub_words_return
	st	%o5,[%o0]
	nop

	ld	[%o1+4],%o4
	addcc	%g1,-1,%g0
	ld	[%o2+4],%o5
	subxcc	%o4,%o5,%o5
	addx	%g0,0,%g1
	deccc	%o3
	bz	.L_bn_sub_words_return
	st	%o5,[%o0+4]

	ld	[%o1+8],%o4
	addcc	%g1,-1,%g0
	ld	[%o2+8],%o5
	subxcc	%o4,%o5,%o5
	st	%o5,[%o0+8]
	retl
	addx	%g0,0,%o0		! delay slot: return borrow

.type	bn_sub_words,#function
.size	bn_sub_words,(.-bn_sub_words)
524
#define FRAME_SIZE	-96	/* stack frame for `save' in comba routines */

/*
 * Here is register usage map for *all* routines below.
 */
#define t_1	%o0	/* scratch: low word of current product */
#define	t_2	%o1	/* scratch: high word of current product */
#define c_1	%o2	/* rotating column accumulators c_1/c_2/c_3 */
#define c_2	%o3
#define c_3	%o4

#define ap(I)	[%i1+4*I]	/* I-th word of a[] */
#define bp(I)	[%i2+4*I]	/* I-th word of b[] */
#define rp(I)	[%i0+4*I]	/* I-th word of r[] */

/* cached a[] words */
#define	a_0	%l0
#define	a_1	%l1
#define	a_2	%l2
#define	a_3	%l3
#define	a_4	%l4
#define	a_5	%l5
#define	a_6	%l6
#define	a_7	%l7

/* cached b[] words */
#define	b_0	%i3
#define	b_1	%i4
#define	b_2	%i5
#define	b_3	%o5
#define	b_4	%g1
#define	b_5	%g2
#define	b_6	%g3
#define	b_7	%g4
557
.align	32
.global bn_mul_comba8
/*
 * void bn_mul_comba8(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Comba 8x8 multiplication: r[0..15] = a[0..7] * b[0..7].
 * Uses a register window (save/restore); a[]/b[] words are cached in
 * the registers named by the a_*/b_* macros, and products are summed
 * column by column into the rotating accumulators c_1/c_2/c_3.
 * The "!=" marks are the original author's instruction-grouping
 * annotations and are kept verbatim.
 */
bn_mul_comba8:
	save	%sp,FRAME_SIZE,%sp
	ld	ap(0),a_0
	ld	bp(0),b_0
	umul	a_0,b_0,c_1	!=!mul_add_c(a[0],b[0],c1,c2,c3);
	ld	bp(1),b_1
	rd	%y,c_2
	st	c_1,rp(0)	!r[0]=c1;

	umul	a_0,b_1,t_1	!=!mul_add_c(a[0],b[1],c2,c3,c1);
	ld	ap(1),a_1
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	%g0,t_2,c_3	!=
	addx	%g0,%g0,c_1
	ld	ap(2),a_2
	umul	a_1,b_0,t_1	!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	st	c_2,rp(1)	!r[1]=c2;
	addx	c_1,%g0,c_1	!=

	umul	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	%g0,%g0,c_2
	ld	bp(2),b_2
	umul	a_1,b_1,t_1	!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	ld	bp(3),b_3
	addx	c_2,%g0,c_2	!=
	umul	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	st	c_3,rp(2)	!r[2]=c3;

	umul	a_0,b_3,t_1	!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	%g0,%g0,c_3
	umul	a_1,b_2,t_1	!=!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	ld	ap(3),a_3
	umul	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2		!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	ld	ap(4),a_4
	umul	a_3,b_0,t_1	!mul_add_c(a[3],b[0],c1,c2,c3);!=
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(3)	!r[3]=c1;

	umul	a_4,b_0,t_1	!mul_add_c(a[4],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	umul	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	umul	a_2,b_2,t_1	!=!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	ld	bp(4),b_4
	umul	a_1,b_3,t_1	!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	ld	bp(5),b_5
	umul	a_0,b_4,t_1	!=!mul_add_c(a[0],b[4],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	st	c_2,rp(4)	!r[4]=c2;

	umul	a_0,b_5,t_1	!mul_add_c(a[0],b[5],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	umul	a_1,b_4,t_1	!mul_add_c(a[1],b[4],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_2,b_3,t_1	!=!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	umul	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	ld	ap(5),a_5
	umul	a_4,b_1,t_1	!mul_add_c(a[4],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	ld	ap(6),a_6
	addx	c_2,%g0,c_2	!=
	umul	a_5,b_0,t_1	!mul_add_c(a[5],b[0],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	st	c_3,rp(5)	!r[5]=c3;

	umul	a_6,b_0,t_1	!mul_add_c(a[6],b[0],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	%g0,%g0,c_3
	umul	a_5,b_1,t_1	!=!mul_add_c(a[5],b[1],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_4,b_2,t_1	!mul_add_c(a[4],b[2],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	umul	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2		!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_2,b_4,t_1	!mul_add_c(a[2],b[4],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	ld	bp(6),b_6
	addx	c_3,%g0,c_3	!=
	umul	a_1,b_5,t_1	!mul_add_c(a[1],b[5],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	ld	bp(7),b_7
	umul	a_0,b_6,t_1	!mul_add_c(a[0],b[6],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	st	c_1,rp(6)	!r[6]=c1;
	addx	c_3,%g0,c_3	!=

	umul	a_0,b_7,t_1	!mul_add_c(a[0],b[7],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3	!=
	addx	%g0,%g0,c_1
	umul	a_1,b_6,t_1	!mul_add_c(a[1],b[6],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	umul	a_2,b_5,t_1	!mul_add_c(a[2],b[5],c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	umul	a_3,b_4,t_1	!=!mul_add_c(a[3],b[4],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	umul	a_4,b_3,t_1	!mul_add_c(a[4],b[3],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_5,b_2,t_1	!mul_add_c(a[5],b[2],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	ld	ap(7),a_7
	umul	a_6,b_1,t_1	!=!mul_add_c(a[6],b[1],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	umul	a_7,b_0,t_1	!mul_add_c(a[7],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	st	c_2,rp(7)	!r[7]=c2;

	umul	a_7,b_1,t_1	!mul_add_c(a[7],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	umul	a_6,b_2,t_1	!=!mul_add_c(a[6],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	umul	a_5,b_3,t_1	!mul_add_c(a[5],b[3],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	umul	a_4,b_4,t_1	!mul_add_c(a[4],b[4],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_3,b_5,t_1	!mul_add_c(a[3],b[5],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_2,b_6,t_1	!=!mul_add_c(a[2],b[6],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	umul	a_1,b_7,t_1	!mul_add_c(a[1],b[7],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!
	addx	c_2,%g0,c_2
	st	c_3,rp(8)	!r[8]=c3;

	umul	a_2,b_7,t_1	!mul_add_c(a[2],b[7],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	%g0,%g0,c_3
	umul	a_3,b_6,t_1	!=!mul_add_c(a[3],b[6],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_4,b_5,t_1	!mul_add_c(a[4],b[5],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	umul	a_5,b_4,t_1	!mul_add_c(a[5],b[4],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2		!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_6,b_3,t_1	!mul_add_c(a[6],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_7,b_2,t_1	!=!mul_add_c(a[7],b[2],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(9)	!r[9]=c1;

	umul	a_7,b_3,t_1	!mul_add_c(a[7],b[3],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	umul	a_6,b_4,t_1	!mul_add_c(a[6],b[4],c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	umul	a_5,b_5,t_1	!=!mul_add_c(a[5],b[5],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	umul	a_4,b_6,t_1	!mul_add_c(a[4],b[6],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_3,b_7,t_1	!mul_add_c(a[3],b[7],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	st	c_2,rp(10)	!r[10]=c2;

	umul	a_4,b_7,t_1	!=!mul_add_c(a[4],b[7],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2	!=
	umul	a_5,b_6,t_1	!mul_add_c(a[5],b[6],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	umul	a_6,b_5,t_1	!mul_add_c(a[6],b[5],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_7,b_4,t_1	!mul_add_c(a[7],b[4],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	st	c_3,rp(11)	!r[11]=c3;
	addx	c_2,%g0,c_2	!=

	umul	a_7,b_5,t_1	!mul_add_c(a[7],b[5],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	%g0,%g0,c_3
	umul	a_6,b_6,t_1	!mul_add_c(a[6],b[6],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2		!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_5,b_7,t_1	!mul_add_c(a[5],b[7],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	st	c_1,rp(12)	!r[12]=c1;
	addx	c_3,%g0,c_3	!=

	umul	a_6,b_7,t_1	!mul_add_c(a[6],b[7],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3	!=
	addx	%g0,%g0,c_1
	umul	a_7,b_6,t_1	!mul_add_c(a[7],b[6],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	st	c_2,rp(13)	!r[13]=c2;

	umul	a_7,b_7,t_1	!=!mul_add_c(a[7],b[7],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	nop			!=
	st	c_3,rp(14)	!r[14]=c3;
	st	c_1,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba8,#function
.size	bn_mul_comba8,(.-bn_mul_comba8)
935
.align	32

.global bn_mul_comba4
/*
 * void bn_mul_comba4(r,a,b)
 * BN_ULONG *r,*a,*b;
 *
 * Comba 4x4 multiplication: r[0..7] = a[0..3] * b[0..3].
 * Same column-accumulation scheme as bn_mul_comba8; see the register
 * macros above.  "!=" marks are instruction-grouping annotations.
 */
bn_mul_comba4:
	save	%sp,FRAME_SIZE,%sp
	ld	ap(0),a_0
	ld	bp(0),b_0
	umul	a_0,b_0,c_1	!=!mul_add_c(a[0],b[0],c1,c2,c3);
	ld	bp(1),b_1
	rd	%y,c_2
	st	c_1,rp(0)	!r[0]=c1;

	umul	a_0,b_1,t_1	!=!mul_add_c(a[0],b[1],c2,c3,c1);
	ld	ap(1),a_1
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	%g0,t_2,c_3
	addx	%g0,%g0,c_1
	ld	ap(2),a_2
	umul	a_1,b_0,t_1	!=!mul_add_c(a[1],b[0],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	st	c_2,rp(1)	!r[1]=c2;

	umul	a_2,b_0,t_1	!mul_add_c(a[2],b[0],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	ld	bp(2),b_2
	umul	a_1,b_1,t_1	!=!mul_add_c(a[1],b[1],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	ld	bp(3),b_3
	umul	a_0,b_2,t_1	!mul_add_c(a[0],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	st	c_3,rp(2)	!r[2]=c3;

	umul	a_0,b_3,t_1	!=!mul_add_c(a[0],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	%g0,%g0,c_3	!=
	umul	a_1,b_2,t_1	!mul_add_c(a[1],b[2],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	ld	ap(3),a_3
	umul	a_2,b_1,t_1	!mul_add_c(a[2],b[1],c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_3,b_0,t_1	!=!mul_add_c(a[3],b[0],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(3)	!r[3]=c1;

	umul	a_3,b_1,t_1	!mul_add_c(a[3],b[1],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	umul	a_2,b_2,t_1	!mul_add_c(a[2],b[2],c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	umul	a_1,b_3,t_1	!=!mul_add_c(a[1],b[3],c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	st	c_2,rp(4)	!r[4]=c2;

	umul	a_2,b_3,t_1	!mul_add_c(a[2],b[3],c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	umul	a_3,b_2,t_1	!mul_add_c(a[3],b[2],c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	st	c_3,rp(5)	!r[5]=c3;
	addx	c_2,%g0,c_2	!=

	umul	a_3,b_3,t_1	!mul_add_c(a[3],b[3],c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	st	c_1,rp(6)	!r[6]=c1;
	st	c_2,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_mul_comba4,#function
.size	bn_mul_comba4,(.-bn_mul_comba4)
1049
.align	32

.global bn_sqr_comba8
/*
 * void bn_sqr_comba8(r,a)
 * BN_ULONG *r,*a;
 *
 * Comba squaring: r[0..15] = a[0..7]^2.  Off-diagonal products
 * a[i]*a[j] (i!=j) are added twice (sqr_add_c2), diagonal squares
 * once (sqr_add_c).  "!=" marks are instruction-grouping annotations.
 */
bn_sqr_comba8:
	save	%sp,FRAME_SIZE,%sp
	ld	ap(0),a_0
	ld	ap(1),a_1
	umul	a_0,a_0,c_1	!=!sqr_add_c(a,0,c1,c2,c3);
	rd	%y,c_2
	st	c_1,rp(0)	!r[0]=c1;

	ld	ap(2),a_2
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	%g0,t_2,c_3
	addx	%g0,%g0,c_1	!=
	addcc	c_2,t_1,c_2	! product added twice (off-diagonal)
	addxcc	c_3,t_2,c_3
	st	c_2,rp(1)	!r[1]=c2;
	addx	c_1,%g0,c_1	!=

	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	%g0,%g0,c_2
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	ld	ap(3),a_3
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	st	c_3,rp(2)	!r[2]=c3;

	umul	a_0,a_3,t_1	!=!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	%g0,%g0,c_3	!=
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	ld	ap(4),a_4
	addx	c_3,%g0,c_3	!=
	umul	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(3)	!r[3]=c1;

	umul	a_4,a_0,t_1	!sqr_add_c2(a,4,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	ld	ap(5),a_5
	umul	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	st	c_2,rp(4)	!r[4]=c2;
	addx	c_1,%g0,c_1	!=

	umul	a_0,a_5,t_1	!sqr_add_c2(a,5,0,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	%g0,%g0,c_2
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	umul	a_1,a_4,t_1	!sqr_add_c2(a,4,1,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	ld	ap(6),a_6
	umul	a_2,a_3,t_1	!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	st	c_3,rp(5)	!r[5]=c3;

	umul	a_6,a_0,t_1	!sqr_add_c2(a,6,0,c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	%g0,%g0,c_3
	addcc	c_1,t_1,c_1	!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_5,a_1,t_1	!sqr_add_c2(a,5,1,c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1	!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	umul	a_4,a_2,t_1	!sqr_add_c2(a,4,2,c1,c2,c3);
	addcc	c_1,t_1,c_1	!=
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1	!=
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3
	ld	ap(7),a_7
	umul	a_3,a_3,t_1	!=!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(6)	!r[6]=c1;

	umul	a_0,a_7,t_1	!sqr_add_c2(a,7,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_1,a_6,t_1	!sqr_add_c2(a,6,1,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_2,a_5,t_1	!sqr_add_c2(a,5,2,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_3,a_4,t_1	!sqr_add_c2(a,4,3,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	st	c_2,rp(7)	!r[7]=c2;

	umul	a_7,a_1,t_1	!sqr_add_c2(a,7,1,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	addcc	c_3,t_1,c_3	!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_6,a_2,t_1	!sqr_add_c2(a,6,2,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	addcc	c_3,t_1,c_3	!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_5,a_3,t_1	!sqr_add_c2(a,5,3,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	addcc	c_3,t_1,c_3	!=
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_4,a_4,t_1	!sqr_add_c(a,4,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	st	c_3,rp(8)	!r[8]=c3;
	addx	c_2,%g0,c_2	!=

	umul	a_2,a_7,t_1	!sqr_add_c2(a,7,2,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	%g0,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_3,a_6,t_1	!sqr_add_c2(a,6,3,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_4,a_5,t_1	!sqr_add_c2(a,5,4,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(9)	!r[9]=c1;

	umul	a_7,a_3,t_1	!sqr_add_c2(a,7,3,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_6,a_4,t_1	!sqr_add_c2(a,6,4,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_5,a_5,t_1	!sqr_add_c(a,5,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	st	c_2,rp(10)	!r[10]=c2;

	umul	a_4,a_7,t_1	!=!sqr_add_c2(a,7,4,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2	!=
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2
	umul	a_5,a_6,t_1	!=!sqr_add_c2(a,6,5,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	c_2,%g0,c_2	!=
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	st	c_3,rp(11)	!r[11]=c3;
	addx	c_2,%g0,c_2	!=

	umul	a_7,a_5,t_1	!sqr_add_c2(a,7,5,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	%g0,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_6,a_6,t_1	!sqr_add_c(a,6,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	st	c_1,rp(12)	!r[12]=c1;

	umul	a_6,a_7,t_1	!sqr_add_c2(a,7,6,c2,c3,c1);
	addcc	c_2,t_1,c_2	!=
	rd	%y,t_2
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	addcc	c_2,t_1,c_2	!=
	addxcc	c_3,t_2,c_3
	st	c_2,rp(13)	!r[13]=c2;
	addx	c_1,%g0,c_1	!=

	umul	a_7,a_7,t_1	!sqr_add_c(a,7,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1	!=
	st	c_3,rp(14)	!r[14]=c3;
	st	c_1,rp(15)	!r[15]=c1;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba8,#function
.size	bn_sqr_comba8,(.-bn_sqr_comba8)
1359
.align	32

.global bn_sqr_comba4
/*
 * void bn_sqr_comba4(r,a)
 * BN_ULONG *r,*a;
 *
 * Comba squaring: r[0..7] = a[0..3]^2.  Off-diagonal products are
 * added twice (sqr_add_c2), diagonal squares once (sqr_add_c).
 */
bn_sqr_comba4:
	save	%sp,FRAME_SIZE,%sp
	ld	ap(0),a_0
	umul	a_0,a_0,c_1	!sqr_add_c(a,0,c1,c2,c3);
	ld	ap(1),a_1	!=
	rd	%y,c_2
	st	c_1,rp(0)	!r[0]=c1;

	ld	ap(2),a_2
	umul	a_0,a_1,t_1	!=!sqr_add_c2(a,1,0,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2
	addxcc	%g0,t_2,c_3
	addx	%g0,%g0,c_1	!=
	addcc	c_2,t_1,c_2	! product added twice (off-diagonal)
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1	!=
	st	c_2,rp(1)	!r[1]=c2;

	umul	a_2,a_0,t_1	!sqr_add_c2(a,2,0,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2		!=
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1	!=
	addx	c_2,%g0,c_2
	ld	ap(3),a_3
	umul	a_1,a_1,t_1	!sqr_add_c(a,1,c3,c1,c2);
	addcc	c_3,t_1,c_3	!=
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	st	c_3,rp(2)	!r[2]=c3;
	addx	c_2,%g0,c_2	!=

	umul	a_0,a_3,t_1	!sqr_add_c2(a,3,0,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	%g0,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	umul	a_1,a_2,t_1	!sqr_add_c2(a,2,1,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	addx	c_3,%g0,c_3
	addcc	c_1,t_1,c_1
	addxcc	c_2,t_2,c_2
	addx	c_3,%g0,c_3	!=
	st	c_1,rp(3)	!r[3]=c1;

	umul	a_3,a_1,t_1	!sqr_add_c2(a,3,1,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	%g0,%g0,c_1
	addcc	c_2,t_1,c_2
	addxcc	c_3,t_2,c_3	!=
	addx	c_1,%g0,c_1
	umul	a_2,a_2,t_1	!sqr_add_c(a,2,c2,c3,c1);
	addcc	c_2,t_1,c_2
	rd	%y,t_2		!=
	addxcc	c_3,t_2,c_3
	addx	c_1,%g0,c_1
	st	c_2,rp(4)	!r[4]=c2;

	umul	a_2,a_3,t_1	!=!sqr_add_c2(a,3,2,c3,c1,c2);
	addcc	c_3,t_1,c_3
	rd	%y,t_2
	addxcc	c_1,t_2,c_1
	addx	%g0,%g0,c_2	!=
	addcc	c_3,t_1,c_3
	addxcc	c_1,t_2,c_1
	st	c_3,rp(5)	!r[5]=c3;
	addx	c_2,%g0,c_2	!=

	umul	a_3,a_3,t_1	!sqr_add_c(a,3,c1,c2,c3);
	addcc	c_1,t_1,c_1
	rd	%y,t_2
	addxcc	c_2,t_2,c_2	!=
	st	c_1,rp(6)	!r[6]=c1;
	st	c_2,rp(7)	!r[7]=c2;

	ret
	restore	%g0,%g0,%o0

.type	bn_sqr_comba4,#function
.size	bn_sqr_comba4,(.-bn_sqr_comba4)

.align	32
