#! /usr/bin/env perl
# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by David S. Miller and Andy Polyakov
# The module is licensed under 2-clause BSD license.
# November 2012. All rights reserved.
# ====================================================================

######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about
# the 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It could surely be improved
# [by deploying the 'mpmul' instruction], maybe in the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000628s 0.000028s   1592.4  35434.4
# rsa 2048 bits 0.003282s 0.000106s    304.7   9438.3
# rsa 4096 bits 0.025866s 0.000340s     38.7   2940.9
# dsa 1024 bits 0.000301s 0.000332s   3323.7   3013.9
# dsa 2048 bits 0.001056s 0.001233s    946.9    810.8
#
# 64-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000256s 0.000016s   3904.4  61411.9
# rsa 2048 bits 0.000946s 0.000029s   1056.8  34292.7
# rsa 4096 bits 0.005061s 0.000340s    197.6   2940.5
# dsa 1024 bits 0.000176s 0.000195s   5674.7   5130.5
# dsa 2048 bits 0.000296s 0.000354s   3383.2   2827.6
#
######################################################################
# 32-bit process, VIS3:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000665s 0.000028s   1504.8  35233.3
# rsa 2048 bits 0.003349s 0.000106s    298.6   9433.4
# rsa 4096 bits 0.025959s 0.000341s     38.5   2934.8
# dsa 1024 bits 0.000320s 0.000341s   3123.3   2929.6
# dsa 2048 bits 0.001101s 0.001260s    908.2    793.4
#
# 32-bit process, this module:
#                   sign    verify    sign/s verify/s
# rsa 1024 bits 0.000301s 0.000017s   3317.1  60240.0
# rsa 2048 bits 0.001034s 0.000030s    966.9  33812.7
# rsa 4096 bits 0.005244s 0.000341s    190.7   2935.4
# dsa 1024 bits 0.000201s 0.000205s   4976.1   4879.2
# dsa 2048 bits 0.000328s 0.000360s   3051.1   2774.2
#
# 32-bit code is prone to performance degradation as the interrupt
# rate on the CPU executing the code grows. This is because, in the
# standard handling of an interrupt in a 32-bit process context, the
# upper halves of most integer registers used as input or output are
# zeroed. This renders the result invalid, and the operation has to be
# re-run. If the CPU is "bothered" with timer interrupts only, the
# penalty is hardly measurable. But in order to mitigate this problem
# for higher interrupt rates, contemporary Linux kernels recognize the
# biased stack even in a 32-bit process context and preserve the full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# for details.

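# The montmul/montsqr paths emitted below compute a Montgomery product
# a*b*R^(-1) mod n with R = 2^(64*NUM).  The following is a minimal
# pure-Perl reference sketch of that operation, assuming core
# Math::BigInt; the helper is hypothetical, is never called by this
# module, and only documents the arithmetic the assembly implements.
use Math::BigInt;

sub _mont_mul_ref {
	my ($a, $b, $n, $num) = @_;	# Math::BigInt values, $num = 64-bit limbs
	my $R  = Math::BigInt->bone->blsft(64*$num);	# R = 2^(64*NUM)
	my $ni = $n->copy->bmodinv($R);			# n^(-1) mod R, n must be odd
	my $t  = $a->copy->bmul($b);			# t = a*b
	my $m  = $t->copy->bmul($ni)->bneg->bmod($R);	# m = -t*n^(-1) mod R
	my $u  = $t->badd($m->bmul($n))->brsft(64*$num);# u = (t + m*n)/R
	$u->bsub($n) if $u->bcmp($n) >= 0;		# single conditional subtract
	return $u;					# == a*b*R^(-1) mod n
}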
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop and open STDOUT,">$output";

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef	__PIC__
SPARC_PIC_THUNK(%g1)
#endif
___

########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));
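
# A throwaway debugging aid (not used anywhere in this module, name
# hypothetical): when hacking on the register maps above it can help to
# print which operand slot each array assigns to which register.  Output
# goes to STDERR so the generated assembly on STDOUT is not polluted.
sub _dump_reg_layout {
	my %maps = (A => \@A, B => \@B, N => \@N, R => \@R);
	foreach my $name (sort keys %maps) {
		my $regs = $maps{$name};
		foreach my $slot (0 .. $#$regs) {
			printf STDERR "%s[%2d] = %s\n", $name, $slot, $regs->[$slot];
		}
	}
}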

########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
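# A sketch (illustration only, names hypothetical) of the contract implied
# by the return value of the generated routines: they return 1 on success
# and 0 when, on 32-bit systems, clobbered register halves were detected
# (see the interrupt note above), in which case the caller simply re-runs
# the operation on a generic fall-back path.
sub _mont_with_fallback {
	my ($t4_op, $fallback_op, @args) = @_;	# code refs plus operands
	return 1 if $t4_op->(@args);		# hardware path succeeded
	return $fallback_op->(@args);		# retry on the fall-back path
}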
sub generate_bn_mul_mont_t4() {
my $NUM=shift;
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_mul_mont_t4_$NUM
.align	32
bn_mul_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$rp
	mov	%i1,$ap
	mov	%i2,$bp
	mov	%i3,$np
	ld	[%i4+0],%f1	! load *n0
	ld	[%i4+4],%f0
	fsrc2	%f0,%f60
___

# load ap[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
	or	$lo,@A[$i],@A[$i]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	ld	[$ap+$i*8+0],$lo
	ld	[$ap+$i*8+4],$hi
	fsrc2	$hi,@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
$code.=<<___;
	ld	[$np+$i*8+0],$lo
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	or	$lo,@N[$i],@N[$i]
___
}
$code.=<<___;
	cmp	$ap,$bp
	be	SIZE_T_CC,.Lmsquare_$NUM
	nop
___

# load bp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
$code.=<<___;
	ld	[$bp+$i*8+0],$lo
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	or	$lo,@B[$i],@B[$i]
___
}
# magic ################################################################
$code.=<<___;
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
.Lmresume_$NUM:
	fbu,pn	%fcc3,.Lmabort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
#endif
	nop
#ifdef	__arch64__
	restore
	restore
	restore
	restore
	restore
#else
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Lmabort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
$code.=<<___;
	st	$lo,[$rp+$i*8+0]
	st	@R[$i],[$rp+$i*8+4]
___
}
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
$code.=<<___;
	fsrc2	@R[$i],$hi
	st	$lo,[$rp+$i*8+0]
	st	$hi,[$rp+$i*8+4]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Lmdone_$NUM:
	ret
	restore

.Lmabort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Lmabort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore

.align	32
.Lmsquare_$NUM:
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word   0x81b02940+$NUM-1	! montsqr	$NUM-1
	ba	.Lmresume_$NUM
	nop
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
}
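
# A sketch (illustration only, helper name hypothetical) of how a caller
# could map an operand size onto the fixed-size entry points generated by
# the loop above: $NUM is the number of 64-bit limbs, so the 512-, 1024-,
# 1536- and 2048-bit cases land on bn_mul_mont_t4_{8,16,24,32}.
sub _pick_mont_t4_entry {
	my $bits = shift;			# modulus size in bits
	return undef if ($bits % 512);		# only 8/16/24/32-limb sizes exist
	my $num = $bits/64;			# number of 64-bit limbs
	return undef if ($num < 8 || $num > 32);
	return "bn_mul_mont_t4_$num";		# e.g. 2048 -> bn_mul_mont_t4_32
}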

########################################################################
#
sub load_ccr {
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
$code.=<<___;
	srl	$pwr,	2,	%o4
	and	$pwr,	3,	%o5
	and	%o4,	7,	%o4
	sll	%o5,	3,	%o5	! offset within first cache line
	add	%o5,	$ptbl,	$ptbl	! of the pwrtbl
	or	%g0,	1,	%o5
	sll	%o5,	%o4,	$ccr
___
$code.=<<___	if (!$skip_wr);
	wr	$ccr,	%g0,	%ccr
___
}
sub load_b_pair {
my ($pwrtbl,$B0,$B1)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	movvs	%icc,	%o4,	$B0
	ldx	[$pwrtbl+2*32],	%o4
	movvs	%icc,	%o5,	$B1
	ldx	[$pwrtbl+10*32],%o5
	move	%icc,	%o4,	$B0
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$B1
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc,	%o4,	$B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc,	%o5,	$B1
	ldx	[$pwrtbl+12*32],%o5
	movcs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+5*32],%o4
	movcs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+13*32],%o5
	movvs	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+6*32],	%o4
	movvs	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+14*32],%o5
	move	%xcc,	%o4,	$B0
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$B1
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc,	%o4,	$B0
	add	$pwrtbl,16*32,	$pwrtbl
	movneg	%xcc,	%o5,	$B1
___
}
sub load_b {
my ($pwrtbl,$Bi)=@_;

$code.=<<___;
	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	movvs	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+3*32],	%o4
	move	%icc,	%o5,	$Bi
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc,	%o4,	$Bi
	ldx	[$pwrtbl+5*32],	%o4
	movcs	%xcc,	%o5,	$Bi
	ldx	[$pwrtbl+6*32],	%o5
	movvs	%xcc,	%o4,	$Bi
	ldx	[$pwrtbl+7*32],	%o4
	move	%xcc,	%o5,	$Bi
	add	$pwrtbl,8*32,	$pwrtbl
	movneg	%xcc,	%o4,	$Bi
___
}
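
# A pure-Perl model (illustration only, never called) of the constant-time
# selection idea behind load_ccr/load_b above: the two low bits of the
# power index become a fixed in-line offset, the remaining three bits a
# one-hot %ccr mask, so every candidate is touched and the wanted one is
# picked by conditional moves rather than by a secret-dependent address.
sub _ct_select_model {
	my ($tbl, $pwr) = @_;		# $tbl: ref to the 32 values of one limb
	my $off = $pwr & 3;		# offset applied to the pointer up front
	my $sel = ($pwr >> 2) & 7;	# which of the 8 stride-4 candidates wins
	my $ret = 0;
	for my $i (0 .. 7) {		# touch all eight candidates
		my $mask = ($i == $sel) ? ~0 : 0;	# all-ones when selected
		$ret |= $tbl->[$off + 4*$i] & $mask;
	}
	return $ret;
}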

########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
#
sub generate_bn_pwr5_mont_t4() {
my $NUM=shift;
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

$code.=<<___;
.globl	bn_pwr5_mont_t4_$NUM
.align	32
bn_pwr5_mont_t4_$NUM:
#ifdef	__arch64__
	mov	0,$sentinel
	mov	-128,%g4
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9_P[0]
	mov	-2047,%g4
	and	%g1,SPARCV9_64BIT_STACK,%g1
	movrz	%g1,0,%g4
	mov	-1,$sentinel
	add	%g4,-128,%g4
#else
	mov	-1,$sentinel
	mov	-128,%g4
#endif
	sllx	$sentinel,32,$sentinel
	save	%sp,%g4,%sp
#ifndef	__arch64__
	save	%sp,-128,%sp	! warm it up
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	save	%sp,-128,%sp
	restore
	restore
	restore
	restore
	restore
	restore
#endif
	and	%sp,1,%g4
	or	$sentinel,%fp,%fp
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	mov	%i0,$tp
	mov	%i1,$np
	ld	[%i2+0],%f1	! load *n0
	ld	[%i2+4],%f0
	mov	%i3,$pwrtbl
	srl	%i4,%g0,%i4	! pack last arguments
	sllx	%i5,32,$pwr
	or	%i4,$pwr,$pwr
	fsrc2	%f0,%f60
___

# load tp[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$tp+$i*8],@A[$i]
___
}
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldd	[$tp+$i*8],@A[$i]
___
}
# load np[$NUM] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<28 && $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i++) {
$code.=<<___;
	ldx	[$np+$i*8],@N[$i]
___
}
# load pwrtbl[pwr] ########################################################
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp

	srlx	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%o7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%o7","%o5","%o4");
$code.=<<___;
	b	.Lstride_$NUM
	nop
.align	16
.Lstride_$NUM:
___
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
___
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
}
$code.=<<___;
	srax	$pwr,	32,	%o4		! unpack $pwr
	srl	$pwr,	%g0,	%o5
	sub	%o4,	5,	%o4
	mov	$pwrtbl,	%i7
	sllx	%o4,	32,	$pwr		! re-pack $pwr
	or	%o5,	$pwr,	$pwr
	srl	%o5,	%o4,	%o5
___
	&load_ccr("%i7","%o5","%o4",1);

# magic ################################################################
for($i=0; $i<5; $i++) {
$code.=<<___;
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif
	nop
___
}
$code.=<<___;
	wr	%o4,	%g0,	%ccr
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
#ifndef	__arch64__
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
#endif

	srax	$pwr,	32,	%o4
#ifdef	__arch64__
	brgez	%o4,.Lstride_$NUM
	restore
	restore
	restore
	restore
	restore
#else
	brgez	%o4,.Lstride_$NUM
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	restore;		and	%fp,$sentinel,$sentinel
	 brz,pn	$sentinel,.Labort1_$NUM
	restore
#endif
___

# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
$code.=<<___;
	movxtod	@A[$i],@R[$i]
___
}
$code.=<<___;
#ifdef	__arch64__
	restore
#else
	 and	%fp,$sentinel,$sentinel
	restore
	 and	$sentinel,1,%o7
	 and	%fp,$sentinel,$sentinel
	 srl	%fp,0,%fp		! just in case?
	 or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
#endif
___
for($i=0; $i<$NUM; $i++) {
$code.=<<___;
	std	@R[$i],[$tp+$i*8]
___
}
$code.=<<___;
	mov	1,%i0		! return success
.Ldone_$NUM:
	ret
	restore

.Labort_$NUM:
	restore
	restore
	restore
	restore
	restore
.Labort1_$NUM:
	restore

	mov	0,%i0		! return failure
	ret
	restore
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
___
}

for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
}
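
# A sketch (illustration only) of the fixed-window exponentiation step a
# bn_pwr5_mont_t4_$NUM call corresponds to: five Montgomery squarings
# followed by one Montgomery multiplication by the table entry selected
# by the current 5-bit window.  _mont_mul_ref is the hypothetical
# reference helper sketched near the top of this file; nothing here is
# called by the generator itself.
sub _pwr5_step_ref {
	my ($acc, $table, $window, $n, $num) = @_;	# $acc, $n: Math::BigInt
	for (1 .. 5) {					# 5 squarings
		$acc = _mont_mul_ref($acc, $acc, $n, $num);
	}
	return _mont_mul_ref($acc, $table->[$window], $n, $num);	# 1 multiplication
}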

{
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

# int bn_mul_mont(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
$code.=<<___;
.globl	bn_mul_mont_t4
.align	32
bn_mul_mont_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
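# A small sketch (illustration only, never called) of the frame-size
# arithmetic done on entry above: the temporary vector of num 64-bit words
# is rounded up to a multiple of 64 bytes and placed just above the
# STACK_FRAME area of the new register window, so that tp[] ends up
# 64-byte aligned.  STACK_BIAS and STACK_FRAME come from sparcv9_modes.pl
# at assembly time; here they are plain parameters.
sub _frame_delta_model {
	my ($top, $num, $stack_frame) = @_;	# $top = %sp + STACK_BIAS
	my $buf = ($num*8 + 63) & ~63;		# buffer size rounded up to 64 bytes
	my $new = (($top - $buf) & ~63) - $stack_frame;	# new top of stack
	return $new - $top;			# negative delta handed to "save"
}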
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]
	sllx	$t1,	32,	$n0
	add	$bp,	8,	$bp
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st
	sub	$cnt,	8,	$cnt	! j--
!.L1st
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter:
	ldx	[$bp+0],	$m0	! m0=bp[i]
	add	$bp,	8,	$bp

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner
	sub	$cnt,	8,	$cnt
!.Linner
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy
	sub	$num,	8,	$cnt

.align	16
.Lcopy:					! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
___

# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3
		# int power);
$code.=<<___;
.globl	bn_mul_mont_gather5_t4
.align	32
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	add	$num,	63,	%g1
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	sub	%g4,	%g1,	%g1
	andn	%g1,	63,	%g1		! align at 64 byte
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	sub	%g1,	%g4,	%g1
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

	save	%sp,	%g1,	%sp
___
#	+-------------------------------+<-----	%sp
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	| __int64 tmp[0]		|
#	+-------------------------------+
#	.				.
#	.				.
#	+-------------------------------+<-----	aligned at 64 bytes
#	.				.
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
	&load_ccr($bp,"%g4",$ccr);
	&load_b($bp,$m0,"%o7");		# m0=bp[0]

$code.=<<___;
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	ld	[$n0p+4],	$t1
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	sllx	$t1,	32,	$n0
	or	$t0,	$n0,	$n0

	ldx	[$ap+0],	$aj	! ap[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0

	ldx	[$ap+8],	$aj	! ap[1]
	add	$ap,	16,	$ap
	ldx	[$np+0],	$nj	! np[0]

	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0

	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1

	ldx	[$np+8],	$nj	! np[1]

	addcc	$lo0,	$lo1,	$lo1
	add	$np,	16,	$np
	addxc	%g0,	$hi1,	$hi1

	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.L1st_g5
	sub	$num,	24,	$cnt	! cnt=num-3

.align	16
.L1st_g5:
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0

	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	add	$ap,	8,	$ap
	addxc	$nj,	%g0,	$hi1	! nhi=nj

	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj

	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++

	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--
!.L1st_g5
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	add	$tp,	8,	$tp

	ba	.Louter_g5
	sub	$num,	16,	$i	! i=num-2

.align	16
.Louter_g5:
	wr	$ccr,	%g0,	%ccr
___
	&load_b($bp,$m0);		# m0=bp[i]
$code.=<<___;
	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]

	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],		$tj	! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	add	$ap,	16,	$ap
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	add	$np,	16,	$np
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj

	ba	.Linner_g5
	sub	$num,	24,	$cnt	! cnt=num-3
.align	16
.Linner_g5:
	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	add	$ap,	8,	$ap
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	add	$np,	8,	$np
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	add	$tp,	8,	$tp
	brnz,pt	$cnt,	.Linner_g5
	sub	$cnt,	8,	$cnt
!.Linner_g5
	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0

	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	stx	$hi1,	[$tp+8]
	add	$tp,	16,	$tp

	brnz,pt	$i,	.Louter_g5
	sub	$i,	8,	$i

	sub	$ap,	$num,	$ap	! rewind
	sub	$np,	$num,	$np
	sub	$tp,	$num,	$tp
	ba	.Lsub_g5
	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

.align	16
.Lsub_g5:
	ldx	[$tp],		$tj
	add	$tp,	8,	$tp
	ldx	[$np+0],	$nj
	add	$np,	8,	$np
	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	srlx	$tj,	32,	$tj
	srlx	$nj,	32,	$nj
	subccc	$tj,	$nj,	$t3
	add	$rp,	8,	$rp
	st	$t2,	[$rp-4]		! reverse order
	st	$t3,	[$rp-8]
	brnz,pt	$cnt,	.Lsub_g5
	sub	$cnt,	8,	$cnt

	sub	$np,	$num,	$np	! rewind
	sub	$tp,	$num,	$tp
	sub	$rp,	$num,	$rp

	subccc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	ba	.Lcopy_g5
	sub	$num,	8,	$cnt

.align	16
.Lcopy_g5:				! conditional copy
	ldx	[$tp],		$tj
	ldx	[$rp+0],	$t2
	stx	%g0,	[$tp]		! zap
	add	$tp,	8,	$tp
	movcs	%icc,	$tj,	$t2
	stx	$t2,	[$rp+0]
	add	$rp,	8,	$rp
	brnz	$cnt,	.Lcopy_g5
	sub	$cnt,	8,	$cnt

	mov	1,	%o0
	ret
	restore
.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
___
}

$code.=<<___;
.globl	bn_flip_t4
.align	32
bn_flip_t4:
.Loop_flip:
	ld	[%o1+0],	%o4
	sub	%o2,	1,	%o2
	ld	[%o1+4],	%o5
	add	%o1,	8,	%o1
	st	%o5,	[%o0+0]
	st	%o4,	[%o0+4]
	brnz	%o2,	.Loop_flip
	add	%o0,	8,	%o0
	retl
	nop
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
.align	32
bn_flip_n_scatter5_t4:
	sll	%o3,	3,	%o3
	srl	%o1,	1,	%o1
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
	sub	%o1,	1,	%o1
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	ld	[%o0+4],	%o5
	add	%o0,	8,	%o0
	sllx	%o5,	32,	%o5
	or	%o4,	%o5,	%o5
	stx	%o5,	[%o2]
	add	%o2,	32*8,	%o2
	brnz	%o1,	.Loop_flip_n_scatter5
	sub	%o1,	1,	%o1
	retl
	nop
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4

.globl	bn_gather5_t4
.align	32
bn_gather5_t4:
___
	&load_ccr("%o2","%o3","%g1");
$code.=<<___;
	sub	%o1,	1,	%o1
.Loop_gather5:
___
	&load_b("%o2","%g1");
$code.=<<___;
	stx	%g1,	[%o0]
	add	%o0,	8,	%o0
	brnz	%o1,	.Loop_gather5
	sub	%o1,	1,	%o1

	retl
	nop
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___
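
# A sketch (illustration only, helper hypothetical) of the table layout
# established by bn_flip_n_scatter5_t4 and consumed by bn_gather5_t4 and
# the load_ccr/load_b pair above: limb $j of the entry for power $pwr is
# stored at 64-bit word index $pwr + 32*$j, i.e. all 32 powers of a given
# limb share one 256-byte stripe, and the masked gather touches every
# cache line of that stripe regardless of the power index.
sub _pwrtbl_index {
	my ($pwr, $j) = @_;		# power index (0..31), limb index
	return $pwr + 32*$j;		# 64-bit word index within pwrtbl
}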

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";