xref: /openssl/crypto/bn/asm/ppc-mont.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# April 2006
18
19# "Teaser" Montgomery multiplication module for PowerPC. It's possible
20# to gain a bit more by modulo-scheduling outer loop, then dedicated
21# squaring procedure should give further 20% and code can be adapted
22# for 32-bit application running on 64-bit CPU. As for the latter.
23# It won't be able to achieve "native" 64-bit performance, because in
24# 32-bit application context every addc instruction will have to be
25# expanded as addc, twice right shift by 32 and finally adde, etc.
26# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
27# for 64-bit application running on PPC970/G5 is:
28#
29# 512-bit	+65%
30# 1024-bit	+35%
31# 2048-bit	+18%
32# 4096-bit	+4%
33
34# September 2016
35#
36# Add multiplication procedure operating on lengths divisible by 4
37# and squaring procedure operating on lengths divisible by 8. Length
38# is expressed in number of limbs. RSA private key operations are
39# ~35-50% faster (more for longer keys) on contemporary high-end POWER
40# processors in 64-bit builds, [mysteriously enough] more in 32-bit
41# builds. On low-end 32-bit processors performance improvement turned
42# to be marginal...
43
44# $output is the last argument if it looks like a file (it has an extension)
45# $flavour is the first argument if it doesn't look like a file
46$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
47$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
48
49if ($flavour =~ /32/) {
50	$BITS=	32;
51	$BNSZ=	$BITS/8;
52	$SIZE_T=4;
53	$RZONE=	224;
54
55	$LD=	"lwz";		# load
56	$LDU=	"lwzu";		# load and update
57	$LDX=	"lwzx";		# load indexed
58	$ST=	"stw";		# store
59	$STU=	"stwu";		# store and update
60	$STX=	"stwx";		# store indexed
61	$STUX=	"stwux";	# store indexed and update
62	$UMULL=	"mullw";	# unsigned multiply low
63	$UMULH=	"mulhwu";	# unsigned multiply high
64	$UCMP=	"cmplw";	# unsigned compare
65	$SHRI=	"srwi";		# unsigned shift right by immediate
66	$SHLI=	"slwi";		# unsigned shift left by immediate
67	$PUSH=	$ST;
68	$POP=	$LD;
69} elsif ($flavour =~ /64/) {
70	$BITS=	64;
71	$BNSZ=	$BITS/8;
72	$SIZE_T=8;
73	$RZONE=	288;
74
75	# same as above, but 64-bit mnemonics...
76	$LD=	"ld";		# load
77	$LDU=	"ldu";		# load and update
78	$LDX=	"ldx";		# load indexed
79	$ST=	"std";		# store
80	$STU=	"stdu";		# store and update
81	$STX=	"stdx";		# store indexed
82	$STUX=	"stdux";	# store indexed and update
83	$UMULL=	"mulld";	# unsigned multiply low
84	$UMULH=	"mulhdu";	# unsigned multiply high
85	$UCMP=	"cmpld";	# unsigned compare
86	$SHRI=	"srdi";		# unsigned shift right by immediate
87	$SHLI=	"sldi";		# unsigned shift left by immediate
88	$PUSH=	$ST;
89	$POP=	$LD;
90} else { die "nonsense $flavour"; }
91
92$FRAME=8*$SIZE_T+$RZONE;
93$LOCALS=8*$SIZE_T;
94
95$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
96( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
97( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
98die "can't locate ppc-xlate.pl";
99
100open STDOUT,"| $^X $xlate $flavour \"$output\""
101    or die "can't call $xlate: $!";
102
103$sp="r1";
104$toc="r2";
105$rp="r3";
106$ap="r4";
107$bp="r5";
108$np="r6";
109$n0="r7";
110$num="r8";
111
112{
113my $ovf=$rp;
114my $rp="r9";	# $rp is reassigned
115my $aj="r10";
116my $nj="r11";
117my $tj="r12";
118# non-volatile registers
119my $i="r20";
120my $j="r21";
121my $tp="r22";
122my $m0="r23";
123my $m1="r24";
124my $lo0="r25";
125my $hi0="r26";
126my $lo1="r27";
127my $hi1="r28";
128my $alo="r29";
129my $ahi="r30";
130my $nlo="r31";
131#
132my $nhi="r0";
133
134$code=<<___;
135.machine "any"
136.text
137
138.globl	.bn_mul_mont_int
139.align	5
140.bn_mul_mont_int:
141	mr	$rp,r3		; $rp is reassigned
142	li	r3,0
143___
144$code.=<<___ if ($BNSZ==4);
145	cmpwi	$num,32		; longer key performance is not better
146	bgelr
147___
148$code.=<<___;
149	slwi	$num,$num,`log($BNSZ)/log(2)`
150	li	$tj,-4096
151	addi	$ovf,$num,$FRAME
152	subf	$ovf,$ovf,$sp	; $sp-$ovf
153	and	$ovf,$ovf,$tj	; minimize TLB usage
154	subf	$ovf,$sp,$ovf	; $ovf-$sp
155	mr	$tj,$sp
156	srwi	$num,$num,`log($BNSZ)/log(2)`
157	$STUX	$sp,$sp,$ovf
158
159	$PUSH	r20,`-12*$SIZE_T`($tj)
160	$PUSH	r21,`-11*$SIZE_T`($tj)
161	$PUSH	r22,`-10*$SIZE_T`($tj)
162	$PUSH	r23,`-9*$SIZE_T`($tj)
163	$PUSH	r24,`-8*$SIZE_T`($tj)
164	$PUSH	r25,`-7*$SIZE_T`($tj)
165	$PUSH	r26,`-6*$SIZE_T`($tj)
166	$PUSH	r27,`-5*$SIZE_T`($tj)
167	$PUSH	r28,`-4*$SIZE_T`($tj)
168	$PUSH	r29,`-3*$SIZE_T`($tj)
169	$PUSH	r30,`-2*$SIZE_T`($tj)
170	$PUSH	r31,`-1*$SIZE_T`($tj)
171
172	$LD	$n0,0($n0)	; pull n0[0] value
173	addi	$num,$num,-2	; adjust $num for counter register
174
175	$LD	$m0,0($bp)	; m0=bp[0]
176	$LD	$aj,0($ap)	; ap[0]
177	addi	$tp,$sp,$LOCALS
178	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
179	$UMULH	$hi0,$aj,$m0
180
181	$LD	$aj,$BNSZ($ap)	; ap[1]
182	$LD	$nj,0($np)	; np[0]
183
184	$UMULL	$m1,$lo0,$n0	; "tp[0]"*n0
185
186	$UMULL	$alo,$aj,$m0	; ap[1]*bp[0]
187	$UMULH	$ahi,$aj,$m0
188
189	$UMULL	$lo1,$nj,$m1	; np[0]*m1
190	$UMULH	$hi1,$nj,$m1
191	$LD	$nj,$BNSZ($np)	; np[1]
192	addc	$lo1,$lo1,$lo0
193	addze	$hi1,$hi1
194
195	$UMULL	$nlo,$nj,$m1	; np[1]*m1
196	$UMULH	$nhi,$nj,$m1
197
198	mtctr	$num
199	li	$j,`2*$BNSZ`
200.align	4
201L1st:
202	$LDX	$aj,$ap,$j	; ap[j]
203	addc	$lo0,$alo,$hi0
204	$LDX	$nj,$np,$j	; np[j]
205	addze	$hi0,$ahi
206	$UMULL	$alo,$aj,$m0	; ap[j]*bp[0]
207	addc	$lo1,$nlo,$hi1
208	$UMULH	$ahi,$aj,$m0
209	addze	$hi1,$nhi
210	$UMULL	$nlo,$nj,$m1	; np[j]*m1
211	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
212	$UMULH	$nhi,$nj,$m1
213	addze	$hi1,$hi1
214	$ST	$lo1,0($tp)	; tp[j-1]
215
216	addi	$j,$j,$BNSZ	; j++
217	addi	$tp,$tp,$BNSZ	; tp++
218	bdnz	L1st
219;L1st
220	addc	$lo0,$alo,$hi0
221	addze	$hi0,$ahi
222
223	addc	$lo1,$nlo,$hi1
224	addze	$hi1,$nhi
225	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[0]
226	addze	$hi1,$hi1
227	$ST	$lo1,0($tp)	; tp[j-1]
228
229	li	$ovf,0
230	addc	$hi1,$hi1,$hi0
231	addze	$ovf,$ovf	; upmost overflow bit
232	$ST	$hi1,$BNSZ($tp)
233
234	li	$i,$BNSZ
235.align	4
236Louter:
237	$LDX	$m0,$bp,$i	; m0=bp[i]
238	$LD	$aj,0($ap)	; ap[0]
239	addi	$tp,$sp,$LOCALS
240	$LD	$tj,$LOCALS($sp); tp[0]
241	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
242	$UMULH	$hi0,$aj,$m0
243	$LD	$aj,$BNSZ($ap)	; ap[1]
244	$LD	$nj,0($np)	; np[0]
245	addc	$lo0,$lo0,$tj	; ap[0]*bp[i]+tp[0]
246	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
247	addze	$hi0,$hi0
248	$UMULL	$m1,$lo0,$n0	; tp[0]*n0
249	$UMULH	$ahi,$aj,$m0
250	$UMULL	$lo1,$nj,$m1	; np[0]*m1
251	$UMULH	$hi1,$nj,$m1
252	$LD	$nj,$BNSZ($np)	; np[1]
253	addc	$lo1,$lo1,$lo0
254	$UMULL	$nlo,$nj,$m1	; np[1]*m1
255	addze	$hi1,$hi1
256	$UMULH	$nhi,$nj,$m1
257
258	mtctr	$num
259	li	$j,`2*$BNSZ`
260.align	4
261Linner:
262	$LDX	$aj,$ap,$j	; ap[j]
263	addc	$lo0,$alo,$hi0
264	$LD	$tj,$BNSZ($tp)	; tp[j]
265	addze	$hi0,$ahi
266	$LDX	$nj,$np,$j	; np[j]
267	addc	$lo1,$nlo,$hi1
268	$UMULL	$alo,$aj,$m0	; ap[j]*bp[i]
269	addze	$hi1,$nhi
270	$UMULH	$ahi,$aj,$m0
271	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
272	$UMULL	$nlo,$nj,$m1	; np[j]*m1
273	addze	$hi0,$hi0
274	$UMULH	$nhi,$nj,$m1
275	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
276	addi	$j,$j,$BNSZ	; j++
277	addze	$hi1,$hi1
278	$ST	$lo1,0($tp)	; tp[j-1]
279	addi	$tp,$tp,$BNSZ	; tp++
280	bdnz	Linner
281;Linner
282	$LD	$tj,$BNSZ($tp)	; tp[j]
283	addc	$lo0,$alo,$hi0
284	addze	$hi0,$ahi
285	addc	$lo0,$lo0,$tj	; ap[j]*bp[i]+tp[j]
286	addze	$hi0,$hi0
287
288	addc	$lo1,$nlo,$hi1
289	addze	$hi1,$nhi
290	addc	$lo1,$lo1,$lo0	; np[j]*m1+ap[j]*bp[i]+tp[j]
291	addze	$hi1,$hi1
292	$ST	$lo1,0($tp)	; tp[j-1]
293
294	addic	$ovf,$ovf,-1	; move upmost overflow to XER[CA]
295	li	$ovf,0
296	adde	$hi1,$hi1,$hi0
297	addze	$ovf,$ovf
298	$ST	$hi1,$BNSZ($tp)
299;
300	slwi	$tj,$num,`log($BNSZ)/log(2)`
301	$UCMP	$i,$tj
302	addi	$i,$i,$BNSZ
303	ble	Louter
304
305	addi	$num,$num,2	; restore $num
306	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
307	addi	$tp,$sp,$LOCALS
308	mtctr	$num
309
310.align	4
311Lsub:	$LDX	$tj,$tp,$j
312	$LDX	$nj,$np,$j
313	subfe	$aj,$nj,$tj	; tp[j]-np[j]
314	$STX	$aj,$rp,$j
315	addi	$j,$j,$BNSZ
316	bdnz	Lsub
317
318	li	$j,0
319	mtctr	$num
320	subfe	$ovf,$j,$ovf	; handle upmost overflow bit
321
322.align	4
323Lcopy:				; conditional copy
324	$LDX	$tj,$tp,$j
325	$LDX	$aj,$rp,$j
326	and	$tj,$tj,$ovf
327	andc	$aj,$aj,$ovf
328	$STX	$j,$tp,$j	; zap at once
329	or	$aj,$aj,$tj
330	$STX	$aj,$rp,$j
331	addi	$j,$j,$BNSZ
332	bdnz	Lcopy
333
334	$POP	$tj,0($sp)
335	li	r3,1
336	$POP	r20,`-12*$SIZE_T`($tj)
337	$POP	r21,`-11*$SIZE_T`($tj)
338	$POP	r22,`-10*$SIZE_T`($tj)
339	$POP	r23,`-9*$SIZE_T`($tj)
340	$POP	r24,`-8*$SIZE_T`($tj)
341	$POP	r25,`-7*$SIZE_T`($tj)
342	$POP	r26,`-6*$SIZE_T`($tj)
343	$POP	r27,`-5*$SIZE_T`($tj)
344	$POP	r28,`-4*$SIZE_T`($tj)
345	$POP	r29,`-3*$SIZE_T`($tj)
346	$POP	r30,`-2*$SIZE_T`($tj)
347	$POP	r31,`-1*$SIZE_T`($tj)
348	mr	$sp,$tj
349	blr
350	.long	0
351	.byte	0,12,4,0,0x80,12,6,0
352	.long	0
353.size	.bn_mul_mont_int,.-.bn_mul_mont_int
354___
355}
356if (1) {
357my ($a0,$a1,$a2,$a3,
358    $t0,$t1,$t2,$t3,
359    $m0,$m1,$m2,$m3,
360    $acc0,$acc1,$acc2,$acc3,$acc4,
361    $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31));
362my  ($carry,$zero) = ($rp,"r0");
363
364# sp----------->+-------------------------------+
365#		| saved sp			|
366#		+-------------------------------+
367#		.				.
368# +8*size_t	+-------------------------------+
369#		| 4 "n0*t0"			|
370#		.				.
371#		.				.
372# +12*size_t	+-------------------------------+
373#		| size_t tmp[num]		|
374#		.				.
375#		.				.
376#		.				.
377#		+-------------------------------+
378#		| topmost carry			|
379#		.				.
380# -18*size_t	+-------------------------------+
381#		| 18 saved gpr, r14-r31		|
382#		.				.
383#		.				.
384#		+-------------------------------+
385$code.=<<___;
386.globl	.bn_mul4x_mont_int
387.align	5
388.bn_mul4x_mont_int:
389	andi.	r0,$num,7
390	bne	.Lmul4x_do
391	$UCMP	$ap,$bp
392	bne	.Lmul4x_do
393	b	.Lsqr8x_do
394.Lmul4x_do:
395	slwi	$num,$num,`log($SIZE_T)/log(2)`
396	mr	$a0,$sp
397	li	$a1,-32*$SIZE_T
398	sub	$a1,$a1,$num
399	$STUX	$sp,$sp,$a1		# alloca
400
401	$PUSH	r14,-$SIZE_T*18($a0)
402	$PUSH	r15,-$SIZE_T*17($a0)
403	$PUSH	r16,-$SIZE_T*16($a0)
404	$PUSH	r17,-$SIZE_T*15($a0)
405	$PUSH	r18,-$SIZE_T*14($a0)
406	$PUSH	r19,-$SIZE_T*13($a0)
407	$PUSH	r20,-$SIZE_T*12($a0)
408	$PUSH	r21,-$SIZE_T*11($a0)
409	$PUSH	r22,-$SIZE_T*10($a0)
410	$PUSH	r23,-$SIZE_T*9($a0)
411	$PUSH	r24,-$SIZE_T*8($a0)
412	$PUSH	r25,-$SIZE_T*7($a0)
413	$PUSH	r26,-$SIZE_T*6($a0)
414	$PUSH	r27,-$SIZE_T*5($a0)
415	$PUSH	r28,-$SIZE_T*4($a0)
416	$PUSH	r29,-$SIZE_T*3($a0)
417	$PUSH	r30,-$SIZE_T*2($a0)
418	$PUSH	r31,-$SIZE_T*1($a0)
419
420	subi	$ap,$ap,$SIZE_T		# bias by -1
421	subi	$np,$np,$SIZE_T		# bias by -1
422	subi	$rp,$rp,$SIZE_T		# bias by -1
423	$LD	$n0,0($n0)		# *n0
424
425	add	$t0,$bp,$num
426	add	$ap_end,$ap,$num
427	subi	$t0,$t0,$SIZE_T*4	# &b[num-4]
428
429	$LD	$bi,$SIZE_T*0($bp)	# b[0]
430	li	$acc0,0
431	$LD	$a0,$SIZE_T*1($ap)	# a[0..3]
432	li	$acc1,0
433	$LD	$a1,$SIZE_T*2($ap)
434	li	$acc2,0
435	$LD	$a2,$SIZE_T*3($ap)
436	li	$acc3,0
437	$LDU	$a3,$SIZE_T*4($ap)
438	$LD	$m0,$SIZE_T*1($np)	# n[0..3]
439	$LD	$m1,$SIZE_T*2($np)
440	$LD	$m2,$SIZE_T*3($np)
441	$LDU	$m3,$SIZE_T*4($np)
442
443	$PUSH	$rp,$SIZE_T*6($sp)	# offload rp and &b[num-4]
444	$PUSH	$t0,$SIZE_T*7($sp)
445	li	$carry,0
446	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
447	li	$cnt,0
448	li	$zero,0
449	b	.Loop_mul4x_1st_reduction
450
451.align	5
452.Loop_mul4x_1st_reduction:
453	$UMULL	$t0,$a0,$bi		# lo(a[0..3]*b[0])
454	addze	$carry,$carry		# modulo-scheduled
455	$UMULL	$t1,$a1,$bi
456	addi	$cnt,$cnt,$SIZE_T
457	$UMULL	$t2,$a2,$bi
458	andi.	$cnt,$cnt,$SIZE_T*4-1
459	$UMULL	$t3,$a3,$bi
460	addc	$acc0,$acc0,$t0
461	$UMULH	$t0,$a0,$bi		# hi(a[0..3]*b[0])
462	adde	$acc1,$acc1,$t1
463	$UMULH	$t1,$a1,$bi
464	adde	$acc2,$acc2,$t2
465	$UMULL	$mi,$acc0,$n0		# t[0]*n0
466	adde	$acc3,$acc3,$t3
467	$UMULH	$t2,$a2,$bi
468	addze	$acc4,$zero
469	$UMULH	$t3,$a3,$bi
470	$LDX	$bi,$bp,$cnt		# next b[i] (or b[0])
471	addc	$acc1,$acc1,$t0
472	# (*)	mul	$t0,$m0,$mi	# lo(n[0..3]*t[0]*n0)
473	$STU	$mi,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
474	adde	$acc2,$acc2,$t1
475	$UMULL	$t1,$m1,$mi
476	adde	$acc3,$acc3,$t2
477	$UMULL	$t2,$m2,$mi
478	adde	$acc4,$acc4,$t3		# can't overflow
479	$UMULL	$t3,$m3,$mi
480	# (*)	addc	$acc0,$acc0,$t0
481	# (*)	As for removal of first multiplication and addition
482	#	instructions. The outcome of first addition is
483	#	guaranteed to be zero, which leaves two computationally
484	#	significant outcomes: it either carries or not. Then
485	#	question is when does it carry? Is there alternative
486	#	way to deduce it? If you follow operations, you can
487	#	observe that condition for carry is quite simple:
488	#	$acc0 being non-zero. So that carry can be calculated
489	#	by adding -1 to $acc0. That's what next instruction does.
490	addic	$acc0,$acc0,-1		# (*), discarded
491	$UMULH	$t0,$m0,$mi		# hi(n[0..3]*t[0]*n0)
492	adde	$acc0,$acc1,$t1
493	$UMULH	$t1,$m1,$mi
494	adde	$acc1,$acc2,$t2
495	$UMULH	$t2,$m2,$mi
496	adde	$acc2,$acc3,$t3
497	$UMULH	$t3,$m3,$mi
498	adde	$acc3,$acc4,$carry
499	addze	$carry,$zero
500	addc	$acc0,$acc0,$t0
501	adde	$acc1,$acc1,$t1
502	adde	$acc2,$acc2,$t2
503	adde	$acc3,$acc3,$t3
504	#addze	$carry,$carry
505	bne	.Loop_mul4x_1st_reduction
506
507	$UCMP	$ap_end,$ap
508	beq	.Lmul4x4_post_condition
509
510	$LD	$a0,$SIZE_T*1($ap)	# a[4..7]
511	$LD	$a1,$SIZE_T*2($ap)
512	$LD	$a2,$SIZE_T*3($ap)
513	$LDU	$a3,$SIZE_T*4($ap)
514	$LD	$mi,$SIZE_T*8($sp)	# a[0]*n0
515	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
516	$LD	$m1,$SIZE_T*2($np)
517	$LD	$m2,$SIZE_T*3($np)
518	$LDU	$m3,$SIZE_T*4($np)
519	b	.Loop_mul4x_1st_tail
520
521.align	5
522.Loop_mul4x_1st_tail:
523	$UMULL	$t0,$a0,$bi		# lo(a[4..7]*b[i])
524	addze	$carry,$carry		# modulo-scheduled
525	$UMULL	$t1,$a1,$bi
526	addi	$cnt,$cnt,$SIZE_T
527	$UMULL	$t2,$a2,$bi
528	andi.	$cnt,$cnt,$SIZE_T*4-1
529	$UMULL	$t3,$a3,$bi
530	addc	$acc0,$acc0,$t0
531	$UMULH	$t0,$a0,$bi		# hi(a[4..7]*b[i])
532	adde	$acc1,$acc1,$t1
533	$UMULH	$t1,$a1,$bi
534	adde	$acc2,$acc2,$t2
535	$UMULH	$t2,$a2,$bi
536	adde	$acc3,$acc3,$t3
537	$UMULH	$t3,$a3,$bi
538	addze	$acc4,$zero
539	$LDX	$bi,$bp,$cnt		# next b[i] (or b[0])
540	addc	$acc1,$acc1,$t0
541	$UMULL	$t0,$m0,$mi		# lo(n[4..7]*a[0]*n0)
542	adde	$acc2,$acc2,$t1
543	$UMULL	$t1,$m1,$mi
544	adde	$acc3,$acc3,$t2
545	$UMULL	$t2,$m2,$mi
546	adde	$acc4,$acc4,$t3		# can't overflow
547	$UMULL	$t3,$m3,$mi
548	addc	$acc0,$acc0,$t0
549	$UMULH	$t0,$m0,$mi		# hi(n[4..7]*a[0]*n0)
550	adde	$acc1,$acc1,$t1
551	$UMULH	$t1,$m1,$mi
552	adde	$acc2,$acc2,$t2
553	$UMULH	$t2,$m2,$mi
554	adde	$acc3,$acc3,$t3
555	adde	$acc4,$acc4,$carry
556	$UMULH	$t3,$m3,$mi
557	addze	$carry,$zero
558	addi	$mi,$sp,$SIZE_T*8
559	$LDX	$mi,$mi,$cnt		# next t[0]*n0
560	$STU	$acc0,$SIZE_T($tp)	# word of result
561	addc	$acc0,$acc1,$t0
562	adde	$acc1,$acc2,$t1
563	adde	$acc2,$acc3,$t2
564	adde	$acc3,$acc4,$t3
565	#addze	$carry,$carry
566	bne	.Loop_mul4x_1st_tail
567
568	sub	$t1,$ap_end,$num	# rewinded $ap
569	$UCMP	$ap_end,$ap		# done yet?
570	beq	.Lmul4x_proceed
571
572	$LD	$a0,$SIZE_T*1($ap)
573	$LD	$a1,$SIZE_T*2($ap)
574	$LD	$a2,$SIZE_T*3($ap)
575	$LDU	$a3,$SIZE_T*4($ap)
576	$LD	$m0,$SIZE_T*1($np)
577	$LD	$m1,$SIZE_T*2($np)
578	$LD	$m2,$SIZE_T*3($np)
579	$LDU	$m3,$SIZE_T*4($np)
580	b	.Loop_mul4x_1st_tail
581
582.align	5
583.Lmul4x_proceed:
584	$LDU	$bi,$SIZE_T*4($bp)	# *++b
585	addze	$carry,$carry		# topmost carry
586	$LD	$a0,$SIZE_T*1($t1)
587	$LD	$a1,$SIZE_T*2($t1)
588	$LD	$a2,$SIZE_T*3($t1)
589	$LD	$a3,$SIZE_T*4($t1)
590	addi	$ap,$t1,$SIZE_T*4
591	sub	$np,$np,$num		# rewind np
592
593	$ST	$acc0,$SIZE_T*1($tp)	# result
594	$ST	$acc1,$SIZE_T*2($tp)
595	$ST	$acc2,$SIZE_T*3($tp)
596	$ST	$acc3,$SIZE_T*4($tp)
597	$ST	$carry,$SIZE_T*5($tp)	# save topmost carry
598	$LD	$acc0,$SIZE_T*12($sp)	# t[0..3]
599	$LD	$acc1,$SIZE_T*13($sp)
600	$LD	$acc2,$SIZE_T*14($sp)
601	$LD	$acc3,$SIZE_T*15($sp)
602
603	$LD	$m0,$SIZE_T*1($np)	# n[0..3]
604	$LD	$m1,$SIZE_T*2($np)
605	$LD	$m2,$SIZE_T*3($np)
606	$LDU	$m3,$SIZE_T*4($np)
607	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
608	li	$carry,0
609	b	.Loop_mul4x_reduction
610
611.align	5
612.Loop_mul4x_reduction:
613	$UMULL	$t0,$a0,$bi		# lo(a[0..3]*b[4])
614	addze	$carry,$carry		# modulo-scheduled
615	$UMULL	$t1,$a1,$bi
616	addi	$cnt,$cnt,$SIZE_T
617	$UMULL	$t2,$a2,$bi
618	andi.	$cnt,$cnt,$SIZE_T*4-1
619	$UMULL	$t3,$a3,$bi
620	addc	$acc0,$acc0,$t0
621	$UMULH	$t0,$a0,$bi		# hi(a[0..3]*b[4])
622	adde	$acc1,$acc1,$t1
623	$UMULH	$t1,$a1,$bi
624	adde	$acc2,$acc2,$t2
625	$UMULL	$mi,$acc0,$n0		# t[0]*n0
626	adde	$acc3,$acc3,$t3
627	$UMULH	$t2,$a2,$bi
628	addze	$acc4,$zero
629	$UMULH	$t3,$a3,$bi
630	$LDX	$bi,$bp,$cnt		# next b[i]
631	addc	$acc1,$acc1,$t0
632	# (*)	mul	$t0,$m0,$mi
633	$STU	$mi,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
634	adde	$acc2,$acc2,$t1
635	$UMULL	$t1,$m1,$mi		# lo(n[0..3]*t[0]*n0
636	adde	$acc3,$acc3,$t2
637	$UMULL	$t2,$m2,$mi
638	adde	$acc4,$acc4,$t3		# can't overflow
639	$UMULL	$t3,$m3,$mi
640	# (*)	addc	$acc0,$acc0,$t0
641	addic	$acc0,$acc0,-1		# (*), discarded
642	$UMULH	$t0,$m0,$mi		# hi(n[0..3]*t[0]*n0
643	adde	$acc0,$acc1,$t1
644	$UMULH	$t1,$m1,$mi
645	adde	$acc1,$acc2,$t2
646	$UMULH	$t2,$m2,$mi
647	adde	$acc2,$acc3,$t3
648	$UMULH	$t3,$m3,$mi
649	adde	$acc3,$acc4,$carry
650	addze	$carry,$zero
651	addc	$acc0,$acc0,$t0
652	adde	$acc1,$acc1,$t1
653	adde	$acc2,$acc2,$t2
654	adde	$acc3,$acc3,$t3
655	#addze	$carry,$carry
656	bne	.Loop_mul4x_reduction
657
658	$LD	$t0,$SIZE_T*5($tp)	# t[4..7]
659	addze	$carry,$carry
660	$LD	$t1,$SIZE_T*6($tp)
661	$LD	$t2,$SIZE_T*7($tp)
662	$LD	$t3,$SIZE_T*8($tp)
663	$LD	$a0,$SIZE_T*1($ap)	# a[4..7]
664	$LD	$a1,$SIZE_T*2($ap)
665	$LD	$a2,$SIZE_T*3($ap)
666	$LDU	$a3,$SIZE_T*4($ap)
667	addc	$acc0,$acc0,$t0
668	adde	$acc1,$acc1,$t1
669	adde	$acc2,$acc2,$t2
670	adde	$acc3,$acc3,$t3
671	#addze	$carry,$carry
672
673	$LD	$mi,$SIZE_T*8($sp)	# t[0]*n0
674	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
675	$LD	$m1,$SIZE_T*2($np)
676	$LD	$m2,$SIZE_T*3($np)
677	$LDU	$m3,$SIZE_T*4($np)
678	b	.Loop_mul4x_tail
679
680.align	5
681.Loop_mul4x_tail:
682	$UMULL	$t0,$a0,$bi		# lo(a[4..7]*b[4])
683	addze	$carry,$carry		# modulo-scheduled
684	$UMULL	$t1,$a1,$bi
685	addi	$cnt,$cnt,$SIZE_T
686	$UMULL	$t2,$a2,$bi
687	andi.	$cnt,$cnt,$SIZE_T*4-1
688	$UMULL	$t3,$a3,$bi
689	addc	$acc0,$acc0,$t0
690	$UMULH	$t0,$a0,$bi		# hi(a[4..7]*b[4])
691	adde	$acc1,$acc1,$t1
692	$UMULH	$t1,$a1,$bi
693	adde	$acc2,$acc2,$t2
694	$UMULH	$t2,$a2,$bi
695	adde	$acc3,$acc3,$t3
696	$UMULH	$t3,$a3,$bi
697	addze	$acc4,$zero
698	$LDX	$bi,$bp,$cnt		# next b[i]
699	addc	$acc1,$acc1,$t0
700	$UMULL	$t0,$m0,$mi		# lo(n[4..7]*t[0]*n0)
701	adde	$acc2,$acc2,$t1
702	$UMULL	$t1,$m1,$mi
703	adde	$acc3,$acc3,$t2
704	$UMULL	$t2,$m2,$mi
705	adde	$acc4,$acc4,$t3		# can't overflow
706	$UMULL	$t3,$m3,$mi
707	addc	$acc0,$acc0,$t0
708	$UMULH	$t0,$m0,$mi		# hi(n[4..7]*t[0]*n0)
709	adde	$acc1,$acc1,$t1
710	$UMULH	$t1,$m1,$mi
711	adde	$acc2,$acc2,$t2
712	$UMULH	$t2,$m2,$mi
713	adde	$acc3,$acc3,$t3
714	$UMULH	$t3,$m3,$mi
715	adde	$acc4,$acc4,$carry
716	addi	$mi,$sp,$SIZE_T*8
717	$LDX	$mi,$mi,$cnt		# next a[0]*n0
718	addze	$carry,$zero
719	$STU	$acc0,$SIZE_T($tp)	# word of result
720	addc	$acc0,$acc1,$t0
721	adde	$acc1,$acc2,$t1
722	adde	$acc2,$acc3,$t2
723	adde	$acc3,$acc4,$t3
724	#addze	$carry,$carry
725	bne	.Loop_mul4x_tail
726
727	$LD	$t0,$SIZE_T*5($tp)	# next t[i] or topmost carry
728	sub	$t1,$np,$num		# rewinded np?
729	addze	$carry,$carry
730	$UCMP	$ap_end,$ap		# done yet?
731	beq	.Loop_mul4x_break
732
733	$LD	$t1,$SIZE_T*6($tp)
734	$LD	$t2,$SIZE_T*7($tp)
735	$LD	$t3,$SIZE_T*8($tp)
736	$LD	$a0,$SIZE_T*1($ap)
737	$LD	$a1,$SIZE_T*2($ap)
738	$LD	$a2,$SIZE_T*3($ap)
739	$LDU	$a3,$SIZE_T*4($ap)
740	addc	$acc0,$acc0,$t0
741	adde	$acc1,$acc1,$t1
742	adde	$acc2,$acc2,$t2
743	adde	$acc3,$acc3,$t3
744	#addze	$carry,$carry
745
746	$LD	$m0,$SIZE_T*1($np)	# n[4..7]
747	$LD	$m1,$SIZE_T*2($np)
748	$LD	$m2,$SIZE_T*3($np)
749	$LDU	$m3,$SIZE_T*4($np)
750	b	.Loop_mul4x_tail
751
752.align	5
753.Loop_mul4x_break:
754	$POP	$t2,$SIZE_T*6($sp)	# pull rp and &b[num-4]
755	$POP	$t3,$SIZE_T*7($sp)
756	addc	$a0,$acc0,$t0		# accumulate topmost carry
757	$LD	$acc0,$SIZE_T*12($sp)	# t[0..3]
758	addze	$a1,$acc1
759	$LD	$acc1,$SIZE_T*13($sp)
760	addze	$a2,$acc2
761	$LD	$acc2,$SIZE_T*14($sp)
762	addze	$a3,$acc3
763	$LD	$acc3,$SIZE_T*15($sp)
764	addze	$carry,$carry		# topmost carry
765	$ST	$a0,$SIZE_T*1($tp)	# result
766	sub	$ap,$ap_end,$num	# rewind ap
767	$ST	$a1,$SIZE_T*2($tp)
768	$ST	$a2,$SIZE_T*3($tp)
769	$ST	$a3,$SIZE_T*4($tp)
770	$ST	$carry,$SIZE_T*5($tp)	# store topmost carry
771
772	$LD	$m0,$SIZE_T*1($t1)	# n[0..3]
773	$LD	$m1,$SIZE_T*2($t1)
774	$LD	$m2,$SIZE_T*3($t1)
775	$LD	$m3,$SIZE_T*4($t1)
776	addi	$np,$t1,$SIZE_T*4
777	$UCMP	$bp,$t3			# done yet?
778	beq	.Lmul4x_post
779
780	$LDU	$bi,$SIZE_T*4($bp)
781	$LD	$a0,$SIZE_T*1($ap)	# a[0..3]
782	$LD	$a1,$SIZE_T*2($ap)
783	$LD	$a2,$SIZE_T*3($ap)
784	$LDU	$a3,$SIZE_T*4($ap)
785	li	$carry,0
786	addic	$tp,$sp,$SIZE_T*7	# &t[-1], clear carry bit
787	b	.Loop_mul4x_reduction
788
789.align	5
790.Lmul4x_post:
791	# Final step. We see if result is larger than modulus, and
792	# if it is, subtract the modulus. But comparison implies
793	# subtraction. So we subtract modulus, see if it borrowed,
794	# and conditionally copy original value.
795	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
796	mr	$bp,$t2			# &rp[-1]
797	subi	$cnt,$cnt,1
798	mr	$ap_end,$t2		# &rp[-1] copy
799	subfc	$t0,$m0,$acc0
800	addi	$tp,$sp,$SIZE_T*15
801	subfe	$t1,$m1,$acc1
802
803	mtctr	$cnt
804.Lmul4x_sub:
805	$LD	$m0,$SIZE_T*1($np)
806	$LD	$acc0,$SIZE_T*1($tp)
807	subfe	$t2,$m2,$acc2
808	$LD	$m1,$SIZE_T*2($np)
809	$LD	$acc1,$SIZE_T*2($tp)
810	subfe	$t3,$m3,$acc3
811	$LD	$m2,$SIZE_T*3($np)
812	$LD	$acc2,$SIZE_T*3($tp)
813	$LDU	$m3,$SIZE_T*4($np)
814	$LDU	$acc3,$SIZE_T*4($tp)
815	$ST	$t0,$SIZE_T*1($bp)
816	$ST	$t1,$SIZE_T*2($bp)
817	subfe	$t0,$m0,$acc0
818	$ST	$t2,$SIZE_T*3($bp)
819	$STU	$t3,$SIZE_T*4($bp)
820	subfe	$t1,$m1,$acc1
821	bdnz	.Lmul4x_sub
822
823	 $LD	$a0,$SIZE_T*1($ap_end)
824	$ST	$t0,$SIZE_T*1($bp)
825	 $LD	$t0,$SIZE_T*12($sp)
826	subfe	$t2,$m2,$acc2
827	 $LD	$a1,$SIZE_T*2($ap_end)
828	$ST	$t1,$SIZE_T*2($bp)
829	 $LD	$t1,$SIZE_T*13($sp)
830	subfe	$t3,$m3,$acc3
831	subfe	$carry,$zero,$carry	# did it borrow?
832	 addi	$tp,$sp,$SIZE_T*12
833	 $LD	$a2,$SIZE_T*3($ap_end)
834	$ST	$t2,$SIZE_T*3($bp)
835	 $LD	$t2,$SIZE_T*14($sp)
836	 $LD	$a3,$SIZE_T*4($ap_end)
837	$ST	$t3,$SIZE_T*4($bp)
838	 $LD	$t3,$SIZE_T*15($sp)
839
840	mtctr	$cnt
841.Lmul4x_cond_copy:
842	and	$t0,$t0,$carry
843	andc	$a0,$a0,$carry
844	$ST	$zero,$SIZE_T*0($tp)	# wipe stack clean
845	and	$t1,$t1,$carry
846	andc	$a1,$a1,$carry
847	$ST	$zero,$SIZE_T*1($tp)
848	and	$t2,$t2,$carry
849	andc	$a2,$a2,$carry
850	$ST	$zero,$SIZE_T*2($tp)
851	and	$t3,$t3,$carry
852	andc	$a3,$a3,$carry
853	$ST	$zero,$SIZE_T*3($tp)
854	or	$acc0,$t0,$a0
855	$LD	$a0,$SIZE_T*5($ap_end)
856	$LD	$t0,$SIZE_T*4($tp)
857	or	$acc1,$t1,$a1
858	$LD	$a1,$SIZE_T*6($ap_end)
859	$LD	$t1,$SIZE_T*5($tp)
860	or	$acc2,$t2,$a2
861	$LD	$a2,$SIZE_T*7($ap_end)
862	$LD	$t2,$SIZE_T*6($tp)
863	or	$acc3,$t3,$a3
864	$LD	$a3,$SIZE_T*8($ap_end)
865	$LD	$t3,$SIZE_T*7($tp)
866	addi	$tp,$tp,$SIZE_T*4
867	$ST	$acc0,$SIZE_T*1($ap_end)
868	$ST	$acc1,$SIZE_T*2($ap_end)
869	$ST	$acc2,$SIZE_T*3($ap_end)
870	$STU	$acc3,$SIZE_T*4($ap_end)
871	bdnz	.Lmul4x_cond_copy
872
873	$POP	$bp,0($sp)		# pull saved sp
874	and	$t0,$t0,$carry
875	andc	$a0,$a0,$carry
876	$ST	$zero,$SIZE_T*0($tp)
877	and	$t1,$t1,$carry
878	andc	$a1,$a1,$carry
879	$ST	$zero,$SIZE_T*1($tp)
880	and	$t2,$t2,$carry
881	andc	$a2,$a2,$carry
882	$ST	$zero,$SIZE_T*2($tp)
883	and	$t3,$t3,$carry
884	andc	$a3,$a3,$carry
885	$ST	$zero,$SIZE_T*3($tp)
886	or	$acc0,$t0,$a0
887	or	$acc1,$t1,$a1
888	$ST	$zero,$SIZE_T*4($tp)
889	or	$acc2,$t2,$a2
890	or	$acc3,$t3,$a3
891	$ST	$acc0,$SIZE_T*1($ap_end)
892	$ST	$acc1,$SIZE_T*2($ap_end)
893	$ST	$acc2,$SIZE_T*3($ap_end)
894	$ST	$acc3,$SIZE_T*4($ap_end)
895
896	b	.Lmul4x_done
897
898.align	4
899.Lmul4x4_post_condition:
900	$POP	$ap,$SIZE_T*6($sp)	# pull &rp[-1]
901	$POP	$bp,0($sp)		# pull saved sp
902	addze	$carry,$carry		# modulo-scheduled
903	# $acc0-3,$carry hold result, $m0-3 hold modulus
904	subfc	$a0,$m0,$acc0
905	subfe	$a1,$m1,$acc1
906	subfe	$a2,$m2,$acc2
907	subfe	$a3,$m3,$acc3
908	subfe	$carry,$zero,$carry	# did it borrow?
909
910	and	$m0,$m0,$carry
911	and	$m1,$m1,$carry
912	addc	$a0,$a0,$m0
913	and	$m2,$m2,$carry
914	adde	$a1,$a1,$m1
915	and	$m3,$m3,$carry
916	adde	$a2,$a2,$m2
917	adde	$a3,$a3,$m3
918
919	$ST	$a0,$SIZE_T*1($ap)	# write result
920	$ST	$a1,$SIZE_T*2($ap)
921	$ST	$a2,$SIZE_T*3($ap)
922	$ST	$a3,$SIZE_T*4($ap)
923
924.Lmul4x_done:
925	$ST	$zero,$SIZE_T*8($sp)	# wipe stack clean
926	$ST	$zero,$SIZE_T*9($sp)
927	$ST	$zero,$SIZE_T*10($sp)
928	$ST	$zero,$SIZE_T*11($sp)
929	li	r3,1			# signal "done"
930	$POP	r14,-$SIZE_T*18($bp)
931	$POP	r15,-$SIZE_T*17($bp)
932	$POP	r16,-$SIZE_T*16($bp)
933	$POP	r17,-$SIZE_T*15($bp)
934	$POP	r18,-$SIZE_T*14($bp)
935	$POP	r19,-$SIZE_T*13($bp)
936	$POP	r20,-$SIZE_T*12($bp)
937	$POP	r21,-$SIZE_T*11($bp)
938	$POP	r22,-$SIZE_T*10($bp)
939	$POP	r23,-$SIZE_T*9($bp)
940	$POP	r24,-$SIZE_T*8($bp)
941	$POP	r25,-$SIZE_T*7($bp)
942	$POP	r26,-$SIZE_T*6($bp)
943	$POP	r27,-$SIZE_T*5($bp)
944	$POP	r28,-$SIZE_T*4($bp)
945	$POP	r29,-$SIZE_T*3($bp)
946	$POP	r30,-$SIZE_T*2($bp)
947	$POP	r31,-$SIZE_T*1($bp)
948	mr	$sp,$bp
949	blr
950	.long	0
951	.byte	0,12,4,0x20,0x80,18,6,0
952	.long	0
953.size	.bn_mul4x_mont_int,.-.bn_mul4x_mont_int
954___
955}
956
957if (1) {
958########################################################################
959# Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module.
960
961my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17));
962my ($t0,$t1,$t2,$t3)=map("r$_",(18..21));
963my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29));
964my ($cnt,$carry,$zero)=("r30","r31","r0");
965my ($tp,$ap_end,$na0)=($bp,$np,$carry);
966
967# sp----------->+-------------------------------+
968#		| saved sp			|
969#		+-------------------------------+
970#		.				.
971# +12*size_t	+-------------------------------+
972#		| size_t tmp[2*num]		|
973#		.				.
974#		.				.
975#		.				.
976#		+-------------------------------+
977#		.				.
978# -18*size_t	+-------------------------------+
979#		| 18 saved gpr, r14-r31		|
980#		.				.
981#		.				.
982#		+-------------------------------+
983$code.=<<___;
984.align	5
985__bn_sqr8x_mont:
986.Lsqr8x_do:
987	mr	$a0,$sp
988	slwi	$a1,$num,`log($SIZE_T)/log(2)+1`
989	li	$a2,-32*$SIZE_T
990	sub	$a1,$a2,$a1
991	slwi	$num,$num,`log($SIZE_T)/log(2)`
992	$STUX	$sp,$sp,$a1		# alloca
993
994	$PUSH	r14,-$SIZE_T*18($a0)
995	$PUSH	r15,-$SIZE_T*17($a0)
996	$PUSH	r16,-$SIZE_T*16($a0)
997	$PUSH	r17,-$SIZE_T*15($a0)
998	$PUSH	r18,-$SIZE_T*14($a0)
999	$PUSH	r19,-$SIZE_T*13($a0)
1000	$PUSH	r20,-$SIZE_T*12($a0)
1001	$PUSH	r21,-$SIZE_T*11($a0)
1002	$PUSH	r22,-$SIZE_T*10($a0)
1003	$PUSH	r23,-$SIZE_T*9($a0)
1004	$PUSH	r24,-$SIZE_T*8($a0)
1005	$PUSH	r25,-$SIZE_T*7($a0)
1006	$PUSH	r26,-$SIZE_T*6($a0)
1007	$PUSH	r27,-$SIZE_T*5($a0)
1008	$PUSH	r28,-$SIZE_T*4($a0)
1009	$PUSH	r29,-$SIZE_T*3($a0)
1010	$PUSH	r30,-$SIZE_T*2($a0)
1011	$PUSH	r31,-$SIZE_T*1($a0)
1012
1013	subi	$ap,$ap,$SIZE_T		# bias by -1
1014	subi	$t0,$np,$SIZE_T		# bias by -1
1015	subi	$rp,$rp,$SIZE_T		# bias by -1
1016	$LD	$n0,0($n0)		# *n0
1017	li	$zero,0
1018
1019	add	$ap_end,$ap,$num
1020	$LD	$a0,$SIZE_T*1($ap)
1021	#li	$acc0,0
1022	$LD	$a1,$SIZE_T*2($ap)
1023	li	$acc1,0
1024	$LD	$a2,$SIZE_T*3($ap)
1025	li	$acc2,0
1026	$LD	$a3,$SIZE_T*4($ap)
1027	li	$acc3,0
1028	$LD	$a4,$SIZE_T*5($ap)
1029	li	$acc4,0
1030	$LD	$a5,$SIZE_T*6($ap)
1031	li	$acc5,0
1032	$LD	$a6,$SIZE_T*7($ap)
1033	li	$acc6,0
1034	$LDU	$a7,$SIZE_T*8($ap)
1035	li	$acc7,0
1036
1037	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1038	subic.	$cnt,$num,$SIZE_T*8
1039	b	.Lsqr8x_zero_start
1040
1041.align	5
1042.Lsqr8x_zero:
1043	subic.	$cnt,$cnt,$SIZE_T*8
1044	$ST	$zero,$SIZE_T*1($tp)
1045	$ST	$zero,$SIZE_T*2($tp)
1046	$ST	$zero,$SIZE_T*3($tp)
1047	$ST	$zero,$SIZE_T*4($tp)
1048	$ST	$zero,$SIZE_T*5($tp)
1049	$ST	$zero,$SIZE_T*6($tp)
1050	$ST	$zero,$SIZE_T*7($tp)
1051	$ST	$zero,$SIZE_T*8($tp)
1052.Lsqr8x_zero_start:
1053	$ST	$zero,$SIZE_T*9($tp)
1054	$ST	$zero,$SIZE_T*10($tp)
1055	$ST	$zero,$SIZE_T*11($tp)
1056	$ST	$zero,$SIZE_T*12($tp)
1057	$ST	$zero,$SIZE_T*13($tp)
1058	$ST	$zero,$SIZE_T*14($tp)
1059	$ST	$zero,$SIZE_T*15($tp)
1060	$STU	$zero,$SIZE_T*16($tp)
1061	bne	.Lsqr8x_zero
1062
1063	$PUSH	$rp,$SIZE_T*6($sp)	# offload &rp[-1]
1064	$PUSH	$t0,$SIZE_T*7($sp)	# offload &np[-1]
1065	$PUSH	$n0,$SIZE_T*8($sp)	# offload n0
1066	$PUSH	$tp,$SIZE_T*9($sp)	# &tp[2*num-1]
1067	$PUSH	$zero,$SIZE_T*10($sp)	# initial top-most carry
1068	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1069
1070	# Multiply everything but a[i]*a[i]
1071.align	5
1072.Lsqr8x_outer_loop:
1073	#						  a[1]a[0]     (i)
1074	#					      a[2]a[0]
1075	#					  a[3]a[0]
1076	#				      a[4]a[0]
1077	#				  a[5]a[0]
1078	#			      a[6]a[0]
1079	#			  a[7]a[0]
1080	#					  a[2]a[1]	       (ii)
1081	#				      a[3]a[1]
1082	#				  a[4]a[1]
1083	#			      a[5]a[1]
1084	#			  a[6]a[1]
1085	#		      a[7]a[1]
1086	#				  a[3]a[2]		       (iii)
1087	#			      a[4]a[2]
1088	#			  a[5]a[2]
1089	#		      a[6]a[2]
1090	#		  a[7]a[2]
1091	#			  a[4]a[3]			       (iv)
1092	#		      a[5]a[3]
1093	#		  a[6]a[3]
1094	#	      a[7]a[3]
1095	#		  a[5]a[4]				       (v)
1096	#	      a[6]a[4]
1097	#	  a[7]a[4]
1098	#	  a[6]a[5]					       (vi)
1099	#     a[7]a[5]
1100	# a[7]a[6]						       (vii)
1101
1102	$UMULL	$t0,$a1,$a0		# lo(a[1..7]*a[0])		(i)
1103	$UMULL	$t1,$a2,$a0
1104	$UMULL	$t2,$a3,$a0
1105	$UMULL	$t3,$a4,$a0
1106	addc	$acc1,$acc1,$t0		# t[1]+lo(a[1]*a[0])
1107	$UMULL	$t0,$a5,$a0
1108	adde	$acc2,$acc2,$t1
1109	$UMULL	$t1,$a6,$a0
1110	adde	$acc3,$acc3,$t2
1111	$UMULL	$t2,$a7,$a0
1112	adde	$acc4,$acc4,$t3
1113	$UMULH	$t3,$a1,$a0		# hi(a[1..7]*a[0])
1114	adde	$acc5,$acc5,$t0
1115	$UMULH	$t0,$a2,$a0
1116	adde	$acc6,$acc6,$t1
1117	$UMULH	$t1,$a3,$a0
1118	adde	$acc7,$acc7,$t2
1119	$UMULH	$t2,$a4,$a0
1120	$ST	$acc0,$SIZE_T*1($tp)	# t[0]
1121	addze	$acc0,$zero		# t[8]
1122	$ST	$acc1,$SIZE_T*2($tp)	# t[1]
1123	addc	$acc2,$acc2,$t3		# t[2]+lo(a[1]*a[0])
1124	$UMULH	$t3,$a5,$a0
1125	adde	$acc3,$acc3,$t0
1126	$UMULH	$t0,$a6,$a0
1127	adde	$acc4,$acc4,$t1
1128	$UMULH	$t1,$a7,$a0
1129	adde	$acc5,$acc5,$t2
1130	 $UMULL	$t2,$a2,$a1		# lo(a[2..7]*a[1])		(ii)
1131	adde	$acc6,$acc6,$t3
1132	 $UMULL	$t3,$a3,$a1
1133	adde	$acc7,$acc7,$t0
1134	 $UMULL	$t0,$a4,$a1
1135	adde	$acc0,$acc0,$t1
1136
1137	$UMULL	$t1,$a5,$a1
1138	addc	$acc3,$acc3,$t2
1139	$UMULL	$t2,$a6,$a1
1140	adde	$acc4,$acc4,$t3
1141	$UMULL	$t3,$a7,$a1
1142	adde	$acc5,$acc5,$t0
1143	$UMULH	$t0,$a2,$a1		# hi(a[2..7]*a[1])
1144	adde	$acc6,$acc6,$t1
1145	$UMULH	$t1,$a3,$a1
1146	adde	$acc7,$acc7,$t2
1147	$UMULH	$t2,$a4,$a1
1148	adde	$acc0,$acc0,$t3
1149	$UMULH	$t3,$a5,$a1
1150	$ST	$acc2,$SIZE_T*3($tp)	# t[2]
1151	addze	$acc1,$zero		# t[9]
1152	$ST	$acc3,$SIZE_T*4($tp)	# t[3]
1153	addc	$acc4,$acc4,$t0
1154	$UMULH	$t0,$a6,$a1
1155	adde	$acc5,$acc5,$t1
1156	$UMULH	$t1,$a7,$a1
1157	adde	$acc6,$acc6,$t2
1158	 $UMULL	$t2,$a3,$a2		# lo(a[3..7]*a[2])		(iii)
1159	adde	$acc7,$acc7,$t3
1160	 $UMULL	$t3,$a4,$a2
1161	adde	$acc0,$acc0,$t0
1162	 $UMULL	$t0,$a5,$a2
1163	adde	$acc1,$acc1,$t1
1164
1165	$UMULL	$t1,$a6,$a2
1166	addc	$acc5,$acc5,$t2
1167	$UMULL	$t2,$a7,$a2
1168	adde	$acc6,$acc6,$t3
1169	$UMULH	$t3,$a3,$a2		# hi(a[3..7]*a[2])
1170	adde	$acc7,$acc7,$t0
1171	$UMULH	$t0,$a4,$a2
1172	adde	$acc0,$acc0,$t1
1173	$UMULH	$t1,$a5,$a2
1174	adde	$acc1,$acc1,$t2
1175	$UMULH	$t2,$a6,$a2
1176	$ST	$acc4,$SIZE_T*5($tp)	# t[4]
1177	addze	$acc2,$zero		# t[10]
1178	$ST	$acc5,$SIZE_T*6($tp)	# t[5]
1179	addc	$acc6,$acc6,$t3
1180	$UMULH	$t3,$a7,$a2
1181	adde	$acc7,$acc7,$t0
1182	 $UMULL	$t0,$a4,$a3		# lo(a[4..7]*a[3])		(iv)
1183	adde	$acc0,$acc0,$t1
1184	 $UMULL	$t1,$a5,$a3
1185	adde	$acc1,$acc1,$t2
1186	 $UMULL	$t2,$a6,$a3
1187	adde	$acc2,$acc2,$t3
1188
1189	$UMULL	$t3,$a7,$a3
1190	addc	$acc7,$acc7,$t0
1191	$UMULH	$t0,$a4,$a3		# hi(a[4..7]*a[3])
1192	adde	$acc0,$acc0,$t1
1193	$UMULH	$t1,$a5,$a3
1194	adde	$acc1,$acc1,$t2
1195	$UMULH	$t2,$a6,$a3
1196	adde	$acc2,$acc2,$t3
1197	$UMULH	$t3,$a7,$a3
1198	$ST	$acc6,$SIZE_T*7($tp)	# t[6]
1199	addze	$acc3,$zero		# t[11]
1200	$STU	$acc7,$SIZE_T*8($tp)	# t[7]
1201	addc	$acc0,$acc0,$t0
1202	 $UMULL	$t0,$a5,$a4		# lo(a[5..7]*a[4])		(v)
1203	adde	$acc1,$acc1,$t1
1204	 $UMULL	$t1,$a6,$a4
1205	adde	$acc2,$acc2,$t2
1206	 $UMULL	$t2,$a7,$a4
1207	adde	$acc3,$acc3,$t3
1208
1209	$UMULH	$t3,$a5,$a4		# hi(a[5..7]*a[4])
1210	addc	$acc1,$acc1,$t0
1211	$UMULH	$t0,$a6,$a4
1212	adde	$acc2,$acc2,$t1
1213	$UMULH	$t1,$a7,$a4
1214	adde	$acc3,$acc3,$t2
1215	 $UMULL	$t2,$a6,$a5		# lo(a[6..7]*a[5])		(vi)
1216	addze	$acc4,$zero		# t[12]
1217	addc	$acc2,$acc2,$t3
1218	 $UMULL	$t3,$a7,$a5
1219	adde	$acc3,$acc3,$t0
1220	 $UMULH	$t0,$a6,$a5		# hi(a[6..7]*a[5])
1221	adde	$acc4,$acc4,$t1
1222
1223	$UMULH	$t1,$a7,$a5
1224	addc	$acc3,$acc3,$t2
1225	 $UMULL	$t2,$a7,$a6		# lo(a[7]*a[6])			(vii)
1226	adde	$acc4,$acc4,$t3
1227	 $UMULH	$t3,$a7,$a6		# hi(a[7]*a[6])
1228	addze	$acc5,$zero		# t[13]
1229	addc	$acc4,$acc4,$t0
1230	$UCMP	$ap_end,$ap		# done yet?
1231	adde	$acc5,$acc5,$t1
1232
1233	addc	$acc5,$acc5,$t2
1234	sub	$t0,$ap_end,$num	# rewinded ap
1235	addze	$acc6,$zero		# t[14]
1236	add	$acc6,$acc6,$t3
1237
1238	beq	.Lsqr8x_outer_break
1239
1240	mr	$n0,$a0
1241	$LD	$a0,$SIZE_T*1($tp)
1242	$LD	$a1,$SIZE_T*2($tp)
1243	$LD	$a2,$SIZE_T*3($tp)
1244	$LD	$a3,$SIZE_T*4($tp)
1245	$LD	$a4,$SIZE_T*5($tp)
1246	$LD	$a5,$SIZE_T*6($tp)
1247	$LD	$a6,$SIZE_T*7($tp)
1248	$LD	$a7,$SIZE_T*8($tp)
1249	addc	$acc0,$acc0,$a0
1250	$LD	$a0,$SIZE_T*1($ap)
1251	adde	$acc1,$acc1,$a1
1252	$LD	$a1,$SIZE_T*2($ap)
1253	adde	$acc2,$acc2,$a2
1254	$LD	$a2,$SIZE_T*3($ap)
1255	adde	$acc3,$acc3,$a3
1256	$LD	$a3,$SIZE_T*4($ap)
1257	adde	$acc4,$acc4,$a4
1258	$LD	$a4,$SIZE_T*5($ap)
1259	adde	$acc5,$acc5,$a5
1260	$LD	$a5,$SIZE_T*6($ap)
1261	adde	$acc6,$acc6,$a6
1262	$LD	$a6,$SIZE_T*7($ap)
1263	subi	$rp,$ap,$SIZE_T*7
1264	addze	$acc7,$a7
1265	$LDU	$a7,$SIZE_T*8($ap)
1266	#addze	$carry,$zero		# moved below
1267	li	$cnt,0
1268	b	.Lsqr8x_mul
1269
1270	#                                                          a[8]a[0]
1271	#                                                      a[9]a[0]
1272	#                                                  a[a]a[0]
1273	#                                              a[b]a[0]
1274	#                                          a[c]a[0]
1275	#                                      a[d]a[0]
1276	#                                  a[e]a[0]
1277	#                              a[f]a[0]
1278	#                                                      a[8]a[1]
1279	#                          a[f]a[1]........................
1280	#                                                  a[8]a[2]
1281	#                      a[f]a[2]........................
1282	#                                              a[8]a[3]
1283	#                  a[f]a[3]........................
1284	#                                          a[8]a[4]
1285	#              a[f]a[4]........................
1286	#                                      a[8]a[5]
1287	#          a[f]a[5]........................
1288	#                                  a[8]a[6]
1289	#      a[f]a[6]........................
1290	#                              a[8]a[7]
1291	#  a[f]a[7]........................
1292.align	5
1293.Lsqr8x_mul:
1294	$UMULL	$t0,$a0,$n0
1295	addze	$carry,$zero		# carry bit, modulo-scheduled
1296	$UMULL	$t1,$a1,$n0
1297	addi	$cnt,$cnt,$SIZE_T
1298	$UMULL	$t2,$a2,$n0
1299	andi.	$cnt,$cnt,$SIZE_T*8-1
1300	$UMULL	$t3,$a3,$n0
1301	addc	$acc0,$acc0,$t0
1302	$UMULL	$t0,$a4,$n0
1303	adde	$acc1,$acc1,$t1
1304	$UMULL	$t1,$a5,$n0
1305	adde	$acc2,$acc2,$t2
1306	$UMULL	$t2,$a6,$n0
1307	adde	$acc3,$acc3,$t3
1308	$UMULL	$t3,$a7,$n0
1309	adde	$acc4,$acc4,$t0
1310	$UMULH	$t0,$a0,$n0
1311	adde	$acc5,$acc5,$t1
1312	$UMULH	$t1,$a1,$n0
1313	adde	$acc6,$acc6,$t2
1314	$UMULH	$t2,$a2,$n0
1315	adde	$acc7,$acc7,$t3
1316	$UMULH	$t3,$a3,$n0
1317	addze	$carry,$carry
1318	$STU	$acc0,$SIZE_T($tp)
1319	addc	$acc0,$acc1,$t0
1320	$UMULH	$t0,$a4,$n0
1321	adde	$acc1,$acc2,$t1
1322	$UMULH	$t1,$a5,$n0
1323	adde	$acc2,$acc3,$t2
1324	$UMULH	$t2,$a6,$n0
1325	adde	$acc3,$acc4,$t3
1326	$UMULH	$t3,$a7,$n0
1327	$LDX	$n0,$rp,$cnt
1328	adde	$acc4,$acc5,$t0
1329	adde	$acc5,$acc6,$t1
1330	adde	$acc6,$acc7,$t2
1331	adde	$acc7,$carry,$t3
1332	#addze	$carry,$zero		# moved above
1333	bne	.Lsqr8x_mul
1334					# note that carry flag is guaranteed
1335					# to be zero at this point
1336	$UCMP	$ap,$ap_end		# done yet?
1337	beq	.Lsqr8x_break
1338
1339	$LD	$a0,$SIZE_T*1($tp)
1340	$LD	$a1,$SIZE_T*2($tp)
1341	$LD	$a2,$SIZE_T*3($tp)
1342	$LD	$a3,$SIZE_T*4($tp)
1343	$LD	$a4,$SIZE_T*5($tp)
1344	$LD	$a5,$SIZE_T*6($tp)
1345	$LD	$a6,$SIZE_T*7($tp)
1346	$LD	$a7,$SIZE_T*8($tp)
1347	addc	$acc0,$acc0,$a0
1348	$LD	$a0,$SIZE_T*1($ap)
1349	adde	$acc1,$acc1,$a1
1350	$LD	$a1,$SIZE_T*2($ap)
1351	adde	$acc2,$acc2,$a2
1352	$LD	$a2,$SIZE_T*3($ap)
1353	adde	$acc3,$acc3,$a3
1354	$LD	$a3,$SIZE_T*4($ap)
1355	adde	$acc4,$acc4,$a4
1356	$LD	$a4,$SIZE_T*5($ap)
1357	adde	$acc5,$acc5,$a5
1358	$LD	$a5,$SIZE_T*6($ap)
1359	adde	$acc6,$acc6,$a6
1360	$LD	$a6,$SIZE_T*7($ap)
1361	adde	$acc7,$acc7,$a7
1362	$LDU	$a7,$SIZE_T*8($ap)
1363	#addze	$carry,$zero		# moved above
1364	b	.Lsqr8x_mul
1365
1366.align	5
1367.Lsqr8x_break:
1368	$LD	$a0,$SIZE_T*8($rp)
1369	addi	$ap,$rp,$SIZE_T*15
1370	$LD	$a1,$SIZE_T*9($rp)
1371	sub.	$t0,$ap_end,$ap		# is it last iteration?
1372	$LD	$a2,$SIZE_T*10($rp)
1373	sub	$t1,$tp,$t0
1374	$LD	$a3,$SIZE_T*11($rp)
1375	$LD	$a4,$SIZE_T*12($rp)
1376	$LD	$a5,$SIZE_T*13($rp)
1377	$LD	$a6,$SIZE_T*14($rp)
1378	$LD	$a7,$SIZE_T*15($rp)
1379	beq	.Lsqr8x_outer_loop
1380
1381	$ST	$acc0,$SIZE_T*1($tp)
1382	$LD	$acc0,$SIZE_T*1($t1)
1383	$ST	$acc1,$SIZE_T*2($tp)
1384	$LD	$acc1,$SIZE_T*2($t1)
1385	$ST	$acc2,$SIZE_T*3($tp)
1386	$LD	$acc2,$SIZE_T*3($t1)
1387	$ST	$acc3,$SIZE_T*4($tp)
1388	$LD	$acc3,$SIZE_T*4($t1)
1389	$ST	$acc4,$SIZE_T*5($tp)
1390	$LD	$acc4,$SIZE_T*5($t1)
1391	$ST	$acc5,$SIZE_T*6($tp)
1392	$LD	$acc5,$SIZE_T*6($t1)
1393	$ST	$acc6,$SIZE_T*7($tp)
1394	$LD	$acc6,$SIZE_T*7($t1)
1395	$ST	$acc7,$SIZE_T*8($tp)
1396	$LD	$acc7,$SIZE_T*8($t1)
1397	mr	$tp,$t1
1398	b	.Lsqr8x_outer_loop
1399
1400.align	5
1401.Lsqr8x_outer_break:
1402	####################################################################
1403	# Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1404	$LD	$a1,$SIZE_T*1($t0)	# recall that $t0 is &a[-1]
1405	$LD	$a3,$SIZE_T*2($t0)
1406	$LD	$a5,$SIZE_T*3($t0)
1407	$LD	$a7,$SIZE_T*4($t0)
1408	addi	$ap,$t0,$SIZE_T*4
1409					# "tp[x]" comments are for num==8 case
1410	$LD	$t1,$SIZE_T*13($sp)	# =tp[1], t[0] is not interesting
1411	$LD	$t2,$SIZE_T*14($sp)
1412	$LD	$t3,$SIZE_T*15($sp)
1413	$LD	$t0,$SIZE_T*16($sp)
1414
1415	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
1416	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
1417	$ST	$acc1,$SIZE_T*2($tp)
1418	subi	$cnt,$cnt,1
1419	$ST	$acc2,$SIZE_T*3($tp)
1420	$ST	$acc3,$SIZE_T*4($tp)
1421	$ST	$acc4,$SIZE_T*5($tp)
1422	$ST	$acc5,$SIZE_T*6($tp)
1423	$ST	$acc6,$SIZE_T*7($tp)
1424	#$ST	$acc7,$SIZE_T*8($tp)	# tp[15] is not interesting
1425	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1426	$UMULL	$acc0,$a1,$a1
1427	$UMULH	$a1,$a1,$a1
1428	add	$acc1,$t1,$t1		# <<1
1429	$SHRI	$t1,$t1,$BITS-1
1430	$UMULL	$a2,$a3,$a3
1431	$UMULH	$a3,$a3,$a3
1432	addc	$acc1,$acc1,$a1
1433	add	$acc2,$t2,$t2
1434	$SHRI	$t2,$t2,$BITS-1
1435	add	$acc3,$t3,$t3
1436	$SHRI	$t3,$t3,$BITS-1
1437	or	$acc2,$acc2,$t1
1438
1439	mtctr	$cnt
1440.Lsqr4x_shift_n_add:
1441	$UMULL	$a4,$a5,$a5
1442	$UMULH	$a5,$a5,$a5
1443	$LD	$t1,$SIZE_T*6($tp)	# =tp[5]
1444	$LD	$a1,$SIZE_T*1($ap)
1445	adde	$acc2,$acc2,$a2
1446	add	$acc4,$t0,$t0
1447	$SHRI	$t0,$t0,$BITS-1
1448	or	$acc3,$acc3,$t2
1449	$LD	$t2,$SIZE_T*7($tp)	# =tp[6]
1450	adde	$acc3,$acc3,$a3
1451	$LD	$a3,$SIZE_T*2($ap)
1452	add	$acc5,$t1,$t1
1453	$SHRI	$t1,$t1,$BITS-1
1454	or	$acc4,$acc4,$t3
1455	$LD	$t3,$SIZE_T*8($tp)	# =tp[7]
1456	$UMULL	$a6,$a7,$a7
1457	$UMULH	$a7,$a7,$a7
1458	adde	$acc4,$acc4,$a4
1459	add	$acc6,$t2,$t2
1460	$SHRI	$t2,$t2,$BITS-1
1461	or	$acc5,$acc5,$t0
1462	$LD	$t0,$SIZE_T*9($tp)	# =tp[8]
1463	adde	$acc5,$acc5,$a5
1464	$LD	$a5,$SIZE_T*3($ap)
1465	add	$acc7,$t3,$t3
1466	$SHRI	$t3,$t3,$BITS-1
1467	or	$acc6,$acc6,$t1
1468	$LD	$t1,$SIZE_T*10($tp)	# =tp[9]
1469	$UMULL	$a0,$a1,$a1
1470	$UMULH	$a1,$a1,$a1
1471	adde	$acc6,$acc6,$a6
1472	$ST	$acc0,$SIZE_T*1($tp)	# tp[0]=
1473	add	$acc0,$t0,$t0
1474	$SHRI	$t0,$t0,$BITS-1
1475	or	$acc7,$acc7,$t2
1476	$LD	$t2,$SIZE_T*11($tp)	# =tp[10]
1477	adde	$acc7,$acc7,$a7
1478	$LDU	$a7,$SIZE_T*4($ap)
1479	$ST	$acc1,$SIZE_T*2($tp)	# tp[1]=
1480	add	$acc1,$t1,$t1
1481	$SHRI	$t1,$t1,$BITS-1
1482	or	$acc0,$acc0,$t3
1483	$LD	$t3,$SIZE_T*12($tp)	# =tp[11]
1484	$UMULL	$a2,$a3,$a3
1485	$UMULH	$a3,$a3,$a3
1486	adde	$acc0,$acc0,$a0
1487	$ST	$acc2,$SIZE_T*3($tp)	# tp[2]=
1488	add	$acc2,$t2,$t2
1489	$SHRI	$t2,$t2,$BITS-1
1490	or	$acc1,$acc1,$t0
1491	$LD	$t0,$SIZE_T*13($tp)	# =tp[12]
1492	adde	$acc1,$acc1,$a1
1493	$ST	$acc3,$SIZE_T*4($tp)	# tp[3]=
1494	$ST	$acc4,$SIZE_T*5($tp)	# tp[4]=
1495	$ST	$acc5,$SIZE_T*6($tp)	# tp[5]=
1496	$ST	$acc6,$SIZE_T*7($tp)	# tp[6]=
1497	$STU	$acc7,$SIZE_T*8($tp)	# tp[7]=
1498	add	$acc3,$t3,$t3
1499	$SHRI	$t3,$t3,$BITS-1
1500	or	$acc2,$acc2,$t1
1501	bdnz	.Lsqr4x_shift_n_add
1502___
1503my ($np,$np_end)=($ap,$ap_end);
1504$code.=<<___;
1505	 $POP	$np,$SIZE_T*7($sp)	# pull &np[-1] and n0
1506	 $POP	$n0,$SIZE_T*8($sp)
1507
1508	$UMULL	$a4,$a5,$a5
1509	$UMULH	$a5,$a5,$a5
1510	$ST	$acc0,$SIZE_T*1($tp)	# tp[8]=
1511	 $LD	$acc0,$SIZE_T*12($sp)	# =tp[0]
1512	$LD	$t1,$SIZE_T*6($tp)	# =tp[13]
1513	adde	$acc2,$acc2,$a2
1514	add	$acc4,$t0,$t0
1515	$SHRI	$t0,$t0,$BITS-1
1516	or	$acc3,$acc3,$t2
1517	$LD	$t2,$SIZE_T*7($tp)	# =tp[14]
1518	adde	$acc3,$acc3,$a3
1519	add	$acc5,$t1,$t1
1520	$SHRI	$t1,$t1,$BITS-1
1521	or	$acc4,$acc4,$t3
1522	$UMULL	$a6,$a7,$a7
1523	$UMULH	$a7,$a7,$a7
1524	adde	$acc4,$acc4,$a4
1525	add	$acc6,$t2,$t2
1526	$SHRI	$t2,$t2,$BITS-1
1527	or	$acc5,$acc5,$t0
1528	$ST	$acc1,$SIZE_T*2($tp)	# tp[9]=
1529	 $LD	$acc1,$SIZE_T*13($sp)	# =tp[1]
1530	adde	$acc5,$acc5,$a5
1531	or	$acc6,$acc6,$t1
1532	 $LD	$a0,$SIZE_T*1($np)
1533	 $LD	$a1,$SIZE_T*2($np)
1534	adde	$acc6,$acc6,$a6
1535	 $LD	$a2,$SIZE_T*3($np)
1536	 $LD	$a3,$SIZE_T*4($np)
1537	adde	$acc7,$a7,$t2
1538	 $LD	$a4,$SIZE_T*5($np)
1539	 $LD	$a5,$SIZE_T*6($np)
1540
1541	################################################################
1542	# Reduce by 8 limbs per iteration
1543	$UMULL	$na0,$n0,$acc0		# t[0]*n0
1544	li	$cnt,8
1545	$LD	$a6,$SIZE_T*7($np)
1546	add	$np_end,$np,$num
1547	$LDU	$a7,$SIZE_T*8($np)
1548	$ST	$acc2,$SIZE_T*3($tp)	# tp[10]=
1549	$LD	$acc2,$SIZE_T*14($sp)
1550	$ST	$acc3,$SIZE_T*4($tp)	# tp[11]=
1551	$LD	$acc3,$SIZE_T*15($sp)
1552	$ST	$acc4,$SIZE_T*5($tp)	# tp[12]=
1553	$LD	$acc4,$SIZE_T*16($sp)
1554	$ST	$acc5,$SIZE_T*6($tp)	# tp[13]=
1555	$LD	$acc5,$SIZE_T*17($sp)
1556	$ST	$acc6,$SIZE_T*7($tp)	# tp[14]=
1557	$LD	$acc6,$SIZE_T*18($sp)
1558	$ST	$acc7,$SIZE_T*8($tp)	# tp[15]=
1559	$LD	$acc7,$SIZE_T*19($sp)
1560	addi	$tp,$sp,$SIZE_T*11	# &tp[-1]
1561	mtctr	$cnt
1562	b	.Lsqr8x_reduction
1563
1564.align	5
1565.Lsqr8x_reduction:
1566	# (*)	$UMULL	$t0,$a0,$na0	# lo(n[0-7])*lo(t[0]*n0)
1567	$UMULL	$t1,$a1,$na0
1568	$UMULL	$t2,$a2,$na0
1569	$STU	$na0,$SIZE_T($tp)	# put aside t[0]*n0 for tail processing
1570	$UMULL	$t3,$a3,$na0
1571	# (*)	addc	$acc0,$acc0,$t0
1572	addic	$acc0,$acc0,-1		# (*)
1573	$UMULL	$t0,$a4,$na0
1574	adde	$acc0,$acc1,$t1
1575	$UMULL	$t1,$a5,$na0
1576	adde	$acc1,$acc2,$t2
1577	$UMULL	$t2,$a6,$na0
1578	adde	$acc2,$acc3,$t3
1579	$UMULL	$t3,$a7,$na0
1580	adde	$acc3,$acc4,$t0
1581	$UMULH	$t0,$a0,$na0		# hi(n[0-7])*lo(t[0]*n0)
1582	adde	$acc4,$acc5,$t1
1583	$UMULH	$t1,$a1,$na0
1584	adde	$acc5,$acc6,$t2
1585	$UMULH	$t2,$a2,$na0
1586	adde	$acc6,$acc7,$t3
1587	$UMULH	$t3,$a3,$na0
1588	addze	$acc7,$zero
1589	addc	$acc0,$acc0,$t0
1590	$UMULH	$t0,$a4,$na0
1591	adde	$acc1,$acc1,$t1
1592	$UMULH	$t1,$a5,$na0
1593	adde	$acc2,$acc2,$t2
1594	$UMULH	$t2,$a6,$na0
1595	adde	$acc3,$acc3,$t3
1596	$UMULH	$t3,$a7,$na0
1597	$UMULL	$na0,$n0,$acc0		# next t[0]*n0
1598	adde	$acc4,$acc4,$t0
1599	adde	$acc5,$acc5,$t1
1600	adde	$acc6,$acc6,$t2
1601	adde	$acc7,$acc7,$t3
1602	bdnz	.Lsqr8x_reduction
1603
1604	$LD	$t0,$SIZE_T*1($tp)
1605	$LD	$t1,$SIZE_T*2($tp)
1606	$LD	$t2,$SIZE_T*3($tp)
1607	$LD	$t3,$SIZE_T*4($tp)
1608	subi	$rp,$tp,$SIZE_T*7
1609	$UCMP	$np_end,$np		# done yet?
1610	addc	$acc0,$acc0,$t0
1611	$LD	$t0,$SIZE_T*5($tp)
1612	adde	$acc1,$acc1,$t1
1613	$LD	$t1,$SIZE_T*6($tp)
1614	adde	$acc2,$acc2,$t2
1615	$LD	$t2,$SIZE_T*7($tp)
1616	adde	$acc3,$acc3,$t3
1617	$LD	$t3,$SIZE_T*8($tp)
1618	adde	$acc4,$acc4,$t0
1619	adde	$acc5,$acc5,$t1
1620	adde	$acc6,$acc6,$t2
1621	adde	$acc7,$acc7,$t3
1622	#addze	$carry,$zero		# moved below
1623	beq	.Lsqr8x8_post_condition
1624
1625	$LD	$n0,$SIZE_T*0($rp)
1626	$LD	$a0,$SIZE_T*1($np)
1627	$LD	$a1,$SIZE_T*2($np)
1628	$LD	$a2,$SIZE_T*3($np)
1629	$LD	$a3,$SIZE_T*4($np)
1630	$LD	$a4,$SIZE_T*5($np)
1631	$LD	$a5,$SIZE_T*6($np)
1632	$LD	$a6,$SIZE_T*7($np)
1633	$LDU	$a7,$SIZE_T*8($np)
1634	li	$cnt,0
1635
1636.align	5
1637.Lsqr8x_tail:
1638	$UMULL	$t0,$a0,$n0
1639	addze	$carry,$zero		# carry bit, modulo-scheduled
1640	$UMULL	$t1,$a1,$n0
1641	addi	$cnt,$cnt,$SIZE_T
1642	$UMULL	$t2,$a2,$n0
1643	andi.	$cnt,$cnt,$SIZE_T*8-1
1644	$UMULL	$t3,$a3,$n0
1645	addc	$acc0,$acc0,$t0
1646	$UMULL	$t0,$a4,$n0
1647	adde	$acc1,$acc1,$t1
1648	$UMULL	$t1,$a5,$n0
1649	adde	$acc2,$acc2,$t2
1650	$UMULL	$t2,$a6,$n0
1651	adde	$acc3,$acc3,$t3
1652	$UMULL	$t3,$a7,$n0
1653	adde	$acc4,$acc4,$t0
1654	$UMULH	$t0,$a0,$n0
1655	adde	$acc5,$acc5,$t1
1656	$UMULH	$t1,$a1,$n0
1657	adde	$acc6,$acc6,$t2
1658	$UMULH	$t2,$a2,$n0
1659	adde	$acc7,$acc7,$t3
1660	$UMULH	$t3,$a3,$n0
1661	addze	$carry,$carry
1662	$STU	$acc0,$SIZE_T($tp)
1663	addc	$acc0,$acc1,$t0
1664	$UMULH	$t0,$a4,$n0
1665	adde	$acc1,$acc2,$t1
1666	$UMULH	$t1,$a5,$n0
1667	adde	$acc2,$acc3,$t2
1668	$UMULH	$t2,$a6,$n0
1669	adde	$acc3,$acc4,$t3
1670	$UMULH	$t3,$a7,$n0
1671	$LDX	$n0,$rp,$cnt
1672	adde	$acc4,$acc5,$t0
1673	adde	$acc5,$acc6,$t1
1674	adde	$acc6,$acc7,$t2
1675	adde	$acc7,$carry,$t3
1676	#addze	$carry,$zero		# moved above
1677	bne	.Lsqr8x_tail
1678					# note that carry flag is guaranteed
1679					# to be zero at this point
1680	$LD	$a0,$SIZE_T*1($tp)
1681	$POP	$carry,$SIZE_T*10($sp)	# pull top-most carry in case we break
1682	$UCMP	$np_end,$np		# done yet?
1683	$LD	$a1,$SIZE_T*2($tp)
1684	sub	$t2,$np_end,$num	# rewinded np
1685	$LD	$a2,$SIZE_T*3($tp)
1686	$LD	$a3,$SIZE_T*4($tp)
1687	$LD	$a4,$SIZE_T*5($tp)
1688	$LD	$a5,$SIZE_T*6($tp)
1689	$LD	$a6,$SIZE_T*7($tp)
1690	$LD	$a7,$SIZE_T*8($tp)
1691	beq	.Lsqr8x_tail_break
1692
1693	addc	$acc0,$acc0,$a0
1694	$LD	$a0,$SIZE_T*1($np)
1695	adde	$acc1,$acc1,$a1
1696	$LD	$a1,$SIZE_T*2($np)
1697	adde	$acc2,$acc2,$a2
1698	$LD	$a2,$SIZE_T*3($np)
1699	adde	$acc3,$acc3,$a3
1700	$LD	$a3,$SIZE_T*4($np)
1701	adde	$acc4,$acc4,$a4
1702	$LD	$a4,$SIZE_T*5($np)
1703	adde	$acc5,$acc5,$a5
1704	$LD	$a5,$SIZE_T*6($np)
1705	adde	$acc6,$acc6,$a6
1706	$LD	$a6,$SIZE_T*7($np)
1707	adde	$acc7,$acc7,$a7
1708	$LDU	$a7,$SIZE_T*8($np)
1709	#addze	$carry,$zero		# moved above
1710	b	.Lsqr8x_tail
1711
1712.align	5
1713.Lsqr8x_tail_break:
1714	$POP	$n0,$SIZE_T*8($sp)	# pull n0
1715	$POP	$t3,$SIZE_T*9($sp)	# &tp[2*num-1]
1716	addi	$cnt,$tp,$SIZE_T*8	# end of current t[num] window
1717
1718	addic	$carry,$carry,-1	# "move" top-most carry to carry bit
1719	adde	$t0,$acc0,$a0
1720	$LD	$acc0,$SIZE_T*8($rp)
1721	$LD	$a0,$SIZE_T*1($t2)	# recall that $t2 is &n[-1]
1722	adde	$t1,$acc1,$a1
1723	$LD	$acc1,$SIZE_T*9($rp)
1724	$LD	$a1,$SIZE_T*2($t2)
1725	adde	$acc2,$acc2,$a2
1726	$LD	$a2,$SIZE_T*3($t2)
1727	adde	$acc3,$acc3,$a3
1728	$LD	$a3,$SIZE_T*4($t2)
1729	adde	$acc4,$acc4,$a4
1730	$LD	$a4,$SIZE_T*5($t2)
1731	adde	$acc5,$acc5,$a5
1732	$LD	$a5,$SIZE_T*6($t2)
1733	adde	$acc6,$acc6,$a6
1734	$LD	$a6,$SIZE_T*7($t2)
1735	adde	$acc7,$acc7,$a7
1736	$LD	$a7,$SIZE_T*8($t2)
1737	addi	$np,$t2,$SIZE_T*8
1738	addze	$t2,$zero		# top-most carry
1739	$UMULL	$na0,$n0,$acc0
1740	$ST	$t0,$SIZE_T*1($tp)
1741	$UCMP	$cnt,$t3		# did we hit the bottom?
1742	$ST	$t1,$SIZE_T*2($tp)
1743	li	$cnt,8
1744	$ST	$acc2,$SIZE_T*3($tp)
1745	$LD	$acc2,$SIZE_T*10($rp)
1746	$ST	$acc3,$SIZE_T*4($tp)
1747	$LD	$acc3,$SIZE_T*11($rp)
1748	$ST	$acc4,$SIZE_T*5($tp)
1749	$LD	$acc4,$SIZE_T*12($rp)
1750	$ST	$acc5,$SIZE_T*6($tp)
1751	$LD	$acc5,$SIZE_T*13($rp)
1752	$ST	$acc6,$SIZE_T*7($tp)
1753	$LD	$acc6,$SIZE_T*14($rp)
1754	$ST	$acc7,$SIZE_T*8($tp)
1755	$LD	$acc7,$SIZE_T*15($rp)
1756	$PUSH	$t2,$SIZE_T*10($sp)	# off-load top-most carry
1757	addi	$tp,$rp,$SIZE_T*7	# slide the window
1758	mtctr	$cnt
1759	bne	.Lsqr8x_reduction
1760
1761	################################################################
1762	# Final step. We see if result is larger than modulus, and
1763	# if it is, subtract the modulus. But comparison implies
1764	# subtraction. So we subtract modulus, see if it borrowed,
1765	# and conditionally copy original value.
1766	$POP	$rp,$SIZE_T*6($sp)	# pull &rp[-1]
1767	srwi	$cnt,$num,`log($SIZE_T)/log(2)+3`
1768	mr	$n0,$tp			# put tp aside
1769	addi	$tp,$tp,$SIZE_T*8
1770	subi	$cnt,$cnt,1
1771	subfc	$t0,$a0,$acc0
1772	subfe	$t1,$a1,$acc1
1773	mr	$carry,$t2
1774	mr	$ap_end,$rp		# $rp copy
1775
1776	mtctr	$cnt
1777	b	.Lsqr8x_sub
1778
1779.align	5
1780.Lsqr8x_sub:
1781	$LD	$a0,$SIZE_T*1($np)
1782	$LD	$acc0,$SIZE_T*1($tp)
1783	$LD	$a1,$SIZE_T*2($np)
1784	$LD	$acc1,$SIZE_T*2($tp)
1785	subfe	$t2,$a2,$acc2
1786	$LD	$a2,$SIZE_T*3($np)
1787	$LD	$acc2,$SIZE_T*3($tp)
1788	subfe	$t3,$a3,$acc3
1789	$LD	$a3,$SIZE_T*4($np)
1790	$LD	$acc3,$SIZE_T*4($tp)
1791	$ST	$t0,$SIZE_T*1($rp)
1792	subfe	$t0,$a4,$acc4
1793	$LD	$a4,$SIZE_T*5($np)
1794	$LD	$acc4,$SIZE_T*5($tp)
1795	$ST	$t1,$SIZE_T*2($rp)
1796	subfe	$t1,$a5,$acc5
1797	$LD	$a5,$SIZE_T*6($np)
1798	$LD	$acc5,$SIZE_T*6($tp)
1799	$ST	$t2,$SIZE_T*3($rp)
1800	subfe	$t2,$a6,$acc6
1801	$LD	$a6,$SIZE_T*7($np)
1802	$LD	$acc6,$SIZE_T*7($tp)
1803	$ST	$t3,$SIZE_T*4($rp)
1804	subfe	$t3,$a7,$acc7
1805	$LDU	$a7,$SIZE_T*8($np)
1806	$LDU	$acc7,$SIZE_T*8($tp)
1807	$ST	$t0,$SIZE_T*5($rp)
1808	subfe	$t0,$a0,$acc0
1809	$ST	$t1,$SIZE_T*6($rp)
1810	subfe	$t1,$a1,$acc1
1811	$ST	$t2,$SIZE_T*7($rp)
1812	$STU	$t3,$SIZE_T*8($rp)
1813	bdnz	.Lsqr8x_sub
1814
1815	srwi	$cnt,$num,`log($SIZE_T)/log(2)+2`
1816	 $LD	$a0,$SIZE_T*1($ap_end)	# original $rp
1817	 $LD	$acc0,$SIZE_T*1($n0)	# original $tp
1818	subi	$cnt,$cnt,1
1819	 $LD	$a1,$SIZE_T*2($ap_end)
1820	 $LD	$acc1,$SIZE_T*2($n0)
1821	subfe	$t2,$a2,$acc2
1822	 $LD	$a2,$SIZE_T*3($ap_end)
1823	 $LD	$acc2,$SIZE_T*3($n0)
1824	subfe	$t3,$a3,$acc3
1825	 $LD	$a3,$SIZE_T*4($ap_end)
1826	 $LDU	$acc3,$SIZE_T*4($n0)
1827	$ST	$t0,$SIZE_T*1($rp)
1828	subfe	$t0,$a4,$acc4
1829	$ST	$t1,$SIZE_T*2($rp)
1830	subfe	$t1,$a5,$acc5
1831	$ST	$t2,$SIZE_T*3($rp)
1832	subfe	$t2,$a6,$acc6
1833	$ST	$t3,$SIZE_T*4($rp)
1834	subfe	$t3,$a7,$acc7
1835	$ST	$t0,$SIZE_T*5($rp)
1836	subfe	$carry,$zero,$carry	# did it borrow?
1837	$ST	$t1,$SIZE_T*6($rp)
1838	$ST	$t2,$SIZE_T*7($rp)
1839	$ST	$t3,$SIZE_T*8($rp)
1840
1841	addi	$tp,$sp,$SIZE_T*11
1842	mtctr	$cnt
1843
1844.Lsqr4x_cond_copy:
1845	andc	$a0,$a0,$carry
1846	 $ST	$zero,-$SIZE_T*3($n0)	# wipe stack clean
1847	and	$acc0,$acc0,$carry
1848	 $ST	$zero,-$SIZE_T*2($n0)
1849	andc	$a1,$a1,$carry
1850	 $ST	$zero,-$SIZE_T*1($n0)
1851	and	$acc1,$acc1,$carry
1852	 $ST	$zero,-$SIZE_T*0($n0)
1853	andc	$a2,$a2,$carry
1854	 $ST	$zero,$SIZE_T*1($tp)
1855	and	$acc2,$acc2,$carry
1856	 $ST	$zero,$SIZE_T*2($tp)
1857	andc	$a3,$a3,$carry
1858	 $ST	$zero,$SIZE_T*3($tp)
1859	and	$acc3,$acc3,$carry
1860	 $STU	$zero,$SIZE_T*4($tp)
1861	or	$t0,$a0,$acc0
1862	$LD	$a0,$SIZE_T*5($ap_end)
1863	$LD	$acc0,$SIZE_T*1($n0)
1864	or	$t1,$a1,$acc1
1865	$LD	$a1,$SIZE_T*6($ap_end)
1866	$LD	$acc1,$SIZE_T*2($n0)
1867	or	$t2,$a2,$acc2
1868	$LD	$a2,$SIZE_T*7($ap_end)
1869	$LD	$acc2,$SIZE_T*3($n0)
1870	or	$t3,$a3,$acc3
1871	$LD	$a3,$SIZE_T*8($ap_end)
1872	$LDU	$acc3,$SIZE_T*4($n0)
1873	$ST	$t0,$SIZE_T*1($ap_end)
1874	$ST	$t1,$SIZE_T*2($ap_end)
1875	$ST	$t2,$SIZE_T*3($ap_end)
1876	$STU	$t3,$SIZE_T*4($ap_end)
1877	bdnz	.Lsqr4x_cond_copy
1878
1879	$POP	$ap,0($sp)		# pull saved sp
1880	andc	$a0,$a0,$carry
1881	and	$acc0,$acc0,$carry
1882	andc	$a1,$a1,$carry
1883	and	$acc1,$acc1,$carry
1884	andc	$a2,$a2,$carry
1885	and	$acc2,$acc2,$carry
1886	andc	$a3,$a3,$carry
1887	and	$acc3,$acc3,$carry
1888	or	$t0,$a0,$acc0
1889	or	$t1,$a1,$acc1
1890	or	$t2,$a2,$acc2
1891	or	$t3,$a3,$acc3
1892	$ST	$t0,$SIZE_T*1($ap_end)
1893	$ST	$t1,$SIZE_T*2($ap_end)
1894	$ST	$t2,$SIZE_T*3($ap_end)
1895	$ST	$t3,$SIZE_T*4($ap_end)
1896
1897	b	.Lsqr8x_done
1898
1899.align	5
1900.Lsqr8x8_post_condition:
1901	$POP	$rp,$SIZE_T*6($sp)	# pull rp
1902	$POP	$ap,0($sp)		# pull saved sp
1903	addze	$carry,$zero
1904
1905	# $acc0-7,$carry hold result, $a0-7 hold modulus
1906	subfc	$acc0,$a0,$acc0
1907	subfe	$acc1,$a1,$acc1
1908	 $ST	$zero,$SIZE_T*12($sp)	# wipe stack clean
1909	 $ST	$zero,$SIZE_T*13($sp)
1910	subfe	$acc2,$a2,$acc2
1911	 $ST	$zero,$SIZE_T*14($sp)
1912	 $ST	$zero,$SIZE_T*15($sp)
1913	subfe	$acc3,$a3,$acc3
1914	 $ST	$zero,$SIZE_T*16($sp)
1915	 $ST	$zero,$SIZE_T*17($sp)
1916	subfe	$acc4,$a4,$acc4
1917	 $ST	$zero,$SIZE_T*18($sp)
1918	 $ST	$zero,$SIZE_T*19($sp)
1919	subfe	$acc5,$a5,$acc5
1920	 $ST	$zero,$SIZE_T*20($sp)
1921	 $ST	$zero,$SIZE_T*21($sp)
1922	subfe	$acc6,$a6,$acc6
1923	 $ST	$zero,$SIZE_T*22($sp)
1924	 $ST	$zero,$SIZE_T*23($sp)
1925	subfe	$acc7,$a7,$acc7
1926	 $ST	$zero,$SIZE_T*24($sp)
1927	 $ST	$zero,$SIZE_T*25($sp)
1928	subfe	$carry,$zero,$carry	# did it borrow?
1929	 $ST	$zero,$SIZE_T*26($sp)
1930	 $ST	$zero,$SIZE_T*27($sp)
1931
1932	and	$a0,$a0,$carry
1933	and	$a1,$a1,$carry
1934	addc	$acc0,$acc0,$a0		# add modulus back if borrowed
1935	and	$a2,$a2,$carry
1936	adde	$acc1,$acc1,$a1
1937	and	$a3,$a3,$carry
1938	adde	$acc2,$acc2,$a2
1939	and	$a4,$a4,$carry
1940	adde	$acc3,$acc3,$a3
1941	and	$a5,$a5,$carry
1942	adde	$acc4,$acc4,$a4
1943	and	$a6,$a6,$carry
1944	adde	$acc5,$acc5,$a5
1945	and	$a7,$a7,$carry
1946	adde	$acc6,$acc6,$a6
1947	adde	$acc7,$acc7,$a7
1948	$ST	$acc0,$SIZE_T*1($rp)
1949	$ST	$acc1,$SIZE_T*2($rp)
1950	$ST	$acc2,$SIZE_T*3($rp)
1951	$ST	$acc3,$SIZE_T*4($rp)
1952	$ST	$acc4,$SIZE_T*5($rp)
1953	$ST	$acc5,$SIZE_T*6($rp)
1954	$ST	$acc6,$SIZE_T*7($rp)
1955	$ST	$acc7,$SIZE_T*8($rp)
1956
1957.Lsqr8x_done:
1958	$PUSH	$zero,$SIZE_T*8($sp)
1959	$PUSH	$zero,$SIZE_T*10($sp)
1960
1961	$POP	r14,-$SIZE_T*18($ap)
1962	li	r3,1			# signal "done"
1963	$POP	r15,-$SIZE_T*17($ap)
1964	$POP	r16,-$SIZE_T*16($ap)
1965	$POP	r17,-$SIZE_T*15($ap)
1966	$POP	r18,-$SIZE_T*14($ap)
1967	$POP	r19,-$SIZE_T*13($ap)
1968	$POP	r20,-$SIZE_T*12($ap)
1969	$POP	r21,-$SIZE_T*11($ap)
1970	$POP	r22,-$SIZE_T*10($ap)
1971	$POP	r23,-$SIZE_T*9($ap)
1972	$POP	r24,-$SIZE_T*8($ap)
1973	$POP	r25,-$SIZE_T*7($ap)
1974	$POP	r26,-$SIZE_T*6($ap)
1975	$POP	r27,-$SIZE_T*5($ap)
1976	$POP	r28,-$SIZE_T*4($ap)
1977	$POP	r29,-$SIZE_T*3($ap)
1978	$POP	r30,-$SIZE_T*2($ap)
1979	$POP	r31,-$SIZE_T*1($ap)
1980	mr	$sp,$ap
1981	blr
1982	.long	0
1983	.byte	0,12,4,0x20,0x80,18,6,0
1984	.long	0
1985.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1986___
1987}
1988$code.=<<___;
1989.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1990___
1991
1992$code =~ s/\`([^\`]*)\`/eval $1/gem;
1993print $code;
1994close STDOUT or die "error closing STDOUT: $!";
1995