xref: /openssl/crypto/bn/asm/ppc64-mont.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# December 2007
18
19# The reason for undertaken effort is basically following. Even though
20# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
21# performance was observed to be less than impressive, essentially as
22# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
23# Well, it's not surprising that IBM had to make some sacrifices to
24# boost the clock frequency that much, but no overall improvement?
25# Having observed how much difference did switching to FPU make on
26# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
27# Unfortunately the resulting performance improvement is not as
28# impressive, ~30%, and in absolute terms is still very far from what
29# one would expect from 4.7GHz CPU. There is a chance that I'm doing
30# something wrong, but in the lack of assembler level micro-profiling
31# data or at least decent platform guide I can't tell... Or better
32# results might be achieved with VMX... Anyway, this module provides
33# *worse* performance on other PowerPC implementations, ~40-15% slower
34# on PPC970 depending on key length and ~40% slower on Power 5 for all
35# key lengths. As it's obviously inappropriate as "best all-round"
36# alternative, it has to be complemented with run-time CPU family
37# detection. Oh! It should also be noted that unlike other PowerPC
38# implementation IALU ppc-mont.pl module performs *suboptimally* on
39# >=1024-bit key lengths on Power 6. It should also be noted that
40# *everything* said so far applies to 64-bit builds! As far as 32-bit
41# application executed on 64-bit CPU goes, this module is likely to
42# become preferred choice, because it's easy to adapt it for such
43# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
44
45# February 2008
46
47# Micro-profiling assisted optimization results in ~15% improvement
48# over original ppc64-mont.pl version, or overall ~50% improvement
49# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
50# Power 6 CPU, this module is 5-150% faster depending on key length,
51# [hereafter] more for longer keys. But if compared to ppc-mont.pl
52# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
53# in absolute terms, but it's apparently the way Power 6 is...
54
55# December 2009
56
57# Adapted for 32-bit build this module delivers 25-120%, yes, more
58# than *twice* for longer keys, performance improvement over 32-bit
59# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
60# even 64-bit integer operations and the trouble is that most PPC
61# operating systems don't preserve upper halves of general purpose
62# registers upon 32-bit signal delivery. They do preserve them upon
63# context switch, but not signalling:-( This means that asynchronous
64# signals have to be blocked upon entry to this subroutine. Signal
65# masking (and of course complementary unmasking) has quite an impact
66# on performance, naturally larger for shorter keys. It's so severe
67# that 512-bit key performance can be as low as 1/3 of expected one.
68# This is why this routine can be engaged for longer key operations
69# only on these OSes, see crypto/ppccap.c for further details. MacOS X
70# is an exception from this and doesn't require signal masking, and
71# that's where above improvement coefficients were collected. For
72# others alternative would be to break dependence on upper halves of
73# GPRs by sticking to 32-bit integer operations...
74
75# December 2012
76
77# Remove above mentioned dependence on GPRs' upper halves in 32-bit
78# build. No signal masking overhead, but integer instructions are
79# *more* numerous... It's still "universally" faster than 32-bit
80# ppc-mont.pl, but improvement coefficient is not as impressive
81# for longer keys...
82
83# $output is the last argument if it looks like a file (it has an extension)
84# $flavour is the first argument if it doesn't look like a file
85$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
86$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
87
88if ($flavour =~ /32/) {
89	$SIZE_T=4;
90	$RZONE=	224;
91	$fname=	"bn_mul_mont_fpu64";
92
93	$STUX=	"stwux";	# store indexed and update
94	$PUSH=	"stw";
95	$POP=	"lwz";
96} elsif ($flavour =~ /64/) {
97	$SIZE_T=8;
98	$RZONE=	288;
99	$fname=	"bn_mul_mont_fpu64";
100
101	# same as above, but 64-bit mnemonics...
102	$STUX=	"stdux";	# store indexed and update
103	$PUSH=	"std";
104	$POP=	"ld";
105} else { die "nonsense $flavour"; }
106
107$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
108
109$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
110( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
111( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
112die "can't locate ppc-xlate.pl";
113
114open STDOUT,"| $^X $xlate $flavour \"$output\""
115    or die "can't call $xlate: $!";
116
117$FRAME=64;	# padded frame header
118$TRANSFER=16*8;
119
120$carry="r0";
121$sp="r1";
122$toc="r2";
123$rp="r3";	$ovf="r3";
124$ap="r4";
125$bp="r5";
126$np="r6";
127$n0="r7";
128$num="r8";
129$rp="r9";	# $rp is reassigned
130$tp="r10";
131$j="r11";
132$i="r12";
133# non-volatile registers
134$c1="r19";
135$n1="r20";
136$a1="r21";
137$nap_d="r22";	# interleaved ap and np in double format
138$a0="r23";	# ap[0]
139$t0="r24";	# temporary registers
140$t1="r25";
141$t2="r26";
142$t3="r27";
143$t4="r28";
144$t5="r29";
145$t6="r30";
146$t7="r31";
147
148# PPC offers enough register bank capacity to unroll inner loops twice
149#
150#     ..A3A2A1A0
151#           dcba
152#    -----------
153#            A0a
154#           A0b
155#          A0c
156#         A0d
157#          A1a
158#         A1b
159#        A1c
160#       A1d
161#        A2a
162#       A2b
163#      A2c
164#     A2d
165#      A3a
166#     A3b
167#    A3c
168#   A3d
169#    ..a
170#   ..b
171#
172$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
173$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
174$dota="f8";	$dotb="f9";
175$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
176$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
177$T0a="f24";	$T0b="f25";
178$T1a="f26";	$T1b="f27";
179$T2a="f28";	$T2b="f29";
180$T3a="f30";	$T3b="f31";
181
182# sp----------->+-------------------------------+
183#		| saved sp			|
184#		+-------------------------------+
185#		.				.
186#   +64		+-------------------------------+
187#		| 16 gpr<->fpr transfer zone	|
188#		.				.
189#		.				.
190#   +16*8	+-------------------------------+
191#		| __int64 tmp[-1]		|
192#		+-------------------------------+
193#		| __int64 tmp[num]		|
194#		.				.
195#		.				.
196#		.				.
197#   +(num+1)*8	+-------------------------------+
198#		| padding to 64 byte boundary	|
199#		.				.
200#   +X		+-------------------------------+
201#		| double nap_d[4*num]		|
202#		.				.
203#		.				.
204#		.				.
205#		+-------------------------------+
206#		.				.
207#   -13*size_t	+-------------------------------+
208#		| 13 saved gpr, r19-r31		|
209#		.				.
210#		.				.
211#   -12*8	+-------------------------------+
212#		| 12 saved fpr, f20-f31		|
213#		.				.
214#		.				.
215#		+-------------------------------+
216
217$code=<<___;
218.machine "any"
219.text
220
221.globl	.$fname
222.align	5
223.$fname:
224	cmpwi	$num,`3*8/$SIZE_T`
225	mr	$rp,r3		; $rp is reassigned
226	li	r3,0		; possible "not handled" return code
227	bltlr-
228	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
229	bnelr-
230
231	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
232	li	$i,-4096
233	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
234	add	$tp,$tp,$num	; place for tp[num+1]
235	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
236	subf	$tp,$tp,$sp	; $sp-$tp
237	and	$tp,$tp,$i	; minimize TLB usage
238	subf	$tp,$sp,$tp	; $tp-$sp
239	mr	$i,$sp
240	$STUX	$sp,$sp,$tp	; alloca
241
242	$PUSH	r19,`-12*8-13*$SIZE_T`($i)
243	$PUSH	r20,`-12*8-12*$SIZE_T`($i)
244	$PUSH	r21,`-12*8-11*$SIZE_T`($i)
245	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
246	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
247	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
248	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
249	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
250	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
251	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
252	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
253	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
254	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
255	stfd	f20,`-12*8`($i)
256	stfd	f21,`-11*8`($i)
257	stfd	f22,`-10*8`($i)
258	stfd	f23,`-9*8`($i)
259	stfd	f24,`-8*8`($i)
260	stfd	f25,`-7*8`($i)
261	stfd	f26,`-6*8`($i)
262	stfd	f27,`-5*8`($i)
263	stfd	f28,`-4*8`($i)
264	stfd	f29,`-3*8`($i)
265	stfd	f30,`-2*8`($i)
266	stfd	f31,`-1*8`($i)
267
268	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
269	li	$i,-64
270	add	$nap_d,$tp,$num
271	and	$nap_d,$nap_d,$i	; align to 64 bytes
272	; nap_d is off by 1, because it's used with stfdu/lfdu
273	addi	$nap_d,$nap_d,-8
274	srwi	$j,$num,`3+1`	; counter register, num/2
275	addi	$j,$j,-1
276	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
277	li	$carry,0
278	mtctr	$j
279___
280
281$code.=<<___ if ($SIZE_T==8);
282	ld	$a0,0($ap)		; pull ap[0] value
283	ld	$t3,0($bp)		; bp[0]
284	ld	$n0,0($n0)		; pull n0[0] value
285
286	mulld	$t7,$a0,$t3		; ap[0]*bp[0]
287	; transfer bp[0] to FPU as 4x16-bit values
288	extrdi	$t0,$t3,16,48
289	extrdi	$t1,$t3,16,32
290	extrdi	$t2,$t3,16,16
291	extrdi	$t3,$t3,16,0
292	std	$t0,`$FRAME+0`($sp)
293	std	$t1,`$FRAME+8`($sp)
294	std	$t2,`$FRAME+16`($sp)
295	std	$t3,`$FRAME+24`($sp)
296
297	mulld	$t7,$t7,$n0		; tp[0]*n0
298	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
299	extrdi	$t4,$t7,16,48
300	extrdi	$t5,$t7,16,32
301	extrdi	$t6,$t7,16,16
302	extrdi	$t7,$t7,16,0
303	std	$t4,`$FRAME+32`($sp)
304	std	$t5,`$FRAME+40`($sp)
305	std	$t6,`$FRAME+48`($sp)
306	std	$t7,`$FRAME+56`($sp)
307
308	extrdi	$t0,$a0,32,32		; lwz	$t0,4($ap)
309	extrdi	$t1,$a0,32,0		; lwz	$t1,0($ap)
310	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[1] as 32-bit word pair
311	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
312	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[0] as 32-bit word pair
313	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
314	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[1] as 32-bit word pair
315	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
316___
317$code.=<<___ if ($SIZE_T==4);
318	lwz	$a0,0($ap)		; pull ap[0,1] value
319	mr	$n1,$n0
320	lwz	$a1,4($ap)
321	li	$c1,0
322	lwz	$t1,0($bp)		; bp[0,1]
323	lwz	$t3,4($bp)
324	lwz	$n0,0($n1)		; pull n0[0,1] value
325	lwz	$n1,4($n1)
326
327	mullw	$t4,$a0,$t1		; mulld ap[0]*bp[0]
328	mulhwu	$t5,$a0,$t1
329	mullw	$t6,$a1,$t1
330	mullw	$t7,$a0,$t3
331	add	$t5,$t5,$t6
332	add	$t5,$t5,$t7
333	; transfer bp[0] to FPU as 4x16-bit values
334	extrwi	$t0,$t1,16,16
335	extrwi	$t1,$t1,16,0
336	extrwi	$t2,$t3,16,16
337	extrwi	$t3,$t3,16,0
338	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
339	std	$t1,`$FRAME+8`($sp)
340	std	$t2,`$FRAME+16`($sp)
341	std	$t3,`$FRAME+24`($sp)
342
343	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
344	mulhwu	$t1,$t4,$n0
345	mullw	$t2,$t5,$n0
346	mullw	$t3,$t4,$n1
347	add	$t1,$t1,$t2
348	add	$t1,$t1,$t3
349	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
350	extrwi	$t4,$t0,16,16
351	extrwi	$t5,$t0,16,0
352	extrwi	$t6,$t1,16,16
353	extrwi	$t7,$t1,16,0
354	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
355	std	$t5,`$FRAME+40`($sp)
356	std	$t6,`$FRAME+48`($sp)
357	std	$t7,`$FRAME+56`($sp)
358
359	mr	$t0,$a0			; lwz	$t0,0($ap)
360	mr	$t1,$a1			; lwz	$t1,4($ap)
361	lwz	$t2,8($ap)		; load a[j..j+3] as 32-bit word pairs
362	lwz	$t3,12($ap)
363	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
364	lwz	$t5,4($np)
365	lwz	$t6,8($np)
366	lwz	$t7,12($np)
367___
368$code.=<<___;
369	lfd	$ba,`$FRAME+0`($sp)
370	lfd	$bb,`$FRAME+8`($sp)
371	lfd	$bc,`$FRAME+16`($sp)
372	lfd	$bd,`$FRAME+24`($sp)
373	lfd	$na,`$FRAME+32`($sp)
374	lfd	$nb,`$FRAME+40`($sp)
375	lfd	$nc,`$FRAME+48`($sp)
376	lfd	$nd,`$FRAME+56`($sp)
377	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
378	std	$t1,`$FRAME+72`($sp)
379	std	$t2,`$FRAME+80`($sp)
380	std	$t3,`$FRAME+88`($sp)
381	std	$t4,`$FRAME+96`($sp)
382	std	$t5,`$FRAME+104`($sp)
383	std	$t6,`$FRAME+112`($sp)
384	std	$t7,`$FRAME+120`($sp)
385	fcfid	$ba,$ba
386	fcfid	$bb,$bb
387	fcfid	$bc,$bc
388	fcfid	$bd,$bd
389	fcfid	$na,$na
390	fcfid	$nb,$nb
391	fcfid	$nc,$nc
392	fcfid	$nd,$nd
393
394	lfd	$A0,`$FRAME+64`($sp)
395	lfd	$A1,`$FRAME+72`($sp)
396	lfd	$A2,`$FRAME+80`($sp)
397	lfd	$A3,`$FRAME+88`($sp)
398	lfd	$N0,`$FRAME+96`($sp)
399	lfd	$N1,`$FRAME+104`($sp)
400	lfd	$N2,`$FRAME+112`($sp)
401	lfd	$N3,`$FRAME+120`($sp)
402	fcfid	$A0,$A0
403	fcfid	$A1,$A1
404	fcfid	$A2,$A2
405	fcfid	$A3,$A3
406	fcfid	$N0,$N0
407	fcfid	$N1,$N1
408	fcfid	$N2,$N2
409	fcfid	$N3,$N3
410	addi	$ap,$ap,16
411	addi	$np,$np,16
412
413	fmul	$T1a,$A1,$ba
414	fmul	$T1b,$A1,$bb
415	stfd	$A0,8($nap_d)		; save a[j] in double format
416	stfd	$A1,16($nap_d)
417	fmul	$T2a,$A2,$ba
418	fmul	$T2b,$A2,$bb
419	stfd	$A2,24($nap_d)		; save a[j+1] in double format
420	stfd	$A3,32($nap_d)
421	fmul	$T3a,$A3,$ba
422	fmul	$T3b,$A3,$bb
423	stfd	$N0,40($nap_d)		; save n[j] in double format
424	stfd	$N1,48($nap_d)
425	fmul	$T0a,$A0,$ba
426	fmul	$T0b,$A0,$bb
427	stfd	$N2,56($nap_d)		; save n[j+1] in double format
428	stfdu	$N3,64($nap_d)
429
430	fmadd	$T1a,$A0,$bc,$T1a
431	fmadd	$T1b,$A0,$bd,$T1b
432	fmadd	$T2a,$A1,$bc,$T2a
433	fmadd	$T2b,$A1,$bd,$T2b
434	fmadd	$T3a,$A2,$bc,$T3a
435	fmadd	$T3b,$A2,$bd,$T3b
436	fmul	$dota,$A3,$bc
437	fmul	$dotb,$A3,$bd
438
439	fmadd	$T1a,$N1,$na,$T1a
440	fmadd	$T1b,$N1,$nb,$T1b
441	fmadd	$T2a,$N2,$na,$T2a
442	fmadd	$T2b,$N2,$nb,$T2b
443	fmadd	$T3a,$N3,$na,$T3a
444	fmadd	$T3b,$N3,$nb,$T3b
445	fmadd	$T0a,$N0,$na,$T0a
446	fmadd	$T0b,$N0,$nb,$T0b
447
448	fmadd	$T1a,$N0,$nc,$T1a
449	fmadd	$T1b,$N0,$nd,$T1b
450	fmadd	$T2a,$N1,$nc,$T2a
451	fmadd	$T2b,$N1,$nd,$T2b
452	fmadd	$T3a,$N2,$nc,$T3a
453	fmadd	$T3b,$N2,$nd,$T3b
454	fmadd	$dota,$N3,$nc,$dota
455	fmadd	$dotb,$N3,$nd,$dotb
456
457	fctid	$T0a,$T0a
458	fctid	$T0b,$T0b
459	fctid	$T1a,$T1a
460	fctid	$T1b,$T1b
461	fctid	$T2a,$T2a
462	fctid	$T2b,$T2b
463	fctid	$T3a,$T3a
464	fctid	$T3b,$T3b
465
466	stfd	$T0a,`$FRAME+0`($sp)
467	stfd	$T0b,`$FRAME+8`($sp)
468	stfd	$T1a,`$FRAME+16`($sp)
469	stfd	$T1b,`$FRAME+24`($sp)
470	stfd	$T2a,`$FRAME+32`($sp)
471	stfd	$T2b,`$FRAME+40`($sp)
472	stfd	$T3a,`$FRAME+48`($sp)
473	stfd	$T3b,`$FRAME+56`($sp)
474
475.align	5
476L1st:
477___
478$code.=<<___ if ($SIZE_T==8);
479	lwz	$t0,`4^$LITTLE_ENDIAN`($ap)	; load a[j] as 32-bit word pair
480	lwz	$t1,`0^$LITTLE_ENDIAN`($ap)
481	lwz	$t2,`12^$LITTLE_ENDIAN`($ap)	; load a[j+1] as 32-bit word pair
482	lwz	$t3,`8^$LITTLE_ENDIAN`($ap)
483	lwz	$t4,`4^$LITTLE_ENDIAN`($np)	; load n[j] as 32-bit word pair
484	lwz	$t5,`0^$LITTLE_ENDIAN`($np)
485	lwz	$t6,`12^$LITTLE_ENDIAN`($np)	; load n[j+1] as 32-bit word pair
486	lwz	$t7,`8^$LITTLE_ENDIAN`($np)
487___
488$code.=<<___ if ($SIZE_T==4);
489	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
490	lwz	$t1,4($ap)
491	lwz	$t2,8($ap)
492	lwz	$t3,12($ap)
493	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
494	lwz	$t5,4($np)
495	lwz	$t6,8($np)
496	lwz	$t7,12($np)
497___
498$code.=<<___;
499	std	$t0,`$FRAME+64`($sp)	; yes, std even in 32-bit build
500	std	$t1,`$FRAME+72`($sp)
501	std	$t2,`$FRAME+80`($sp)
502	std	$t3,`$FRAME+88`($sp)
503	std	$t4,`$FRAME+96`($sp)
504	std	$t5,`$FRAME+104`($sp)
505	std	$t6,`$FRAME+112`($sp)
506	std	$t7,`$FRAME+120`($sp)
507___
508if ($SIZE_T==8 or $flavour =~ /osx/) {
509$code.=<<___;
510	ld	$t0,`$FRAME+0`($sp)
511	ld	$t1,`$FRAME+8`($sp)
512	ld	$t2,`$FRAME+16`($sp)
513	ld	$t3,`$FRAME+24`($sp)
514	ld	$t4,`$FRAME+32`($sp)
515	ld	$t5,`$FRAME+40`($sp)
516	ld	$t6,`$FRAME+48`($sp)
517	ld	$t7,`$FRAME+56`($sp)
518___
519} else {
520$code.=<<___;
521	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
522	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
523	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
524	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
525	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
526	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
527	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
528	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
529___
530}
531$code.=<<___;
532	lfd	$A0,`$FRAME+64`($sp)
533	lfd	$A1,`$FRAME+72`($sp)
534	lfd	$A2,`$FRAME+80`($sp)
535	lfd	$A3,`$FRAME+88`($sp)
536	lfd	$N0,`$FRAME+96`($sp)
537	lfd	$N1,`$FRAME+104`($sp)
538	lfd	$N2,`$FRAME+112`($sp)
539	lfd	$N3,`$FRAME+120`($sp)
540	fcfid	$A0,$A0
541	fcfid	$A1,$A1
542	fcfid	$A2,$A2
543	fcfid	$A3,$A3
544	fcfid	$N0,$N0
545	fcfid	$N1,$N1
546	fcfid	$N2,$N2
547	fcfid	$N3,$N3
548	addi	$ap,$ap,16
549	addi	$np,$np,16
550
551	fmul	$T1a,$A1,$ba
552	fmul	$T1b,$A1,$bb
553	fmul	$T2a,$A2,$ba
554	fmul	$T2b,$A2,$bb
555	stfd	$A0,8($nap_d)		; save a[j] in double format
556	stfd	$A1,16($nap_d)
557	fmul	$T3a,$A3,$ba
558	fmul	$T3b,$A3,$bb
559	fmadd	$T0a,$A0,$ba,$dota
560	fmadd	$T0b,$A0,$bb,$dotb
561	stfd	$A2,24($nap_d)		; save a[j+1] in double format
562	stfd	$A3,32($nap_d)
563___
564if ($SIZE_T==8 or $flavour =~ /osx/) {
565$code.=<<___;
566	fmadd	$T1a,$A0,$bc,$T1a
567	fmadd	$T1b,$A0,$bd,$T1b
568	fmadd	$T2a,$A1,$bc,$T2a
569	fmadd	$T2b,$A1,$bd,$T2b
570	stfd	$N0,40($nap_d)		; save n[j] in double format
571	stfd	$N1,48($nap_d)
572	fmadd	$T3a,$A2,$bc,$T3a
573	fmadd	$T3b,$A2,$bd,$T3b
574	 add	$t0,$t0,$carry		; can not overflow
575	fmul	$dota,$A3,$bc
576	fmul	$dotb,$A3,$bd
577	stfd	$N2,56($nap_d)		; save n[j+1] in double format
578	stfdu	$N3,64($nap_d)
579	 srdi	$carry,$t0,16
580	 add	$t1,$t1,$carry
581	 srdi	$carry,$t1,16
582
583	fmadd	$T1a,$N1,$na,$T1a
584	fmadd	$T1b,$N1,$nb,$T1b
585	 insrdi	$t0,$t1,16,32
586	fmadd	$T2a,$N2,$na,$T2a
587	fmadd	$T2b,$N2,$nb,$T2b
588	 add	$t2,$t2,$carry
589	fmadd	$T3a,$N3,$na,$T3a
590	fmadd	$T3b,$N3,$nb,$T3b
591	 srdi	$carry,$t2,16
592	fmadd	$T0a,$N0,$na,$T0a
593	fmadd	$T0b,$N0,$nb,$T0b
594	 insrdi	$t0,$t2,16,16
595	 add	$t3,$t3,$carry
596	 srdi	$carry,$t3,16
597
598	fmadd	$T1a,$N0,$nc,$T1a
599	fmadd	$T1b,$N0,$nd,$T1b
600	 insrdi	$t0,$t3,16,0		; 0..63 bits
601	fmadd	$T2a,$N1,$nc,$T2a
602	fmadd	$T2b,$N1,$nd,$T2b
603	 add	$t4,$t4,$carry
604	fmadd	$T3a,$N2,$nc,$T3a
605	fmadd	$T3b,$N2,$nd,$T3b
606	 srdi	$carry,$t4,16
607	fmadd	$dota,$N3,$nc,$dota
608	fmadd	$dotb,$N3,$nd,$dotb
609	 add	$t5,$t5,$carry
610	 srdi	$carry,$t5,16
611	 insrdi	$t4,$t5,16,32
612
613	fctid	$T0a,$T0a
614	fctid	$T0b,$T0b
615	 add	$t6,$t6,$carry
616	fctid	$T1a,$T1a
617	fctid	$T1b,$T1b
618	 srdi	$carry,$t6,16
619	fctid	$T2a,$T2a
620	fctid	$T2b,$T2b
621	 insrdi	$t4,$t6,16,16
622	fctid	$T3a,$T3a
623	fctid	$T3b,$T3b
624	 add	$t7,$t7,$carry
625	 insrdi	$t4,$t7,16,0		; 64..127 bits
626	 srdi	$carry,$t7,16		; upper 33 bits
627
628	stfd	$T0a,`$FRAME+0`($sp)
629	stfd	$T0b,`$FRAME+8`($sp)
630	stfd	$T1a,`$FRAME+16`($sp)
631	stfd	$T1b,`$FRAME+24`($sp)
632	stfd	$T2a,`$FRAME+32`($sp)
633	stfd	$T2b,`$FRAME+40`($sp)
634	stfd	$T3a,`$FRAME+48`($sp)
635	stfd	$T3b,`$FRAME+56`($sp)
636	 std	$t0,8($tp)		; tp[j-1]
637	 stdu	$t4,16($tp)		; tp[j]
638___
639} else {
640$code.=<<___;
641	fmadd	$T1a,$A0,$bc,$T1a
642	fmadd	$T1b,$A0,$bd,$T1b
643	 addc	$t0,$t0,$carry
644	 adde	$t1,$t1,$c1
645	 srwi	$carry,$t0,16
646	fmadd	$T2a,$A1,$bc,$T2a
647	fmadd	$T2b,$A1,$bd,$T2b
648	stfd	$N0,40($nap_d)		; save n[j] in double format
649	stfd	$N1,48($nap_d)
650	 srwi	$c1,$t1,16
651	 insrwi	$carry,$t1,16,0
652	fmadd	$T3a,$A2,$bc,$T3a
653	fmadd	$T3b,$A2,$bd,$T3b
654	 addc	$t2,$t2,$carry
655	 adde	$t3,$t3,$c1
656	 srwi	$carry,$t2,16
657	fmul	$dota,$A3,$bc
658	fmul	$dotb,$A3,$bd
659	stfd	$N2,56($nap_d)		; save n[j+1] in double format
660	stfdu	$N3,64($nap_d)
661	 insrwi	$t0,$t2,16,0		; 0..31 bits
662	 srwi	$c1,$t3,16
663	 insrwi	$carry,$t3,16,0
664
665	fmadd	$T1a,$N1,$na,$T1a
666	fmadd	$T1b,$N1,$nb,$T1b
667	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
668	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
669	 addc	$t4,$t4,$carry
670	 adde	$t5,$t5,$c1
671	 srwi	$carry,$t4,16
672	fmadd	$T2a,$N2,$na,$T2a
673	fmadd	$T2b,$N2,$nb,$T2b
674	 srwi	$c1,$t5,16
675	 insrwi	$carry,$t5,16,0
676	fmadd	$T3a,$N3,$na,$T3a
677	fmadd	$T3b,$N3,$nb,$T3b
678	 addc	$t6,$t6,$carry
679	 adde	$t7,$t7,$c1
680	 srwi	$carry,$t6,16
681	fmadd	$T0a,$N0,$na,$T0a
682	fmadd	$T0b,$N0,$nb,$T0b
683	 insrwi	$t4,$t6,16,0		; 32..63 bits
684	 srwi	$c1,$t7,16
685	 insrwi	$carry,$t7,16,0
686
687	fmadd	$T1a,$N0,$nc,$T1a
688	fmadd	$T1b,$N0,$nd,$T1b
689	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
690	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
691	 addc	$t2,$t2,$carry
692	 adde	$t3,$t3,$c1
693	 srwi	$carry,$t2,16
694	fmadd	$T2a,$N1,$nc,$T2a
695	fmadd	$T2b,$N1,$nd,$T2b
696	 stw	$t0,12($tp)		; tp[j-1]
697	 stw	$t4,8($tp)
698	 srwi	$c1,$t3,16
699	 insrwi	$carry,$t3,16,0
700	fmadd	$T3a,$N2,$nc,$T3a
701	fmadd	$T3b,$N2,$nd,$T3b
702	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
703	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
704	 addc	$t6,$t6,$carry
705	 adde	$t7,$t7,$c1
706	 srwi	$carry,$t6,16
707	fmadd	$dota,$N3,$nc,$dota
708	fmadd	$dotb,$N3,$nd,$dotb
709	 insrwi	$t2,$t6,16,0		; 64..95 bits
710	 srwi	$c1,$t7,16
711	 insrwi	$carry,$t7,16,0
712
713	fctid	$T0a,$T0a
714	fctid	$T0b,$T0b
715	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
716	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
717	 addc	$t0,$t0,$carry
718	 adde	$t1,$t1,$c1
719	 srwi	$carry,$t0,16
720	fctid	$T1a,$T1a
721	fctid	$T1b,$T1b
722	 srwi	$c1,$t1,16
723	 insrwi	$carry,$t1,16,0
724	fctid	$T2a,$T2a
725	fctid	$T2b,$T2b
726	 addc	$t4,$t4,$carry
727	 adde	$t5,$t5,$c1
728	 srwi	$carry,$t4,16
729	fctid	$T3a,$T3a
730	fctid	$T3b,$T3b
731	 insrwi	$t0,$t4,16,0		; 96..127 bits
732	 srwi	$c1,$t5,16
733	 insrwi	$carry,$t5,16,0
734
735	stfd	$T0a,`$FRAME+0`($sp)
736	stfd	$T0b,`$FRAME+8`($sp)
737	stfd	$T1a,`$FRAME+16`($sp)
738	stfd	$T1b,`$FRAME+24`($sp)
739	stfd	$T2a,`$FRAME+32`($sp)
740	stfd	$T2b,`$FRAME+40`($sp)
741	stfd	$T3a,`$FRAME+48`($sp)
742	stfd	$T3b,`$FRAME+56`($sp)
743	 stw	$t2,20($tp)		; tp[j]
744	 stwu	$t0,16($tp)
745___
746}
747$code.=<<___;
748	bdnz	L1st
749
750	fctid	$dota,$dota
751	fctid	$dotb,$dotb
752___
753if ($SIZE_T==8 or $flavour =~ /osx/) {
754$code.=<<___;
755	ld	$t0,`$FRAME+0`($sp)
756	ld	$t1,`$FRAME+8`($sp)
757	ld	$t2,`$FRAME+16`($sp)
758	ld	$t3,`$FRAME+24`($sp)
759	ld	$t4,`$FRAME+32`($sp)
760	ld	$t5,`$FRAME+40`($sp)
761	ld	$t6,`$FRAME+48`($sp)
762	ld	$t7,`$FRAME+56`($sp)
763	stfd	$dota,`$FRAME+64`($sp)
764	stfd	$dotb,`$FRAME+72`($sp)
765
766	add	$t0,$t0,$carry		; can not overflow
767	srdi	$carry,$t0,16
768	add	$t1,$t1,$carry
769	srdi	$carry,$t1,16
770	insrdi	$t0,$t1,16,32
771	add	$t2,$t2,$carry
772	srdi	$carry,$t2,16
773	insrdi	$t0,$t2,16,16
774	add	$t3,$t3,$carry
775	srdi	$carry,$t3,16
776	insrdi	$t0,$t3,16,0		; 0..63 bits
777	add	$t4,$t4,$carry
778	srdi	$carry,$t4,16
779	add	$t5,$t5,$carry
780	srdi	$carry,$t5,16
781	insrdi	$t4,$t5,16,32
782	add	$t6,$t6,$carry
783	srdi	$carry,$t6,16
784	insrdi	$t4,$t6,16,16
785	add	$t7,$t7,$carry
786	insrdi	$t4,$t7,16,0		; 64..127 bits
787	srdi	$carry,$t7,16		; upper 33 bits
788	ld	$t6,`$FRAME+64`($sp)
789	ld	$t7,`$FRAME+72`($sp)
790
791	std	$t0,8($tp)		; tp[j-1]
792	stdu	$t4,16($tp)		; tp[j]
793
794	add	$t6,$t6,$carry		; can not overflow
795	srdi	$carry,$t6,16
796	add	$t7,$t7,$carry
797	insrdi	$t6,$t7,48,0
798	srdi	$ovf,$t7,48
799	std	$t6,8($tp)		; tp[num-1]
800___
801} else {
802$code.=<<___;
803	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
804	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
805	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
806	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
807	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
808	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
809	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
810	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
811	stfd	$dota,`$FRAME+64`($sp)
812	stfd	$dotb,`$FRAME+72`($sp)
813
814	addc	$t0,$t0,$carry
815	adde	$t1,$t1,$c1
816	srwi	$carry,$t0,16
817	insrwi	$carry,$t1,16,0
818	srwi	$c1,$t1,16
819	addc	$t2,$t2,$carry
820	adde	$t3,$t3,$c1
821	srwi	$carry,$t2,16
822	 insrwi	$t0,$t2,16,0		; 0..31 bits
823	insrwi	$carry,$t3,16,0
824	srwi	$c1,$t3,16
825	addc	$t4,$t4,$carry
826	adde	$t5,$t5,$c1
827	srwi	$carry,$t4,16
828	insrwi	$carry,$t5,16,0
829	srwi	$c1,$t5,16
830	addc	$t6,$t6,$carry
831	adde	$t7,$t7,$c1
832	srwi	$carry,$t6,16
833	 insrwi	$t4,$t6,16,0		; 32..63 bits
834	insrwi	$carry,$t7,16,0
835	srwi	$c1,$t7,16
836	 stw	$t0,12($tp)		; tp[j-1]
837	 stw	$t4,8($tp)
838
839	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
840	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
841	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
842	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
843	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
844	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
845	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
846	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
847
848	addc	$t2,$t2,$carry
849	adde	$t3,$t3,$c1
850	srwi	$carry,$t2,16
851	insrwi	$carry,$t3,16,0
852	srwi	$c1,$t3,16
853	addc	$t6,$t6,$carry
854	adde	$t7,$t7,$c1
855	srwi	$carry,$t6,16
856	 insrwi	$t2,$t6,16,0		; 64..95 bits
857	insrwi	$carry,$t7,16,0
858	srwi	$c1,$t7,16
859	addc	$t0,$t0,$carry
860	adde	$t1,$t1,$c1
861	srwi	$carry,$t0,16
862	insrwi	$carry,$t1,16,0
863	srwi	$c1,$t1,16
864	addc	$t4,$t4,$carry
865	adde	$t5,$t5,$c1
866	srwi	$carry,$t4,16
867	 insrwi	$t0,$t4,16,0		; 96..127 bits
868	insrwi	$carry,$t5,16,0
869	srwi	$c1,$t5,16
870	 stw	$t2,20($tp)		; tp[j]
871	 stwu	$t0,16($tp)
872
873	lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
874	lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
875	lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
876	lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
877
878	addc	$t6,$t6,$carry
879	adde	$t7,$t7,$c1
880	srwi	$carry,$t6,16
881	insrwi	$carry,$t7,16,0
882	srwi	$c1,$t7,16
883	addc	$t4,$t4,$carry
884	adde	$t5,$t5,$c1
885
886	insrwi	$t6,$t4,16,0
887	srwi	$t4,$t4,16
888	insrwi	$t4,$t5,16,0
889	srwi	$ovf,$t5,16
890	stw	$t6,12($tp)		; tp[num-1]
891	stw	$t4,8($tp)
892___
893}
894$code.=<<___;
895	slwi	$t7,$num,2
896	subf	$nap_d,$t7,$nap_d	; rewind pointer
897
898	li	$i,8			; i=1
899.align	5
900Louter:
901	addi	$tp,$sp,`$FRAME+$TRANSFER`
902	li	$carry,0
903	mtctr	$j
904___
905$code.=<<___ if ($SIZE_T==8);
906	ldx	$t3,$bp,$i		; bp[i]
907
908	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
909	mulld	$t7,$a0,$t3		; ap[0]*bp[i]
910	add	$t7,$t7,$t6		; ap[0]*bp[i]+tp[0]
911	; transfer bp[i] to FPU as 4x16-bit values
912	extrdi	$t0,$t3,16,48
913	extrdi	$t1,$t3,16,32
914	extrdi	$t2,$t3,16,16
915	extrdi	$t3,$t3,16,0
916	std	$t0,`$FRAME+0`($sp)
917	std	$t1,`$FRAME+8`($sp)
918	std	$t2,`$FRAME+16`($sp)
919	std	$t3,`$FRAME+24`($sp)
920
921	mulld	$t7,$t7,$n0		; tp[0]*n0
922	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
923	extrdi	$t4,$t7,16,48
924	extrdi	$t5,$t7,16,32
925	extrdi	$t6,$t7,16,16
926	extrdi	$t7,$t7,16,0
927	std	$t4,`$FRAME+32`($sp)
928	std	$t5,`$FRAME+40`($sp)
929	std	$t6,`$FRAME+48`($sp)
930	std	$t7,`$FRAME+56`($sp)
931___
932$code.=<<___ if ($SIZE_T==4);
933	add	$t0,$bp,$i
934	li	$c1,0
935	lwz	$t1,0($t0)		; bp[i,i+1]
936	lwz	$t3,4($t0)
937
938	mullw	$t4,$a0,$t1		; ap[0]*bp[i]
939	lwz	$t0,`$FRAME+$TRANSFER+8+4`($sp)	; tp[0]
940	mulhwu	$t5,$a0,$t1
941	lwz	$t2,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
942	mullw	$t6,$a1,$t1
943	mullw	$t7,$a0,$t3
944	add	$t5,$t5,$t6
945	add	$t5,$t5,$t7
946	addc	$t4,$t4,$t0		; ap[0]*bp[i]+tp[0]
947	adde	$t5,$t5,$t2
948	; transfer bp[i] to FPU as 4x16-bit values
949	extrwi	$t0,$t1,16,16
950	extrwi	$t1,$t1,16,0
951	extrwi	$t2,$t3,16,16
952	extrwi	$t3,$t3,16,0
953	std	$t0,`$FRAME+0`($sp)	; yes, std in 32-bit build
954	std	$t1,`$FRAME+8`($sp)
955	std	$t2,`$FRAME+16`($sp)
956	std	$t3,`$FRAME+24`($sp)
957
958	mullw	$t0,$t4,$n0		; mulld tp[0]*n0
959	mulhwu	$t1,$t4,$n0
960	mullw	$t2,$t5,$n0
961	mullw	$t3,$t4,$n1
962	add	$t1,$t1,$t2
963	add	$t1,$t1,$t3
964	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
965	extrwi	$t4,$t0,16,16
966	extrwi	$t5,$t0,16,0
967	extrwi	$t6,$t1,16,16
968	extrwi	$t7,$t1,16,0
969	std	$t4,`$FRAME+32`($sp)	; yes, std in 32-bit build
970	std	$t5,`$FRAME+40`($sp)
971	std	$t6,`$FRAME+48`($sp)
972	std	$t7,`$FRAME+56`($sp)
973___
974$code.=<<___;
975	lfd	$A0,8($nap_d)		; load a[j] in double format
976	lfd	$A1,16($nap_d)
977	lfd	$A2,24($nap_d)		; load a[j+1] in double format
978	lfd	$A3,32($nap_d)
979	lfd	$N0,40($nap_d)		; load n[j] in double format
980	lfd	$N1,48($nap_d)
981	lfd	$N2,56($nap_d)		; load n[j+1] in double format
982	lfdu	$N3,64($nap_d)
983
984	lfd	$ba,`$FRAME+0`($sp)
985	lfd	$bb,`$FRAME+8`($sp)
986	lfd	$bc,`$FRAME+16`($sp)
987	lfd	$bd,`$FRAME+24`($sp)
988	lfd	$na,`$FRAME+32`($sp)
989	lfd	$nb,`$FRAME+40`($sp)
990	lfd	$nc,`$FRAME+48`($sp)
991	lfd	$nd,`$FRAME+56`($sp)
992
993	fcfid	$ba,$ba
994	fcfid	$bb,$bb
995	fcfid	$bc,$bc
996	fcfid	$bd,$bd
997	fcfid	$na,$na
998	fcfid	$nb,$nb
999	fcfid	$nc,$nc
1000	fcfid	$nd,$nd
1001
1002	fmul	$T1a,$A1,$ba
1003	fmul	$T1b,$A1,$bb
1004	fmul	$T2a,$A2,$ba
1005	fmul	$T2b,$A2,$bb
1006	fmul	$T3a,$A3,$ba
1007	fmul	$T3b,$A3,$bb
1008	fmul	$T0a,$A0,$ba
1009	fmul	$T0b,$A0,$bb
1010
1011	fmadd	$T1a,$A0,$bc,$T1a
1012	fmadd	$T1b,$A0,$bd,$T1b
1013	fmadd	$T2a,$A1,$bc,$T2a
1014	fmadd	$T2b,$A1,$bd,$T2b
1015	fmadd	$T3a,$A2,$bc,$T3a
1016	fmadd	$T3b,$A2,$bd,$T3b
1017	fmul	$dota,$A3,$bc
1018	fmul	$dotb,$A3,$bd
1019
1020	fmadd	$T1a,$N1,$na,$T1a
1021	fmadd	$T1b,$N1,$nb,$T1b
1022	 lfd	$A0,8($nap_d)		; load a[j] in double format
1023	 lfd	$A1,16($nap_d)
1024	fmadd	$T2a,$N2,$na,$T2a
1025	fmadd	$T2b,$N2,$nb,$T2b
1026	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
1027	 lfd	$A3,32($nap_d)
1028	fmadd	$T3a,$N3,$na,$T3a
1029	fmadd	$T3b,$N3,$nb,$T3b
1030	fmadd	$T0a,$N0,$na,$T0a
1031	fmadd	$T0b,$N0,$nb,$T0b
1032
1033	fmadd	$T1a,$N0,$nc,$T1a
1034	fmadd	$T1b,$N0,$nd,$T1b
1035	fmadd	$T2a,$N1,$nc,$T2a
1036	fmadd	$T2b,$N1,$nd,$T2b
1037	fmadd	$T3a,$N2,$nc,$T3a
1038	fmadd	$T3b,$N2,$nd,$T3b
1039	fmadd	$dota,$N3,$nc,$dota
1040	fmadd	$dotb,$N3,$nd,$dotb
1041
1042	fctid	$T0a,$T0a
1043	fctid	$T0b,$T0b
1044	fctid	$T1a,$T1a
1045	fctid	$T1b,$T1b
1046	fctid	$T2a,$T2a
1047	fctid	$T2b,$T2b
1048	fctid	$T3a,$T3a
1049	fctid	$T3b,$T3b
1050
1051	stfd	$T0a,`$FRAME+0`($sp)
1052	stfd	$T0b,`$FRAME+8`($sp)
1053	stfd	$T1a,`$FRAME+16`($sp)
1054	stfd	$T1b,`$FRAME+24`($sp)
1055	stfd	$T2a,`$FRAME+32`($sp)
1056	stfd	$T2b,`$FRAME+40`($sp)
1057	stfd	$T3a,`$FRAME+48`($sp)
1058	stfd	$T3b,`$FRAME+56`($sp)
1059
1060.align	5
1061Linner:
1062	fmul	$T1a,$A1,$ba
1063	fmul	$T1b,$A1,$bb
1064	fmul	$T2a,$A2,$ba
1065	fmul	$T2b,$A2,$bb
1066	lfd	$N0,40($nap_d)		; load n[j] in double format
1067	lfd	$N1,48($nap_d)
1068	fmul	$T3a,$A3,$ba
1069	fmul	$T3b,$A3,$bb
1070	fmadd	$T0a,$A0,$ba,$dota
1071	fmadd	$T0b,$A0,$bb,$dotb
1072	lfd	$N2,56($nap_d)		; load n[j+1] in double format
1073	lfdu	$N3,64($nap_d)
1074
1075	fmadd	$T1a,$A0,$bc,$T1a
1076	fmadd	$T1b,$A0,$bd,$T1b
1077	fmadd	$T2a,$A1,$bc,$T2a
1078	fmadd	$T2b,$A1,$bd,$T2b
1079	 lfd	$A0,8($nap_d)		; load a[j] in double format
1080	 lfd	$A1,16($nap_d)
1081	fmadd	$T3a,$A2,$bc,$T3a
1082	fmadd	$T3b,$A2,$bd,$T3b
1083	fmul	$dota,$A3,$bc
1084	fmul	$dotb,$A3,$bd
1085	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
1086	 lfd	$A3,32($nap_d)
1087___
1088if ($SIZE_T==8 or $flavour =~ /osx/) {
1089$code.=<<___;
1090	fmadd	$T1a,$N1,$na,$T1a
1091	fmadd	$T1b,$N1,$nb,$T1b
1092	 ld	$t0,`$FRAME+0`($sp)
1093	 ld	$t1,`$FRAME+8`($sp)
1094	fmadd	$T2a,$N2,$na,$T2a
1095	fmadd	$T2b,$N2,$nb,$T2b
1096	 ld	$t2,`$FRAME+16`($sp)
1097	 ld	$t3,`$FRAME+24`($sp)
1098	fmadd	$T3a,$N3,$na,$T3a
1099	fmadd	$T3b,$N3,$nb,$T3b
1100	 add	$t0,$t0,$carry		; can not overflow
1101	 ld	$t4,`$FRAME+32`($sp)
1102	 ld	$t5,`$FRAME+40`($sp)
1103	fmadd	$T0a,$N0,$na,$T0a
1104	fmadd	$T0b,$N0,$nb,$T0b
1105	 srdi	$carry,$t0,16
1106	 add	$t1,$t1,$carry
1107	 srdi	$carry,$t1,16
1108	 ld	$t6,`$FRAME+48`($sp)
1109	 ld	$t7,`$FRAME+56`($sp)
1110
1111	fmadd	$T1a,$N0,$nc,$T1a
1112	fmadd	$T1b,$N0,$nd,$T1b
1113	 insrdi	$t0,$t1,16,32
1114	 ld	$t1,8($tp)		; tp[j]
1115	fmadd	$T2a,$N1,$nc,$T2a
1116	fmadd	$T2b,$N1,$nd,$T2b
1117	 add	$t2,$t2,$carry
1118	fmadd	$T3a,$N2,$nc,$T3a
1119	fmadd	$T3b,$N2,$nd,$T3b
1120	 srdi	$carry,$t2,16
1121	 insrdi	$t0,$t2,16,16
1122	fmadd	$dota,$N3,$nc,$dota
1123	fmadd	$dotb,$N3,$nd,$dotb
1124	 add	$t3,$t3,$carry
1125	 ldu	$t2,16($tp)		; tp[j+1]
1126	 srdi	$carry,$t3,16
1127	 insrdi	$t0,$t3,16,0		; 0..63 bits
1128	 add	$t4,$t4,$carry
1129
1130	fctid	$T0a,$T0a
1131	fctid	$T0b,$T0b
1132	 srdi	$carry,$t4,16
1133	fctid	$T1a,$T1a
1134	fctid	$T1b,$T1b
1135	 add	$t5,$t5,$carry
1136	fctid	$T2a,$T2a
1137	fctid	$T2b,$T2b
1138	 srdi	$carry,$t5,16
1139	 insrdi	$t4,$t5,16,32
1140	fctid	$T3a,$T3a
1141	fctid	$T3b,$T3b
1142	 add	$t6,$t6,$carry
1143	 srdi	$carry,$t6,16
1144	 insrdi	$t4,$t6,16,16
1145
1146	stfd	$T0a,`$FRAME+0`($sp)
1147	stfd	$T0b,`$FRAME+8`($sp)
1148	 add	$t7,$t7,$carry
1149	 addc	$t3,$t0,$t1
1150___
1151$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
1152	extrdi	$t0,$t0,32,0
1153	extrdi	$t1,$t1,32,0
1154	adde	$t0,$t0,$t1
1155___
1156$code.=<<___;
1157	stfd	$T1a,`$FRAME+16`($sp)
1158	stfd	$T1b,`$FRAME+24`($sp)
1159	 insrdi	$t4,$t7,16,0		; 64..127 bits
1160	 srdi	$carry,$t7,16		; upper 33 bits
1161	stfd	$T2a,`$FRAME+32`($sp)
1162	stfd	$T2b,`$FRAME+40`($sp)
1163	 adde	$t5,$t4,$t2
1164___
1165$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
1166	extrdi	$t4,$t4,32,0
1167	extrdi	$t2,$t2,32,0
1168	adde	$t4,$t4,$t2
1169___
1170$code.=<<___;
1171	stfd	$T3a,`$FRAME+48`($sp)
1172	stfd	$T3b,`$FRAME+56`($sp)
1173	 addze	$carry,$carry
1174	 std	$t3,-16($tp)		; tp[j-1]
1175	 std	$t5,-8($tp)		; tp[j]
1176___
1177} else {
1178$code.=<<___;
1179	fmadd	$T1a,$N1,$na,$T1a
1180	fmadd	$T1b,$N1,$nb,$T1b
1181	 lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
1182	 lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
1183	fmadd	$T2a,$N2,$na,$T2a
1184	fmadd	$T2b,$N2,$nb,$T2b
1185	 lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
1186	 lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
1187	fmadd	$T3a,$N3,$na,$T3a
1188	fmadd	$T3b,$N3,$nb,$T3b
1189	 lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
1190	 lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
1191	 addc	$t0,$t0,$carry
1192	 adde	$t1,$t1,$c1
1193	 srwi	$carry,$t0,16
1194	fmadd	$T0a,$N0,$na,$T0a
1195	fmadd	$T0b,$N0,$nb,$T0b
1196	 lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
1197	 lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
1198	 srwi	$c1,$t1,16
1199	 insrwi	$carry,$t1,16,0
1200
1201	fmadd	$T1a,$N0,$nc,$T1a
1202	fmadd	$T1b,$N0,$nd,$T1b
1203	 addc	$t2,$t2,$carry
1204	 adde	$t3,$t3,$c1
1205	 srwi	$carry,$t2,16
1206	fmadd	$T2a,$N1,$nc,$T2a
1207	fmadd	$T2b,$N1,$nd,$T2b
1208	 insrwi	$t0,$t2,16,0		; 0..31 bits
1209	 srwi	$c1,$t3,16
1210	 insrwi	$carry,$t3,16,0
1211	fmadd	$T3a,$N2,$nc,$T3a
1212	fmadd	$T3b,$N2,$nd,$T3b
1213	 lwz	$t2,12($tp)		; tp[j]
1214	 lwz	$t3,8($tp)
1215	 addc	$t4,$t4,$carry
1216	 adde	$t5,$t5,$c1
1217	 srwi	$carry,$t4,16
1218	fmadd	$dota,$N3,$nc,$dota
1219	fmadd	$dotb,$N3,$nd,$dotb
1220	 srwi	$c1,$t5,16
1221	 insrwi	$carry,$t5,16,0
1222
1223	fctid	$T0a,$T0a
1224	 addc	$t6,$t6,$carry
1225	 adde	$t7,$t7,$c1
1226	 srwi	$carry,$t6,16
1227	fctid	$T0b,$T0b
1228	 insrwi	$t4,$t6,16,0		; 32..63 bits
1229	 srwi	$c1,$t7,16
1230	 insrwi	$carry,$t7,16,0
1231	fctid	$T1a,$T1a
1232	 addc	$t0,$t0,$t2
1233	 adde	$t4,$t4,$t3
1234	 lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
1235	 lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
1236	fctid	$T1b,$T1b
1237	 addze	$carry,$carry
1238	 addze	$c1,$c1
1239	 stw	$t0,4($tp)		; tp[j-1]
1240	 stw	$t4,0($tp)
1241	fctid	$T2a,$T2a
1242	 addc	$t2,$t2,$carry
1243	 adde	$t3,$t3,$c1
1244	 srwi	$carry,$t2,16
1245	 lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
1246	 lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
1247	fctid	$T2b,$T2b
1248	 srwi	$c1,$t3,16
1249	 insrwi	$carry,$t3,16,0
1250	 lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
1251	 lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
1252	fctid	$T3a,$T3a
1253	 addc	$t6,$t6,$carry
1254	 adde	$t7,$t7,$c1
1255	 srwi	$carry,$t6,16
1256	 lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
1257	 lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
1258	fctid	$T3b,$T3b
1259
1260	 insrwi	$t2,$t6,16,0		; 64..95 bits
1261	insrwi	$carry,$t7,16,0
1262	srwi	$c1,$t7,16
1263	 lwz	$t6,20($tp)
1264	 lwzu	$t7,16($tp)
1265	addc	$t0,$t0,$carry
1266	 stfd	$T0a,`$FRAME+0`($sp)
1267	adde	$t1,$t1,$c1
1268	srwi	$carry,$t0,16
1269	 stfd	$T0b,`$FRAME+8`($sp)
1270	insrwi	$carry,$t1,16,0
1271	srwi	$c1,$t1,16
1272	addc	$t4,$t4,$carry
1273	 stfd	$T1a,`$FRAME+16`($sp)
1274	adde	$t5,$t5,$c1
1275	srwi	$carry,$t4,16
1276	 insrwi	$t0,$t4,16,0		; 96..127 bits
1277	 stfd	$T1b,`$FRAME+24`($sp)
1278	insrwi	$carry,$t5,16,0
1279	srwi	$c1,$t5,16
1280
1281	addc	$t2,$t2,$t6
1282	 stfd	$T2a,`$FRAME+32`($sp)
1283	adde	$t0,$t0,$t7
1284	 stfd	$T2b,`$FRAME+40`($sp)
1285	addze	$carry,$carry
1286	 stfd	$T3a,`$FRAME+48`($sp)
1287	addze	$c1,$c1
1288	 stfd	$T3b,`$FRAME+56`($sp)
1289	 stw	$t2,-4($tp)		; tp[j]
1290	 stw	$t0,-8($tp)
1291___
1292}
1293$code.=<<___;
1294	bdnz	Linner
1295
1296	fctid	$dota,$dota
1297	fctid	$dotb,$dotb
1298___
1299if ($SIZE_T==8 or $flavour =~ /osx/) {
1300$code.=<<___;
1301	ld	$t0,`$FRAME+0`($sp)
1302	ld	$t1,`$FRAME+8`($sp)
1303	ld	$t2,`$FRAME+16`($sp)
1304	ld	$t3,`$FRAME+24`($sp)
1305	ld	$t4,`$FRAME+32`($sp)
1306	ld	$t5,`$FRAME+40`($sp)
1307	ld	$t6,`$FRAME+48`($sp)
1308	ld	$t7,`$FRAME+56`($sp)
1309	stfd	$dota,`$FRAME+64`($sp)
1310	stfd	$dotb,`$FRAME+72`($sp)
1311
1312	add	$t0,$t0,$carry		; can not overflow
1313	srdi	$carry,$t0,16
1314	add	$t1,$t1,$carry
1315	srdi	$carry,$t1,16
1316	insrdi	$t0,$t1,16,32
1317	add	$t2,$t2,$carry
1318	ld	$t1,8($tp)		; tp[j]
1319	srdi	$carry,$t2,16
1320	insrdi	$t0,$t2,16,16
1321	add	$t3,$t3,$carry
1322	ldu	$t2,16($tp)		; tp[j+1]
1323	srdi	$carry,$t3,16
1324	insrdi	$t0,$t3,16,0		; 0..63 bits
1325	add	$t4,$t4,$carry
1326	srdi	$carry,$t4,16
1327	add	$t5,$t5,$carry
1328	srdi	$carry,$t5,16
1329	insrdi	$t4,$t5,16,32
1330	add	$t6,$t6,$carry
1331	srdi	$carry,$t6,16
1332	insrdi	$t4,$t6,16,16
1333	add	$t7,$t7,$carry
1334	insrdi	$t4,$t7,16,0		; 64..127 bits
1335	srdi	$carry,$t7,16		; upper 33 bits
1336	ld	$t6,`$FRAME+64`($sp)
1337	ld	$t7,`$FRAME+72`($sp)
1338
1339	addc	$t3,$t0,$t1
1340___
1341$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
1342	extrdi	$t0,$t0,32,0
1343	extrdi	$t1,$t1,32,0
1344	adde	$t0,$t0,$t1
1345___
1346$code.=<<___;
1347	adde	$t5,$t4,$t2
1348___
1349$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
1350	extrdi	$t4,$t4,32,0
1351	extrdi	$t2,$t2,32,0
1352	adde	$t4,$t4,$t2
1353___
1354$code.=<<___;
1355	addze	$carry,$carry
1356
1357	std	$t3,-16($tp)		; tp[j-1]
1358	std	$t5,-8($tp)		; tp[j]
1359
1360	add	$carry,$carry,$ovf	; consume upmost overflow
1361	add	$t6,$t6,$carry		; can not overflow
1362	srdi	$carry,$t6,16
1363	add	$t7,$t7,$carry
1364	insrdi	$t6,$t7,48,0
1365	srdi	$ovf,$t7,48
1366	std	$t6,0($tp)		; tp[num-1]
1367___
1368} else {
1369$code.=<<___;
1370	lwz	$t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
1371	lwz	$t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
1372	lwz	$t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
1373	lwz	$t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
1374	lwz	$t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
1375	lwz	$t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
1376	lwz	$t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
1377	lwz	$t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
1378	stfd	$dota,`$FRAME+64`($sp)
1379	stfd	$dotb,`$FRAME+72`($sp)
1380
1381	addc	$t0,$t0,$carry
1382	adde	$t1,$t1,$c1
1383	srwi	$carry,$t0,16
1384	insrwi	$carry,$t1,16,0
1385	srwi	$c1,$t1,16
1386	addc	$t2,$t2,$carry
1387	adde	$t3,$t3,$c1
1388	srwi	$carry,$t2,16
1389	 insrwi	$t0,$t2,16,0		; 0..31 bits
1390	 lwz	$t2,12($tp)		; tp[j]
1391	insrwi	$carry,$t3,16,0
1392	srwi	$c1,$t3,16
1393	 lwz	$t3,8($tp)
1394	addc	$t4,$t4,$carry
1395	adde	$t5,$t5,$c1
1396	srwi	$carry,$t4,16
1397	insrwi	$carry,$t5,16,0
1398	srwi	$c1,$t5,16
1399	addc	$t6,$t6,$carry
1400	adde	$t7,$t7,$c1
1401	srwi	$carry,$t6,16
1402	 insrwi	$t4,$t6,16,0		; 32..63 bits
1403	insrwi	$carry,$t7,16,0
1404	srwi	$c1,$t7,16
1405
1406	addc	$t0,$t0,$t2
1407	adde	$t4,$t4,$t3
1408	addze	$carry,$carry
1409	addze	$c1,$c1
1410	 stw	$t0,4($tp)		; tp[j-1]
1411	 stw	$t4,0($tp)
1412
1413	lwz	$t3,`$FRAME+32^$LITTLE_ENDIAN`($sp)	; permuted $t1
1414	lwz	$t2,`$FRAME+36^$LITTLE_ENDIAN`($sp)	; permuted $t0
1415	lwz	$t7,`$FRAME+40^$LITTLE_ENDIAN`($sp)	; permuted $t3
1416	lwz	$t6,`$FRAME+44^$LITTLE_ENDIAN`($sp)	; permuted $t2
1417	lwz	$t1,`$FRAME+48^$LITTLE_ENDIAN`($sp)	; permuted $t5
1418	lwz	$t0,`$FRAME+52^$LITTLE_ENDIAN`($sp)	; permuted $t4
1419	lwz	$t5,`$FRAME+56^$LITTLE_ENDIAN`($sp)	; permuted $t7
1420	lwz	$t4,`$FRAME+60^$LITTLE_ENDIAN`($sp)	; permuted $t6
1421
1422	addc	$t2,$t2,$carry
1423	adde	$t3,$t3,$c1
1424	srwi	$carry,$t2,16
1425	insrwi	$carry,$t3,16,0
1426	srwi	$c1,$t3,16
1427	addc	$t6,$t6,$carry
1428	adde	$t7,$t7,$c1
1429	srwi	$carry,$t6,16
1430	 insrwi	$t2,$t6,16,0		; 64..95 bits
1431	 lwz	$t6,20($tp)
1432	insrwi	$carry,$t7,16,0
1433	srwi	$c1,$t7,16
1434	 lwzu	$t7,16($tp)
1435	addc	$t0,$t0,$carry
1436	adde	$t1,$t1,$c1
1437	srwi	$carry,$t0,16
1438	insrwi	$carry,$t1,16,0
1439	srwi	$c1,$t1,16
1440	addc	$t4,$t4,$carry
1441	adde	$t5,$t5,$c1
1442	srwi	$carry,$t4,16
1443	 insrwi	$t0,$t4,16,0		; 96..127 bits
1444	insrwi	$carry,$t5,16,0
1445	srwi	$c1,$t5,16
1446
1447	addc	$t2,$t2,$t6
1448	adde	$t0,$t0,$t7
1449	 lwz	$t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
1450	 lwz	$t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
1451	addze	$carry,$carry
1452	addze	$c1,$c1
1453	 lwz	$t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
1454	 lwz	$t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
1455
1456	addc	$t6,$t6,$carry
1457	adde	$t7,$t7,$c1
1458	 stw	$t2,-4($tp)		; tp[j]
1459	 stw	$t0,-8($tp)
1460	addc	$t6,$t6,$ovf
1461	addze	$t7,$t7
1462	srwi	$carry,$t6,16
1463	insrwi	$carry,$t7,16,0
1464	srwi	$c1,$t7,16
1465	addc	$t4,$t4,$carry
1466	adde	$t5,$t5,$c1
1467
1468	insrwi	$t6,$t4,16,0
1469	srwi	$t4,$t4,16
1470	insrwi	$t4,$t5,16,0
1471	srwi	$ovf,$t5,16
1472	stw	$t6,4($tp)		; tp[num-1]
1473	stw	$t4,0($tp)
1474___
1475}
1476$code.=<<___;
1477	slwi	$t7,$num,2
1478	addi	$i,$i,8
1479	subf	$nap_d,$t7,$nap_d	; rewind pointer
1480	cmpw	$i,$num
1481	blt-	Louter
1482___
1483
1484$code.=<<___ if ($SIZE_T==8);
1485	subf	$np,$num,$np	; rewind np
1486	addi	$j,$j,1		; restore counter
1487	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
1488	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
1489	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
1490	addi	$t5,$np,8
1491	addi	$t6,$rp,8
1492	mtctr	$j
1493
1494.align	4
1495Lsub:	ldx	$t0,$tp,$i
1496	ldx	$t1,$np,$i
1497	ldx	$t2,$t4,$i
1498	ldx	$t3,$t5,$i
1499	subfe	$t0,$t1,$t0	; tp[j]-np[j]
1500	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
1501	stdx	$t0,$rp,$i
1502	stdx	$t2,$t6,$i
1503	addi	$i,$i,16
1504	bdnz	Lsub
1505
1506	li	$i,0
1507	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
1508	mtctr	$j
1509
1510.align	4
1511Lcopy:				; conditional copy
1512	ldx	$t0,$tp,$i
1513	ldx	$t1,$t4,$i
1514	ldx	$t2,$rp,$i
1515	ldx	$t3,$t6,$i
1516	std	$i,8($nap_d)	; zap nap_d
1517	std	$i,16($nap_d)
1518	std	$i,24($nap_d)
1519	std	$i,32($nap_d)
1520	std	$i,40($nap_d)
1521	std	$i,48($nap_d)
1522	std	$i,56($nap_d)
1523	stdu	$i,64($nap_d)
1524	and	$t0,$t0,$ovf
1525	and	$t1,$t1,$ovf
1526	andc	$t2,$t2,$ovf
1527	andc	$t3,$t3,$ovf
1528	or	$t0,$t0,$t2
1529	or	$t1,$t1,$t3
1530	stdx	$t0,$rp,$i
1531	stdx	$t1,$t6,$i
1532	stdx	$i,$tp,$i	; zap tp at once
1533	stdx	$i,$t4,$i
1534	addi	$i,$i,16
1535	bdnz	Lcopy
1536___
1537$code.=<<___ if ($SIZE_T==4);
1538	subf	$np,$num,$np	; rewind np
1539	addi	$j,$j,1		; restore counter
1540	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
1541	addi	$tp,$sp,`$FRAME+$TRANSFER`
1542	addi	$np,$np,-4
1543	addi	$rp,$rp,-4
1544	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
1545	mtctr	$j
1546
1547.align	4
1548Lsub:	lwz	$t0,12($tp)	; load tp[j..j+3] in 64-bit word order
1549	lwz	$t1,8($tp)
1550	lwz	$t2,20($tp)
1551	lwzu	$t3,16($tp)
1552	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
1553	lwz	$t5,8($np)
1554	lwz	$t6,12($np)
1555	lwzu	$t7,16($np)
1556	subfe	$t4,$t4,$t0	; tp[j]-np[j]
1557	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
1558	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
1559	 stw	$t1,8($ap)
1560	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
1561	 stw	$t2,12($ap)
1562	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
1563	 stwu	$t3,16($ap)
1564	stw	$t4,4($rp)
1565	stw	$t5,8($rp)
1566	stw	$t6,12($rp)
1567	stwu	$t7,16($rp)
1568	bdnz	Lsub
1569
1570	li	$i,0
1571	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
1572	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
1573	subf	$rp,$num,$rp	; rewind rp
1574	addi	$tp,$sp,`$FRAME+$TRANSFER`
1575	mtctr	$j
1576
1577.align	4
1578Lcopy:				; conditional copy
1579	lwz	$t0,4($ap)
1580	lwz	$t1,8($ap)
1581	lwz	$t2,12($ap)
1582	lwzu	$t3,16($ap)
1583	lwz	$t4,4($rp)
1584	lwz	$t5,8($rp)
1585	lwz	$t6,12($rp)
1586	lwz	$t7,16($rp)
1587	std	$i,8($nap_d)	; zap nap_d
1588	std	$i,16($nap_d)
1589	std	$i,24($nap_d)
1590	std	$i,32($nap_d)
1591	std	$i,40($nap_d)
1592	std	$i,48($nap_d)
1593	std	$i,56($nap_d)
1594	stdu	$i,64($nap_d)
1595	and	$t0,$t0,$ovf
1596	and	$t1,$t1,$ovf
1597	and	$t2,$t2,$ovf
1598	and	$t3,$t3,$ovf
1599	andc	$t4,$t4,$ovf
1600	andc	$t5,$t5,$ovf
1601	andc	$t6,$t6,$ovf
1602	andc	$t7,$t7,$ovf
1603	or	$t0,$t0,$t4
1604	or	$t1,$t1,$t5
1605	or	$t2,$t2,$t6
1606	or	$t3,$t3,$t7
1607	stw	$t0,4($rp)
1608	stw	$t1,8($rp)
1609	stw	$t2,12($rp)
1610	stwu	$t3,16($rp)
1611	std	$i,8($tp)	; zap tp at once
1612	stdu	$i,16($tp)
1613	bdnz	Lcopy
1614___
1615
1616$code.=<<___;
1617	$POP	$i,0($sp)
1618	li	r3,1	; signal "handled"
1619	$POP	r19,`-12*8-13*$SIZE_T`($i)
1620	$POP	r20,`-12*8-12*$SIZE_T`($i)
1621	$POP	r21,`-12*8-11*$SIZE_T`($i)
1622	$POP	r22,`-12*8-10*$SIZE_T`($i)
1623	$POP	r23,`-12*8-9*$SIZE_T`($i)
1624	$POP	r24,`-12*8-8*$SIZE_T`($i)
1625	$POP	r25,`-12*8-7*$SIZE_T`($i)
1626	$POP	r26,`-12*8-6*$SIZE_T`($i)
1627	$POP	r27,`-12*8-5*$SIZE_T`($i)
1628	$POP	r28,`-12*8-4*$SIZE_T`($i)
1629	$POP	r29,`-12*8-3*$SIZE_T`($i)
1630	$POP	r30,`-12*8-2*$SIZE_T`($i)
1631	$POP	r31,`-12*8-1*$SIZE_T`($i)
1632	lfd	f20,`-12*8`($i)
1633	lfd	f21,`-11*8`($i)
1634	lfd	f22,`-10*8`($i)
1635	lfd	f23,`-9*8`($i)
1636	lfd	f24,`-8*8`($i)
1637	lfd	f25,`-7*8`($i)
1638	lfd	f26,`-6*8`($i)
1639	lfd	f27,`-5*8`($i)
1640	lfd	f28,`-4*8`($i)
1641	lfd	f29,`-3*8`($i)
1642	lfd	f30,`-2*8`($i)
1643	lfd	f31,`-1*8`($i)
1644	mr	$sp,$i
1645	blr
1646	.long	0
1647	.byte	0,12,4,0,0x8c,13,6,0
1648	.long	0
1649.size	.$fname,.-.$fname
1650
1651.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
1652___
1653
1654$code =~ s/\`([^\`]*)\`/eval $1/gem;
1655print $code;
1656close STDOUT or die "error closing STDOUT: $!";
1657