xref: /openssl/crypto/bn/asm/parisc-mont.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# On PA-7100LC this module performs ~90-50% better, less for longer
18# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
19# that compiler utilized xmpyu instruction to perform 32x32=64-bit
20# multiplication, which in turn means that "baseline" performance was
21# optimal in respect to instruction set capabilities. Fair comparison
22# with vendor compiler is problematic, because OpenSSL doesn't define
23# BN_LLONG [presumably] for historical reasons, which drives compiler
24# toward 4 times 16x16=32-bit multiplications [plus complementary
25# shifts and additions] instead. This means that you should observe
26# several times improvement over code generated by vendor compiler
27# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
28# improvement coefficient was never collected on PA-7100LC, or any
29# other 1.1 CPU, because I don't have access to such machine with
30# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
31# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
32# of ~5x on PA-8600.
33#
34# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
35# reportedly ~2x faster than vendor compiler generated code [according
36# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
37# this implementation is actually 32-bit one, in the sense that it
38# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
39# 64-bit BN_LONGs... How do they interoperate then? No problem. This
40# module picks halves of 64-bit values in reverse order and pretends
41# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
42# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
43# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
44# i.e. there is no "wider" multiplication like on most other 64-bit
45# platforms. This means that even being effectively 32-bit, this
46# implementation performs "64-bit" computational task in same amount
47# of arithmetic operations, most notably multiplications. It requires
48# more memory references, most notably to tp[num], but this doesn't
49# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
50# 2.0 code path provides virtually same performance as pa-risc2[W].s:
51# it's ~10% better for shortest key length and ~10% worse for longest
52# one.
53#
54# In case it wasn't clear. The module has two distinct code paths:
55# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
56# additions and 64-bit integer loads, not to mention specific
57# instruction scheduling. In 64-bit build naturally only 2.0 code path
58# is assembled. In 32-bit application context both code paths are
59# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
60# is taken automatically. Also, in 32-bit build the module imposes
61# couple of limitations: vector lengths has to be even and vector
62# addresses has to be 64-bit aligned. Normally neither is a problem:
63# most common key lengths are even and vectors are commonly malloc-ed,
64# which ensures alignment.
65#
66# Special thanks to polarhome.com for providing HP-UX account on
67# PA-RISC 1.1 machine, and to correspondent who chose to remain
68# anonymous for testing the code on PA-RISC 2.0 machine.
69
70$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71
72# $output is the last argument if it looks like a file (it has an extension)
73# $flavour is the first argument if it doesn't look like a file
74$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
75$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
76
77$output and open STDOUT,">$output";
78
79if ($flavour =~ /64/) {
80	$LEVEL		="2.0W";
81	$SIZE_T		=8;
82	$FRAME_MARKER	=80;
83	$SAVED_RP	=16;
84	$PUSH		="std";
85	$PUSHMA		="std,ma";
86	$POP		="ldd";
87	$POPMB		="ldd,mb";
88	$BN_SZ		=$SIZE_T;
89} else {
90	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
91	$SIZE_T		=4;
92	$FRAME_MARKER	=48;
93	$SAVED_RP	=20;
94	$PUSH		="stw";
95	$PUSHMA		="stwm";
96	$POP		="ldw";
97	$POPMB		="ldwm";
98	$BN_SZ		=$SIZE_T;
99	if (open CONF,"<${dir}../../opensslconf.h") {
100	    while(<CONF>) {
101		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
102		    $BN_SZ=8;
103		    $LEVEL="2.0";
104		    last;
105		}
106	    }
107	    close CONF;
108	}
109}
110
111$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
112				#                [+ argument transfer]
113$LOCALS=$FRAME-$FRAME_MARKER;
114$FRAME+=32;			# local variables
115
116$tp="%r31";
117$ti1="%r29";
118$ti0="%r28";
119
120$rp="%r26";
121$ap="%r25";
122$bp="%r24";
123$np="%r23";
124$n0="%r22";	# passed through stack in 32-bit
125$num="%r21";	# passed through stack in 32-bit
126$idx="%r20";
127$arrsz="%r19";
128
129$nm1="%r7";
130$nm0="%r6";
131$ab1="%r5";
132$ab0="%r4";
133
134$fp="%r3";
135$hi1="%r2";
136$hi0="%r1";
137
138$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s
139
140$fm0="%fr4";	$fti=$fm0;
141$fbi="%fr5L";
142$fn0="%fr5R";
143$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
144$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
145
146$code=<<___;
147	.LEVEL	$LEVEL
148	.SPACE	\$TEXT\$
149	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
150
151	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
152	.ALIGN	64
153bn_mul_mont
154	.PROC
155	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
156	.ENTRY
157	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
158	$PUSHMA	%r3,$FRAME(%sp)
159	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
160	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
161	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
162	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
163	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
164	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
165	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
166	ldo	-$FRAME(%sp),$fp
167___
168$code.=<<___ if ($SIZE_T==4);
169	ldw	`-$FRAME_MARKER-4`($fp),$n0
170	ldw	`-$FRAME_MARKER-8`($fp),$num
171	nop
172	nop					; alignment
173___
174$code.=<<___ if ($BN_SZ==4);
175	comiclr,<=	6,$num,%r0		; are vectors long enough?
176	b		L\$abort
177	ldi		0,%r28			; signal "unhandled"
178	add,ev		%r0,$num,$num		; is $num even?
179	b		L\$abort
180	nop
181	or		$ap,$np,$ti1
182	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
183	b		L\$abort
184	nop
185	nop					; alignment
186	nop
187
188	fldws		0($n0),${fn0}
189	fldws,ma	4($bp),${fbi}		; bp[0]
190___
191$code.=<<___ if ($BN_SZ==8);
192	comib,>		3,$num,L\$abort		; are vectors long enough?
193	ldi		0,%r28			; signal "unhandled"
194	addl		$num,$num,$num		; I operate on 32-bit values
195
196	fldws		4($n0),${fn0}		; only low part of n0
197	fldws		4($bp),${fbi}		; bp[0] in flipped word order
198___
199$code.=<<___;
200	fldds		0($ap),${fai}		; ap[0,1]
201	fldds		0($np),${fni}		; np[0,1]
202
203	sh2addl		$num,%r0,$arrsz
204	ldi		31,$hi0
205	ldo		36($arrsz),$hi1		; space for tp[num+1]
206	andcm		$hi1,$hi0,$hi1		; align
207	addl		$hi1,%sp,%sp
208	$PUSH		$fp,-$SIZE_T(%sp)
209
210	ldo		`$LOCALS+16`($fp),$xfer
211	ldo		`$LOCALS+32+4`($fp),$tp
212
213	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
214	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
215	xmpyu		${fn0},${fab0}R,${fm0}
216
217	addl		$arrsz,$ap,$ap		; point at the end
218	addl		$arrsz,$np,$np
219	subi		0,$arrsz,$idx		; j=0
220	ldo		8($idx),$idx		; j++++
221
222	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
223	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
224	fstds		${fab0},-16($xfer)
225	fstds		${fnm0},-8($xfer)
226	fstds		${fab1},0($xfer)
227	fstds		${fnm1},8($xfer)
228	 flddx		$idx($ap),${fai}	; ap[2,3]
229	 flddx		$idx($np),${fni}	; np[2,3]
230___
231$code.=<<___ if ($BN_SZ==4);
232	mtctl		$hi0,%cr11		; $hi0 still holds 31
233	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
234	b		L\$parisc11
235	nop
236___
237$code.=<<___;					# PA-RISC 2.0 code-path
238	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
239	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
240	ldd		-16($xfer),$ab0
241	fstds		${fab0},-16($xfer)
242
243	extrd,u		$ab0,31,32,$hi0
244	extrd,u		$ab0,63,32,$ab0
245	ldd		-8($xfer),$nm0
246	fstds		${fnm0},-8($xfer)
247	 ldo		8($idx),$idx		; j++++
248	 addl		$ab0,$nm0,$nm0		; low part is discarded
249	 extrd,u	$nm0,31,32,$hi1
250
251L\$1st
252	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
253	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
254	ldd		0($xfer),$ab1
255	fstds		${fab1},0($xfer)
256	 addl		$hi0,$ab1,$ab1
257	 extrd,u	$ab1,31,32,$hi0
258	ldd		8($xfer),$nm1
259	fstds		${fnm1},8($xfer)
260	 extrd,u	$ab1,63,32,$ab1
261	 addl		$hi1,$nm1,$nm1
262	flddx		$idx($ap),${fai}	; ap[j,j+1]
263	flddx		$idx($np),${fni}	; np[j,j+1]
264	 addl		$ab1,$nm1,$nm1
265	 extrd,u	$nm1,31,32,$hi1
266
267	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
268	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
269	ldd		-16($xfer),$ab0
270	fstds		${fab0},-16($xfer)
271	 addl		$hi0,$ab0,$ab0
272	 extrd,u	$ab0,31,32,$hi0
273	ldd		-8($xfer),$nm0
274	fstds		${fnm0},-8($xfer)
275	 extrd,u	$ab0,63,32,$ab0
276	 addl		$hi1,$nm0,$nm0
277	stw		$nm1,-4($tp)		; tp[j-1]
278	 addl		$ab0,$nm0,$nm0
279	 stw,ma		$nm0,8($tp)		; tp[j-1]
280	addib,<>	8,$idx,L\$1st		; j++++
281	 extrd,u	$nm0,31,32,$hi1
282
283	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
284	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
285	ldd		0($xfer),$ab1
286	fstds		${fab1},0($xfer)
287	 addl		$hi0,$ab1,$ab1
288	 extrd,u	$ab1,31,32,$hi0
289	ldd		8($xfer),$nm1
290	fstds		${fnm1},8($xfer)
291	 extrd,u	$ab1,63,32,$ab1
292	 addl		$hi1,$nm1,$nm1
293	ldd		-16($xfer),$ab0
294	 addl		$ab1,$nm1,$nm1
295	ldd		-8($xfer),$nm0
296	 extrd,u	$nm1,31,32,$hi1
297
298	 addl		$hi0,$ab0,$ab0
299	 extrd,u	$ab0,31,32,$hi0
300	stw		$nm1,-4($tp)		; tp[j-1]
301	 extrd,u	$ab0,63,32,$ab0
302	 addl		$hi1,$nm0,$nm0
303	ldd		0($xfer),$ab1
304	 addl		$ab0,$nm0,$nm0
305	ldd,mb		8($xfer),$nm1
306	 extrd,u	$nm0,31,32,$hi1
307	stw,ma		$nm0,8($tp)		; tp[j-1]
308
309	ldo		-1($num),$num		; i--
310	subi		0,$arrsz,$idx		; j=0
311___
312$code.=<<___ if ($BN_SZ==4);
313	fldws,ma	4($bp),${fbi}		; bp[1]
314___
315$code.=<<___ if ($BN_SZ==8);
316	fldws		0($bp),${fbi}		; bp[1] in flipped word order
317___
318$code.=<<___;
319	 flddx		$idx($ap),${fai}	; ap[0,1]
320	 flddx		$idx($np),${fni}	; np[0,1]
321	 fldws		8($xfer),${fti}R	; tp[0]
322	addl		$hi0,$ab1,$ab1
323	 extrd,u	$ab1,31,32,$hi0
324	 extrd,u	$ab1,63,32,$ab1
325	 ldo		8($idx),$idx		; j++++
326	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
327	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
328	addl		$hi1,$nm1,$nm1
329	addl		$ab1,$nm1,$nm1
330	extrd,u		$nm1,31,32,$hi1
331	 fstws,mb	${fab0}L,-8($xfer)	; save high part
332	stw		$nm1,-4($tp)		; tp[j-1]
333
334	 fcpy,sgl	%fr0,${fti}L		; zero high part
335	 fcpy,sgl	%fr0,${fab0}L
336	addl		$hi1,$hi0,$hi0
337	extrd,u		$hi0,31,32,$hi1
338	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
339	 fcnvxf,dbl,dbl	${fab0},${fab0}
340	stw		$hi0,0($tp)
341	stw		$hi1,4($tp)
342
343	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
344	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
345	xmpyu		${fn0},${fab0}R,${fm0}
346	ldo		`$LOCALS+32+4`($fp),$tp
347L\$outer
348	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
349	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
350	fstds		${fab0},-16($xfer)	; 33-bit value
351	fstds		${fnm0},-8($xfer)
352	 flddx		$idx($ap),${fai}	; ap[2]
353	 flddx		$idx($np),${fni}	; np[2]
354	 ldo		8($idx),$idx		; j++++
355	ldd		-16($xfer),$ab0		; 33-bit value
356	ldd		-8($xfer),$nm0
357	ldw		0($xfer),$hi0		; high part
358
359	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
360	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
361	 extrd,u	$ab0,31,32,$ti0		; carry bit
362	 extrd,u	$ab0,63,32,$ab0
363	fstds		${fab1},0($xfer)
364	 addl		$ti0,$hi0,$hi0		; account carry bit
365	fstds		${fnm1},8($xfer)
366	 addl		$ab0,$nm0,$nm0		; low part is discarded
367	ldw		0($tp),$ti1		; tp[1]
368	 extrd,u	$nm0,31,32,$hi1
369	fstds		${fab0},-16($xfer)
370	fstds		${fnm0},-8($xfer)
371
372L\$inner
373	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
374	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
375	ldd		0($xfer),$ab1
376	fstds		${fab1},0($xfer)
377	 addl		$hi0,$ti1,$ti1
378	 addl		$ti1,$ab1,$ab1
379	ldd		8($xfer),$nm1
380	fstds		${fnm1},8($xfer)
381	 extrd,u	$ab1,31,32,$hi0
382	 extrd,u	$ab1,63,32,$ab1
383	flddx		$idx($ap),${fai}	; ap[j,j+1]
384	flddx		$idx($np),${fni}	; np[j,j+1]
385	 addl		$hi1,$nm1,$nm1
386	 addl		$ab1,$nm1,$nm1
387	ldw		4($tp),$ti0		; tp[j]
388	stw		$nm1,-4($tp)		; tp[j-1]
389
390	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
391	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
392	ldd		-16($xfer),$ab0
393	fstds		${fab0},-16($xfer)
394	 addl		$hi0,$ti0,$ti0
395	 addl		$ti0,$ab0,$ab0
396	ldd		-8($xfer),$nm0
397	fstds		${fnm0},-8($xfer)
398	 extrd,u	$ab0,31,32,$hi0
399	 extrd,u	$nm1,31,32,$hi1
400	ldw		8($tp),$ti1		; tp[j]
401	 extrd,u	$ab0,63,32,$ab0
402	 addl		$hi1,$nm0,$nm0
403	 addl		$ab0,$nm0,$nm0
404	 stw,ma		$nm0,8($tp)		; tp[j-1]
405	addib,<>	8,$idx,L\$inner		; j++++
406	 extrd,u	$nm0,31,32,$hi1
407
408	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
409	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
410	ldd		0($xfer),$ab1
411	fstds		${fab1},0($xfer)
412	 addl		$hi0,$ti1,$ti1
413	 addl		$ti1,$ab1,$ab1
414	ldd		8($xfer),$nm1
415	fstds		${fnm1},8($xfer)
416	 extrd,u	$ab1,31,32,$hi0
417	 extrd,u	$ab1,63,32,$ab1
418	ldw		4($tp),$ti0		; tp[j]
419	 addl		$hi1,$nm1,$nm1
420	 addl		$ab1,$nm1,$nm1
421	ldd		-16($xfer),$ab0
422	ldd		-8($xfer),$nm0
423	 extrd,u	$nm1,31,32,$hi1
424
425	addl		$hi0,$ab0,$ab0
426	 addl		$ti0,$ab0,$ab0
427	 stw		$nm1,-4($tp)		; tp[j-1]
428	 extrd,u	$ab0,31,32,$hi0
429	ldw		8($tp),$ti1		; tp[j]
430	 extrd,u	$ab0,63,32,$ab0
431	 addl		$hi1,$nm0,$nm0
432	ldd		0($xfer),$ab1
433	 addl		$ab0,$nm0,$nm0
434	ldd,mb		8($xfer),$nm1
435	 extrd,u	$nm0,31,32,$hi1
436	 stw,ma		$nm0,8($tp)		; tp[j-1]
437
438	addib,=		-1,$num,L\$outerdone	; i--
439	subi		0,$arrsz,$idx		; j=0
440___
441$code.=<<___ if ($BN_SZ==4);
442	fldws,ma	4($bp),${fbi}		; bp[i]
443___
444$code.=<<___ if ($BN_SZ==8);
445	ldi		12,$ti0			; bp[i] in flipped word order
446	addl,ev		%r0,$num,$num
447	ldi		-4,$ti0
448	addl		$ti0,$bp,$bp
449	fldws		0($bp),${fbi}
450___
451$code.=<<___;
452	 flddx		$idx($ap),${fai}	; ap[0]
453	addl		$hi0,$ab1,$ab1
454	 flddx		$idx($np),${fni}	; np[0]
455	 fldws		8($xfer),${fti}R	; tp[0]
456	addl		$ti1,$ab1,$ab1
457	extrd,u		$ab1,31,32,$hi0
458	extrd,u		$ab1,63,32,$ab1
459
460	 ldo		8($idx),$idx		; j++++
461	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
462	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
463	ldw		4($tp),$ti0		; tp[j]
464
465	addl		$hi1,$nm1,$nm1
466	 fstws,mb	${fab0}L,-8($xfer)	; save high part
467	addl		$ab1,$nm1,$nm1
468	extrd,u		$nm1,31,32,$hi1
469	 fcpy,sgl	%fr0,${fti}L		; zero high part
470	 fcpy,sgl	%fr0,${fab0}L
471	stw		$nm1,-4($tp)		; tp[j-1]
472
473	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
474	 fcnvxf,dbl,dbl	${fab0},${fab0}
475	addl		$hi1,$hi0,$hi0
476	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
477	addl		$ti0,$hi0,$hi0
478	extrd,u		$hi0,31,32,$hi1
479	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
480	stw		$hi0,0($tp)
481	stw		$hi1,4($tp)
482	 xmpyu		${fn0},${fab0}R,${fm0}
483
484	b		L\$outer
485	ldo		`$LOCALS+32+4`($fp),$tp
486
487L\$outerdone
488	addl		$hi0,$ab1,$ab1
489	addl		$ti1,$ab1,$ab1
490	extrd,u		$ab1,31,32,$hi0
491	extrd,u		$ab1,63,32,$ab1
492
493	ldw		4($tp),$ti0		; tp[j]
494
495	addl		$hi1,$nm1,$nm1
496	addl		$ab1,$nm1,$nm1
497	extrd,u		$nm1,31,32,$hi1
498	stw		$nm1,-4($tp)		; tp[j-1]
499
500	addl		$hi1,$hi0,$hi0
501	addl		$ti0,$hi0,$hi0
502	extrd,u		$hi0,31,32,$hi1
503	stw		$hi0,0($tp)
504	stw		$hi1,4($tp)
505
506	ldo		`$LOCALS+32`($fp),$tp
507	sub		%r0,%r0,%r0		; clear borrow
508___
509$code.=<<___ if ($BN_SZ==4);
510	ldws,ma		4($tp),$ti0
511	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
512	b		L\$sub_pa11
513	addl		$tp,$arrsz,$tp
514L\$sub
515	ldwx		$idx($np),$hi0
516	subb		$ti0,$hi0,$hi1
517	ldwx		$idx($tp),$ti0
518	addib,<>	4,$idx,L\$sub
519	stws,ma		$hi1,4($rp)
520
521	subb		$ti0,%r0,$hi1
522___
523$code.=<<___ if ($BN_SZ==8);
524	ldd,ma		8($tp),$ti0
525L\$sub
526	ldd		$idx($np),$hi0
527	shrpd		$ti0,$ti0,32,$ti0	; flip word order
528	std		$ti0,-8($tp)		; save flipped value
529	sub,db		$ti0,$hi0,$hi1
530	ldd,ma		8($tp),$ti0
531	addib,<>	8,$idx,L\$sub
532	std,ma		$hi1,8($rp)
533
534	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
535	sub,db		$ti0,%r0,$hi1
536___
537$code.=<<___;
538	ldo		`$LOCALS+32`($fp),$tp
539	sub		$rp,$arrsz,$rp		; rewind rp
540	subi		0,$arrsz,$idx
541L\$copy
542	ldd		0($tp),$ti0
543	ldd		0($rp),$hi0
544	std,ma		%r0,8($tp)
545	comiclr,=	0,$hi1,%r0
546	copy		$ti0,$hi0
547	addib,<>	8,$idx,L\$copy
548	std,ma		$hi0,8($rp)
549___
550
551if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
552$ablo=$ab0;
553$abhi=$ab1;
554$nmlo0=$nm0;
555$nmhi0=$nm1;
556$nmlo1="%r9";
557$nmhi1="%r8";
558
559$code.=<<___;
560	b		L\$done
561	nop
562
563	.ALIGN		8
564L\$parisc11
565	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
566	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
567	ldw		-12($xfer),$ablo
568	ldw		-16($xfer),$hi0
569	ldw		-4($xfer),$nmlo0
570	ldw		-8($xfer),$nmhi0
571	fstds		${fab0},-16($xfer)
572	fstds		${fnm0},-8($xfer)
573
574	 ldo		8($idx),$idx		; j++++
575	 add		$ablo,$nmlo0,$nmlo0	; discarded
576	 addc		%r0,$nmhi0,$hi1
577	ldw		4($xfer),$ablo
578	ldw		0($xfer),$abhi
579	nop
580
581L\$1st_pa11
582	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
583	flddx		$idx($ap),${fai}	; ap[j,j+1]
584	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
585	flddx		$idx($np),${fni}	; np[j,j+1]
586	 add		$hi0,$ablo,$ablo
587	ldw		12($xfer),$nmlo1
588	 addc		%r0,$abhi,$hi0
589	ldw		8($xfer),$nmhi1
590	 add		$ablo,$nmlo1,$nmlo1
591	fstds		${fab1},0($xfer)
592	 addc		%r0,$nmhi1,$nmhi1
593	fstds		${fnm1},8($xfer)
594	 add		$hi1,$nmlo1,$nmlo1
595	ldw		-12($xfer),$ablo
596	 addc		%r0,$nmhi1,$hi1
597	ldw		-16($xfer),$abhi
598
599	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
600	ldw		-4($xfer),$nmlo0
601	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
602	ldw		-8($xfer),$nmhi0
603	 add		$hi0,$ablo,$ablo
604	stw		$nmlo1,-4($tp)		; tp[j-1]
605	 addc		%r0,$abhi,$hi0
606	fstds		${fab0},-16($xfer)
607	 add		$ablo,$nmlo0,$nmlo0
608	fstds		${fnm0},-8($xfer)
609	 addc		%r0,$nmhi0,$nmhi0
610	ldw		0($xfer),$abhi
611	 add		$hi1,$nmlo0,$nmlo0
612	ldw		4($xfer),$ablo
613	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
614	addib,<>	8,$idx,L\$1st_pa11	; j++++
615	 addc		%r0,$nmhi0,$hi1
616
617	 ldw		8($xfer),$nmhi1
618	 ldw		12($xfer),$nmlo1
619	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
620	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
621	 add		$hi0,$ablo,$ablo
622	fstds		${fab1},0($xfer)
623	 addc		%r0,$abhi,$hi0
624	fstds		${fnm1},8($xfer)
625	 add		$ablo,$nmlo1,$nmlo1
626	ldw		-16($xfer),$abhi
627	 addc		%r0,$nmhi1,$nmhi1
628	ldw		-12($xfer),$ablo
629	 add		$hi1,$nmlo1,$nmlo1
630	ldw		-8($xfer),$nmhi0
631	 addc		%r0,$nmhi1,$hi1
632	ldw		-4($xfer),$nmlo0
633
634	 add		$hi0,$ablo,$ablo
635	stw		$nmlo1,-4($tp)		; tp[j-1]
636	 addc		%r0,$abhi,$hi0
637	ldw		0($xfer),$abhi
638	 add		$ablo,$nmlo0,$nmlo0
639	ldw		4($xfer),$ablo
640	 addc		%r0,$nmhi0,$nmhi0
641	ldws,mb		8($xfer),$nmhi1
642	 add		$hi1,$nmlo0,$nmlo0
643	ldw		4($xfer),$nmlo1
644	 addc		%r0,$nmhi0,$hi1
645	stws,ma		$nmlo0,8($tp)		; tp[j-1]
646
647	ldo		-1($num),$num		; i--
648	subi		0,$arrsz,$idx		; j=0
649
650	 fldws,ma	4($bp),${fbi}		; bp[1]
651	 flddx		$idx($ap),${fai}	; ap[0,1]
652	 flddx		$idx($np),${fni}	; np[0,1]
653	 fldws		8($xfer),${fti}R	; tp[0]
654	add		$hi0,$ablo,$ablo
655	addc		%r0,$abhi,$hi0
656	 ldo		8($idx),$idx		; j++++
657	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
658	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
659	add		$hi1,$nmlo1,$nmlo1
660	addc		%r0,$nmhi1,$nmhi1
661	add		$ablo,$nmlo1,$nmlo1
662	addc		%r0,$nmhi1,$hi1
663	 fstws,mb	${fab0}L,-8($xfer)	; save high part
664	stw		$nmlo1,-4($tp)		; tp[j-1]
665
666	 fcpy,sgl	%fr0,${fti}L		; zero high part
667	 fcpy,sgl	%fr0,${fab0}L
668	add		$hi1,$hi0,$hi0
669	addc		%r0,%r0,$hi1
670	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
671	 fcnvxf,dbl,dbl	${fab0},${fab0}
672	stw		$hi0,0($tp)
673	stw		$hi1,4($tp)
674
675	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
676	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
677	xmpyu		${fn0},${fab0}R,${fm0}
678	ldo		`$LOCALS+32+4`($fp),$tp
679L\$outer_pa11
680	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
681	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
682	fstds		${fab0},-16($xfer)	; 33-bit value
683	fstds		${fnm0},-8($xfer)
684	 flddx		$idx($ap),${fai}	; ap[2,3]
685	 flddx		$idx($np),${fni}	; np[2,3]
686	ldw		-16($xfer),$abhi	; carry bit actually
687	 ldo		8($idx),$idx		; j++++
688	ldw		-12($xfer),$ablo
689	ldw		-8($xfer),$nmhi0
690	ldw		-4($xfer),$nmlo0
691	ldw		0($xfer),$hi0		; high part
692
693	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
694	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
695	fstds		${fab1},0($xfer)
696	 addl		$abhi,$hi0,$hi0		; account carry bit
697	fstds		${fnm1},8($xfer)
698	 add		$ablo,$nmlo0,$nmlo0	; discarded
699	ldw		0($tp),$ti1		; tp[1]
700	 addc		%r0,$nmhi0,$hi1
701	fstds		${fab0},-16($xfer)
702	fstds		${fnm0},-8($xfer)
703	ldw		4($xfer),$ablo
704	ldw		0($xfer),$abhi
705
706L\$inner_pa11
707	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
708	flddx		$idx($ap),${fai}	; ap[j,j+1]
709	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
710	flddx		$idx($np),${fni}	; np[j,j+1]
711	 add		$hi0,$ablo,$ablo
712	ldw		4($tp),$ti0		; tp[j]
713	 addc		%r0,$abhi,$abhi
714	ldw		12($xfer),$nmlo1
715	 add		$ti1,$ablo,$ablo
716	ldw		8($xfer),$nmhi1
717	 addc		%r0,$abhi,$hi0
718	fstds		${fab1},0($xfer)
719	 add		$ablo,$nmlo1,$nmlo1
720	fstds		${fnm1},8($xfer)
721	 addc		%r0,$nmhi1,$nmhi1
722	ldw		-12($xfer),$ablo
723	 add		$hi1,$nmlo1,$nmlo1
724	ldw		-16($xfer),$abhi
725	 addc		%r0,$nmhi1,$hi1
726
727	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
728	ldw		8($tp),$ti1		; tp[j]
729	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
730	ldw		-4($xfer),$nmlo0
731	 add		$hi0,$ablo,$ablo
732	ldw		-8($xfer),$nmhi0
733	 addc		%r0,$abhi,$abhi
734	stw		$nmlo1,-4($tp)		; tp[j-1]
735	 add		$ti0,$ablo,$ablo
736	fstds		${fab0},-16($xfer)
737	 addc		%r0,$abhi,$hi0
738	fstds		${fnm0},-8($xfer)
739	 add		$ablo,$nmlo0,$nmlo0
740	ldw		4($xfer),$ablo
741	 addc		%r0,$nmhi0,$nmhi0
742	ldw		0($xfer),$abhi
743	 add		$hi1,$nmlo0,$nmlo0
744	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
745	addib,<>	8,$idx,L\$inner_pa11	; j++++
746	 addc		%r0,$nmhi0,$hi1
747
748	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
749	ldw		12($xfer),$nmlo1
750	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
751	ldw		8($xfer),$nmhi1
752	 add		$hi0,$ablo,$ablo
753	ldw		4($tp),$ti0		; tp[j]
754	 addc		%r0,$abhi,$abhi
755	fstds		${fab1},0($xfer)
756	 add		$ti1,$ablo,$ablo
757	fstds		${fnm1},8($xfer)
758	 addc		%r0,$abhi,$hi0
759	ldw		-16($xfer),$abhi
760	 add		$ablo,$nmlo1,$nmlo1
761	ldw		-12($xfer),$ablo
762	 addc		%r0,$nmhi1,$nmhi1
763	ldw		-8($xfer),$nmhi0
764	 add		$hi1,$nmlo1,$nmlo1
765	ldw		-4($xfer),$nmlo0
766	 addc		%r0,$nmhi1,$hi1
767
768	add		$hi0,$ablo,$ablo
769	 stw		$nmlo1,-4($tp)		; tp[j-1]
770	addc		%r0,$abhi,$abhi
771	 add		$ti0,$ablo,$ablo
772	ldw		8($tp),$ti1		; tp[j]
773	 addc		%r0,$abhi,$hi0
774	ldw		0($xfer),$abhi
775	 add		$ablo,$nmlo0,$nmlo0
776	ldw		4($xfer),$ablo
777	 addc		%r0,$nmhi0,$nmhi0
778	ldws,mb		8($xfer),$nmhi1
779	 add		$hi1,$nmlo0,$nmlo0
780	ldw		4($xfer),$nmlo1
781	 addc		%r0,$nmhi0,$hi1
782	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
783
784	addib,=		-1,$num,L\$outerdone_pa11; i--
785	subi		0,$arrsz,$idx		; j=0
786
787	 fldws,ma	4($bp),${fbi}		; bp[i]
788	 flddx		$idx($ap),${fai}	; ap[0]
789	add		$hi0,$ablo,$ablo
790	addc		%r0,$abhi,$abhi
791	 flddx		$idx($np),${fni}	; np[0]
792	 fldws		8($xfer),${fti}R	; tp[0]
793	add		$ti1,$ablo,$ablo
794	addc		%r0,$abhi,$hi0
795
796	 ldo		8($idx),$idx		; j++++
797	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
798	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
799	ldw		4($tp),$ti0		; tp[j]
800
801	add		$hi1,$nmlo1,$nmlo1
802	addc		%r0,$nmhi1,$nmhi1
803	 fstws,mb	${fab0}L,-8($xfer)	; save high part
804	add		$ablo,$nmlo1,$nmlo1
805	addc		%r0,$nmhi1,$hi1
806	 fcpy,sgl	%fr0,${fti}L		; zero high part
807	 fcpy,sgl	%fr0,${fab0}L
808	stw		$nmlo1,-4($tp)		; tp[j-1]
809
810	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
811	 fcnvxf,dbl,dbl	${fab0},${fab0}
812	add		$hi1,$hi0,$hi0
813	addc		%r0,%r0,$hi1
814	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
815	add		$ti0,$hi0,$hi0
816	addc		%r0,$hi1,$hi1
817	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
818	stw		$hi0,0($tp)
819	stw		$hi1,4($tp)
820	 xmpyu		${fn0},${fab0}R,${fm0}
821
822	b		L\$outer_pa11
823	ldo		`$LOCALS+32+4`($fp),$tp
824
825L\$outerdone_pa11
826	add		$hi0,$ablo,$ablo
827	addc		%r0,$abhi,$abhi
828	add		$ti1,$ablo,$ablo
829	addc		%r0,$abhi,$hi0
830
831	ldw		4($tp),$ti0		; tp[j]
832
833	add		$hi1,$nmlo1,$nmlo1
834	addc		%r0,$nmhi1,$nmhi1
835	add		$ablo,$nmlo1,$nmlo1
836	addc		%r0,$nmhi1,$hi1
837	stw		$nmlo1,-4($tp)		; tp[j-1]
838
839	add		$hi1,$hi0,$hi0
840	addc		%r0,%r0,$hi1
841	add		$ti0,$hi0,$hi0
842	addc		%r0,$hi1,$hi1
843	stw		$hi0,0($tp)
844	stw		$hi1,4($tp)
845
846	ldo		`$LOCALS+32+4`($fp),$tp
847	sub		%r0,%r0,%r0		; clear borrow
848	ldw		-4($tp),$ti0
849	addl		$tp,$arrsz,$tp
850L\$sub_pa11
851	ldwx		$idx($np),$hi0
852	subb		$ti0,$hi0,$hi1
853	ldwx		$idx($tp),$ti0
854	addib,<>	4,$idx,L\$sub_pa11
855	stws,ma		$hi1,4($rp)
856
857	subb		$ti0,%r0,$hi1
858
859	ldo		`$LOCALS+32`($fp),$tp
860	sub		$rp,$arrsz,$rp		; rewind rp
861	subi		0,$arrsz,$idx
862L\$copy_pa11
863	ldw		0($tp),$ti0
864	ldw		0($rp),$hi0
865	stws,ma		%r0,4($tp)
866	comiclr,=	0,$hi1,%r0
867	copy		$ti0,$hi0
868	addib,<>	4,$idx,L\$copy_pa11
869	stws,ma		$hi0,4($rp)
870
871	nop					; alignment
872L\$done
873___
874}
875
876$code.=<<___;
877	ldi		1,%r28			; signal "handled"
878	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
879
880	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
881	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
882	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
883	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
884	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
885	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
886	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
887	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
888L\$abort
889	bv	(%r2)
890	.EXIT
891	$POPMB	-$FRAME(%sp),%r3
892	.PROCEND
893	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
894___
895
896# Explicitly encode PA-RISC 2.0 instructions used in this module, so
897# that it can be compiled with .LEVEL 1.0. It should be noted that I
898# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
899# directive...
900
901my $ldd = sub {
902  my ($mod,$args) = @_;
903  my $orig = "ldd$mod\t$args";
904
905    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
906    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
907	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
908    }
909    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
910    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
911	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
912	$opcode|=(1<<5)  if ($mod =~ /^,m/);
913	$opcode|=(1<<13) if ($mod =~ /^,mb/);
914	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
915    }
916    else { "\t".$orig; }
917};
918
919my $std = sub {
920  my ($mod,$args) = @_;
921  my $orig = "std$mod\t$args";
922
923    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
924    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
925	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
926	$opcode|=(1<<5)  if ($mod =~ /^,m/);
927	$opcode|=(1<<13) if ($mod =~ /^,mb/);
928	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
929    }
930    else { "\t".$orig; }
931};
932
933my $extrd = sub {
934  my ($mod,$args) = @_;
935  my $orig = "extrd$mod\t$args";
936
937    # I only have ",u" completer, it's implicitly encoded...
938    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
939    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
940	my $len=32-$3;
941	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
942	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
943	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
944    }
945    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
946    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
947	my $len=32-$2;
948	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
949	$opcode |= (1<<13) if ($mod =~ /,\**=/);
950	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
951    }
952    else { "\t".$orig; }
953};
954
955my $shrpd = sub {
956  my ($mod,$args) = @_;
957  my $orig = "shrpd$mod\t$args";
958
959    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
960    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
961	my $cpos=63-$3;
962	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
963	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
964    }
965    else { "\t".$orig; }
966};
967
968my $sub = sub {
969  my ($mod,$args) = @_;
970  my $orig = "sub$mod\t$args";
971
972    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
973	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
974	$opcode|=(1<<10);	# e1
975	$opcode|=(1<<8);	# e2
976	$opcode|=(1<<5);	# d
977	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
978    }
979    else { "\t".$orig; }
980};
981
982sub assemble {
983  my ($mnemonic,$mod,$args)=@_;
984  my $opcode = eval("\$$mnemonic");
985
986    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
987}
988
989if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
990	=~ /GNU assembler/) {
991    $gnuas = 1;
992}
993
994foreach (split("\n",$code)) {
995	s/\`([^\`]*)\`/eval $1/ge;
996	# flip word order in 64-bit mode...
997	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
998	# assemble 2.0 instructions in 32-bit mode...
999	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
1000
1001	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
1002	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
1003	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
1004	s/\bbv\b/bve/			if ($SIZE_T==8);
1005
1006	print $_,"\n";
1007}
1008close STDOUT or die "error closing STDOUT: $!";
1009