xref: /openssl/crypto/sha/asm/sha512-parisc.pl (revision 33388b44)
#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 block procedure for PA-RISC.

# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by the vendor compiler this
# implementation is almost 70% faster in a 64-bit build, but delivers
# virtually the same performance in a 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects
# whether the code is executed on a PA-RISC 2.0 processor and switches
# to the 64-bit code path, delivering adequate performance even in a
# "blended" 32-bit build. The 64-bit code is, however, not any faster
# than code generated by the vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# Send the generated assembly to $output when one was given.  Use the
# three-argument form so the file name cannot be read as an open mode,
# and fail loudly instead of silently writing to the original STDOUT.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
41
# ABI parameters selected by $flavour: a flavour containing "64" picks
# the 64-bit (.LEVEL 2.0W) settings, anything else (including no
# flavour at all) the 32-bit ones.
#
#   $LEVEL         argument of the emitted .LEVEL directive
#   $SIZE_T        size in bytes of one saved register slot
#   $FRAME_MARKER  size in bytes of the frame marker
#   $SAVED_RP      distance below %sp at which %r2 is saved
#   $PUSH/$POP     store/load mnemonics for register-sized values
#   $PUSHMA/$POPMB their stack-pointer-updating counterparts
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}
61
# Hash variant selected by the output file name: a name containing
# "512" generates SHA-512, anything else SHA-256.
#
#   $func        exported symbol name
#   $SZ          size in bytes of one hash word
#   @Sigma0/1    rotation amounts of the big-sigma functions
#   @sigma0/1    rotate,rotate,shift amounts of the small-sigma functions
#   $rounds      number of rounds, i.e. number of K-table entries
#   $LAST10BITS  low 10 bits of the last K-table word, used by the
#                generated code to recognise the end of the table
#   $LD/$LDM/$ST load, load-and-advance and store mnemonics for $SZ bytes
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$LAST10BITS=0x017;
	$LD="ldd";
	$LDM="ldd,ma";
	$ST="std";
} else {
	$func="sha256_block_data_order";
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$LAST10BITS=0x0f2;
	$LD="ldw";
	$LDM="ldwm";
	$ST="stw";
}
87
# Stack frame layout of the generated function.
$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
				#                 [+ argument transfer]
$XOFF=16*$SZ+32;		# local variables
$FRAME+=$XOFF;			# total frame: saved regs + marker + locals
$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
93
# Register allocation for the main code path.  The three argument
# registers are reused as scratch once the arguments have been saved
# to the stack by the prologue.
$ctx="%r26";	# zapped by $a0
$inp="%r25";	# zapped by $a1
$num="%r24";	# zapped by $t0

# Scratch registers and the K-table pointer.
$a0 ="%r26";
$a1 ="%r25";
$t0 ="%r24";
$t1 ="%r29";
$Tbl="%r31";

# The eight working variables a..h.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");

# The 16-word message schedule; the 17th entry is $inp itself and
# receives the extra word fetched when the input has to be realigned.
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
108
# Append the assembly for one SHA round to $code.
#
#   $i        round index; X[$i%16] is the message word that is added
#   $a..$h    registers currently holding the working variables a..h
#
# On entry $t1 is expected to hold the round constant fetched by the
# preceding "$LDM $SZ($Tbl),$t1"; for $i<15 the round itself fetches the
# constant for the next round.  _ror is a made-up mnemonic and the
# `...` expressions are arithmetic; both are resolved by the output
# loop at the end of the file.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
	_ror	$e,$Sigma1[0],$a0
	and	$f,$e,$t0
	_ror	$e,$Sigma1[1],$a1
	addl	$t1,$h,$h
	andcm	$g,$e,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
	or	$t0,$t1,$t1		; Ch(e,f,g)
	addl	@X[$i%16],$h,$h
	xor	$a0,$a1,$a1		; Sigma1(e)
	addl	$t1,$h,$h
	_ror	$a,$Sigma0[0],$a0
	addl	$a1,$h,$h

	_ror	$a,$Sigma0[1],$a1
	and	$a,$b,$t0
	and	$a,$c,$t1
	xor	$a1,$a0,$a0
	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$t1,$t0,$t0
	and	$b,$c,$t1
	xor	$a0,$a1,$a1		; Sigma0(a)
	addl	$h,$d,$d
	xor	$t1,$t0,$t0		; Maj(a,b,c)
	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
	addl	$a1,$h,$h
	addl	$t0,$h,$h

___
}
142
# Append the assembly for one round with index >= 16 to $code: first
# the message schedule update
#
#   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]   (indices mod 16)
#
# then the ordinary round body via ROUND_00_15.
#
#   $i        round index in 16..31 (reduced to 0..15 internally)
#   $a..$h    registers currently holding the working variables a..h
#
# The round constant is fetched into $t1 here.  In the last round of
# the group its low 10 bits are compared with $LAST10BITS and, per the
# emitted comment, $Tbl is bumped by one to signal the end of the table.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
	_ror	@X[($i+1)%16],$sigma0[0],$a0
	_ror	@X[($i+1)%16],$sigma0[1],$a1
	addl	@X[($i+9)%16],@X[$i],@X[$i]
	_ror	@X[($i+14)%16],$sigma1[0],$t0
	_ror	@X[($i+14)%16],$sigma1[1],$t1
	xor	$a1,$a0,$a0
	_shr	@X[($i+1)%16],$sigma0[2],$a1
	xor	$t1,$t0,$t0
	_shr	@X[($i+14)%16],$sigma1[2],$t1
	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
	$LDM	$SZ($Tbl),$t1
	addl	$a0,@X[$i],@X[$i]
	addl	$t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
	extru	$t1,31,10,$a1
	comiclr,<> $LAST10BITS,$a1,%r0
	ldo	1($Tbl),$Tbl		; signal end of $Tbl
___
# ROUND_00_15 expects the original (un-reduced) round index, so that its
# "$i<15" test does not emit a second constant load.
ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
169
# Start the output: section directives followed by the round constant
# table L$table.  Exactly one of the two tables below is emitted,
# depending on the word size $SZ.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.ALIGN	64
L\$table
___
# SHA-512: 80 64-bit constants, each written as two 32-bit words
# (high word first).
$code.=<<___ if ($SZ==8);
	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
# SHA-256: 64 32-bit constants.
$code.=<<___ if ($SZ==4);
	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
# Function entry: export the symbol, save %r2 and %r3..%r18, turn $num
# (a block count) into a pointer to the end of the input, stash the three
# arguments in the frame (their registers are reused as scratch) and
# compute the address of L$table relative to the current location.
$code.=<<___;

	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	64
$func
	.PROC
	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)

	_shl	$num,`log(16*$SZ)/log(2)`,$num
	addl	$inp,$num,$num		; $num to point at the end of $inp

	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)

	blr	%r0,$Tbl
	ldi	3,$t1
L\$pic
	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
	ldo	L\$table-L\$pic($Tbl),$Tbl
___
# SHA-512 in a 32-bit build only: run-time processor check that diverts
# to the 32-bit-only code at L$parisc1 (emitted further down under the
# same condition) when the 64-bit instructions are not usable.
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
	ldi	31,$t1
	mtctl	$t1,%cr11
	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
	b	L\$parisc1
	nop
___
# Load the eight hash words from the context and program %sar (%cr11)
# with the shift amount needed to realign input that does not start on
# a $SZ-byte boundary.
$code.=<<___;
	$LD	`0*$SZ`($ctx),$A	; load context
	$LD	`1*$SZ`($ctx),$B
	$LD	`2*$SZ`($ctx),$C
	$LD	`3*$SZ`($ctx),$D
	$LD	`4*$SZ`($ctx),$E
	$LD	`5*$SZ`($ctx),$F
	$LD	`6*$SZ`($ctx),$G
	$LD	`7*$SZ`($ctx),$H

	extru	$inp,31,`log($SZ)/log(2)`,$t0
	sh3addl	$t0,%r0,$t0
	subi	`8*$SZ`,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop
	ldi	`$SZ-1`,$t0
	$LDM	$SZ($Tbl),$t1
	andcm	$inp,$t0,$t0		; align $inp
___
# Words 0..14 of the block are loaded unconditionally from the aligned
# address in $t0.
foreach my $word (0..14) {		# load input block
	$code.="\t$LD	`$SZ*$word`($t0),@X[$word]\n";
}
# Word 15 is loaded in the slot following the branch; the 17th word
# (into X[16], which is the $inp register) and the realignment are only
# reached when $inp differs from its aligned value in $t0.
$code.=<<___;
	cmpb,*=	$inp,$t0,L\$aligned
	$LD	`$SZ*15`($t0),@X[15]
	$LD	`$SZ*16`($t0),@X[16]
___
# Each schedule word is combined with its successor; _align is a
# made-up mnemonic translated by the output loop at the end of the file.
foreach my $word (0..15) {		# align data
	$code.="\t_align	@X[$word],@X[$word+1],@X[$word]\n";
}
$code.=<<___;
L\$aligned
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
317
# Rounds 0..15 consume the freshly loaded block.  After every round the
# register list is rotated by one, (a,b,...,h) -> (h,a,...,g), so that
# no data has to be moved between registers in the generated code.
foreach my $round (0..15) {
	ROUND_00_15($round,@V);
	unshift(@V,pop(@V));
}
$code.=<<___;
L\$rounds
	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
___
# One group of 16 rounds with message schedule update; the generated
# code loops over this group from L$rounds until the end of the constant
# table is signalled.
foreach my $round (16..31) {
	ROUND_16_xx($round,@V);
	unshift(@V,pop(@V));
}
# Tail of the main loop: repeat the 16-round group until bit 31 of $Tbl
# flags the end of the constant table, then restore the saved arguments,
# rewind $Tbl (including the flag bit, hence the extra -1), add the
# working variables to the context, advance $inp by one block and loop
# while $inp has not reached $num.
$code.=<<___;
	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
	nop

	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl

	$LD	`0*$SZ`($ctx),@X[0]	; load context
	$LD	`1*$SZ`($ctx),@X[1]
	$LD	`2*$SZ`($ctx),@X[2]
	$LD	`3*$SZ`($ctx),@X[3]
	$LD	`4*$SZ`($ctx),@X[4]
	$LD	`5*$SZ`($ctx),@X[5]
	addl	@X[0],$A,$A
	$LD	`6*$SZ`($ctx),@X[6]
	addl	@X[1],$B,$B
	$LD	`7*$SZ`($ctx),@X[7]
	ldo	`16*$SZ`($inp),$inp	; advance $inp

	$ST	$A,`0*$SZ`($ctx)	; save context
	addl	@X[2],$C,$C
	$ST	$B,`1*$SZ`($ctx)
	addl	@X[3],$D,$D
	$ST	$C,`2*$SZ`($ctx)
	addl	@X[4],$E,$E
	$ST	$D,`3*$SZ`($ctx)
	addl	@X[5],$F,$F
	$ST	$E,`4*$SZ`($ctx)
	addl	@X[6],$G,$G
	$ST	$F,`5*$SZ`($ctx)
	addl	@X[7],$H,$H
	$ST	$G,`6*$SZ`($ctx)
	$ST	$H,`7*$SZ`($ctx)

	cmpb,*<>,n $inp,$num,L\$oop
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
___
# SHA-512 for processors without 64-bit instructions.  This whole section
# is generated only for SHA-512 in a 32-bit build.  Every 64-bit quantity
# lives in a hi/lo register pair, 64-bit additions are add/addc pairs and
# 64-bit rotations are pairs of shd instructions.  The message schedule
# is kept on the stack at -$XOFF(%sp) rather than in registers.
if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
{{
# The 64-bit code path emitted above must not fall through into this one.
$code.=<<___;
	b	L\$done
	nop

	.ALIGN	64
L\$parisc1
___

# Register allocation for this path; it replaces the one used by the main
# path.  The working variables a..h occupy 16 registers.
@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";

# Two hi/lo pairs: the current schedule word X[i] and the next, X[i+1].
@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx

# Append the assembly for one round of the 32-bit SHA-512 path to $code.
#
#   $i           round index within the current group (0..15)
#   $ahi..$hlo   hi/lo register pairs currently holding a..h
#   $flag        true when called from ROUND_16_xx_pa1, which has already
#                loaded X[i+1]; the load is then not emitted again, and
#                in round 15 the end-of-table test is appended
#
# The X[i] register pair is reused for K[i] once X[i] has been added to h.
# On return @X is rotated by two entries so that the pair holding X[i+1]
# becomes the current pair for the next round.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;

$code.=<<___ if (!$flag);
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
___
$code.=<<___;
	shd	$ehi,$elo,$Sigma1[0],$t0
	 add	$Xlo,$hlo,$hlo
	shd	$elo,$ehi,$Sigma1[0],$t1
	 addc	$Xhi,$hhi,$hhi		; h += X[i]
	shd	$ehi,$elo,$Sigma1[1],$t2
	 ldwm	8($Tbl),$Xhi
	shd	$elo,$ehi,$Sigma1[1],$t3
	 ldw	-4($Tbl),$Xlo		; load K[i]
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	 and	$flo,$elo,$a0
	 and	$fhi,$ehi,$a1
	shd	$ehi,$elo,$Sigma1[2],$t2
	 andcm	$glo,$elo,$a2
	shd	$elo,$ehi,$Sigma1[2],$t3
	 andcm	$ghi,$ehi,$a3
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma1(e)
	add	$Xlo,$hlo,$hlo
	 xor	$a2,$a0,$a0
	addc	$Xhi,$hhi,$hhi		; h += K[i]
	 xor	$a3,$a1,$a1		; Ch(e,f,g)

	 add	$t0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[0],$t0
	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
	shd	$alo,$ahi,$Sigma0[0],$t1
	 add	$a0,$hlo,$hlo
	shd	$ahi,$alo,$Sigma0[1],$t2
	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
	shd	$alo,$ahi,$Sigma0[1],$t3

	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1
	shd	$ahi,$alo,$Sigma0[2],$t2
	and	$alo,$blo,$a0
	shd	$alo,$ahi,$Sigma0[2],$t3
	and	$ahi,$bhi,$a1
	xor	$t2,$t0,$t0
	xor	$t3,$t1,$t1		; Sigma0(a)

	and	$alo,$clo,$a2
	and	$ahi,$chi,$a3
	xor	$a2,$a0,$a0
	 add	$hlo,$dlo,$dlo
	xor	$a3,$a1,$a1
	 addc	$hhi,$dhi,$dhi		; d += h
	and	$blo,$clo,$a2
	 add	$t0,$hlo,$hlo
	and	$bhi,$chi,$a3
	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
	xor	$a2,$a0,$a0
	 add	$a0,$hlo,$hlo
	xor	$a3,$a1,$a1		; Maj(a,b,c)
	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)

___
# End-of-table test: the low 10 bits of the low word of K[i] are compared
# with $LAST10BITS, and the branch back to L$rounds_pa1 is conditional on
# the outcome of that comparison.
$code.=<<___ if ($i==15 && $flag);
	extru	$Xlo,31,10,$Xlo
	comiclr,= $LAST10BITS,$Xlo,%r0
	b	L\$rounds_pa1
	nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}

# Append the assembly for one round with index >= 16 of the 32-bit
# SHA-512 path to $code: the message schedule update
#
#   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]   (indices mod 16)
#
# with the result written back to the stack, followed by the round body.
#
#   first argument    round index in 16..31
#   other arguments   the 16 hi/lo registers of a..h, passed on unchanged
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
	 add	$a0,$Xlo,$Xlo
	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
	 addc	$a1,$Xhi,$Xhi
	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
	xor	$t2,$t0,$t0
	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
	xor	$t3,$t1,$t1
	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
	xor	$t2,$t0,$t0
	 shd	$a3,$a2,$sigma1[0],$a0
	xor	$t3,$t1,$t1		; sigma0(X[(i+1)&0x0f])
	 shd	$a2,$a3,$sigma1[0],$a1
	add	$t0,$Xlo,$Xlo
	 shd	$a3,$a2,$sigma1[1],$t2
	addc	$t1,$Xhi,$Xhi
	 shd	$a2,$a3,$sigma1[1],$t3
	xor	$t2,$a0,$a0
	shd	$a3,$a2,$sigma1[2],$t2
	xor	$t3,$a1,$a1
	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
	xor	$t2,$a0,$a0
	xor	$t3,$a1,$a1		; sigma1(X[(i+14)&0x0f])
	add	$a0,$Xlo,$Xlo
	addc	$a1,$Xhi,$Xhi

	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
# @_ now holds only the 16 registers; the trailing 1 is $flag.
&ROUND_00_15_pa1($i,@_,1);
}
# Entry of the 32-bit path: load the context as 16 words, program %sar
# for realignment and start the per-block loop.  $inp is rounded down to
# a word boundary; word-aligned input branches to L$aligned_pa1.
$code.=<<___;
	ldw	`0*4`($ctx),$Ahi		; load context
	ldw	`1*4`($ctx),$Alo
	ldw	`2*4`($ctx),$Bhi
	ldw	`3*4`($ctx),$Blo
	ldw	`4*4`($ctx),$Chi
	ldw	`5*4`($ctx),$Clo
	ldw	`6*4`($ctx),$Dhi
	ldw	`7*4`($ctx),$Dlo
	ldw	`8*4`($ctx),$Ehi
	ldw	`9*4`($ctx),$Elo
	ldw	`10*4`($ctx),$Fhi
	ldw	`11*4`($ctx),$Flo
	ldw	`12*4`($ctx),$Ghi
	ldw	`13*4`($ctx),$Glo
	ldw	`14*4`($ctx),$Hhi
	ldw	`15*4`($ctx),$Hlo

	extru	$inp,31,2,$t0
	sh3addl	$t0,%r0,$t0
	subi	32,$t0,$t0
	mtctl	$t0,%cr11		; load %sar with align factor

L\$oop_pa1
	extru	$inp,31,2,$a3
	comib,=	0,$a3,L\$aligned_pa1
	sub	$inp,$a3,$inp

	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	vshd	$X[0],$X[1],$X[0]
	vshd	$X[1],$t2,$X[1]
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	vshd	$t2,$t3,$t2
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
	vshd	$t3,$a0,$t3
___
# Misaligned input: the remaining words are streamed through a rotating
# set of eight scratch registers - store a realigned word, load a word
# eight positions ahead, realign the next one.  The first loop stops at
# the last word that has to be loaded (33 words are read in total), the
# second only drains the registers, and the final store is emitted in
# the slot following the branch.
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	vshd	$t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
	b	L\$collected_pa1
	stw	$t[0],`-$XOFF+$i*4`(%sp)

___
}
# Word-aligned input: the same copy to the stack without realignment.
$code.=<<___;
L\$aligned_pa1
	ldw	`0*4`($inp),$X[0]
	ldw	`1*4`($inp),$X[1]
	ldw	`2*4`($inp),$t2
	ldw	`3*4`($inp),$t3
	ldw	`4*4`($inp),$a0
	ldw	`5*4`($inp),$a1
	ldw	`6*4`($inp),$a2
	ldw	`7*4`($inp),$a3
	stw	$X[0],`-$XOFF+0*4`(%sp)
	ldw	`8*4`($inp),$t0
	stw	$X[1],`-$XOFF+1*4`(%sp)
	ldw	`9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
	ldw	`(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
	stw	$t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}

# Round emission.  After every round the register list is rotated by one
# hi/lo pair so that no data has to be moved in the generated code.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }

# Loop tail of the 32-bit path: restore the saved arguments, rewind $Tbl,
# add the working variables to the context with add/addc pairs, advance
# $inp by one block and either finish or start the next block.
$code.=<<___;
	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl

	ldw	`0*4`($ctx),$t1		; update context
	ldw	`1*4`($ctx),$t0
	ldw	`2*4`($ctx),$t3
	ldw	`3*4`($ctx),$t2
	ldw	`4*4`($ctx),$a1
	ldw	`5*4`($ctx),$a0
	ldw	`6*4`($ctx),$a3
	add	$t0,$Alo,$Alo
	ldw	`7*4`($ctx),$a2
	addc	$t1,$Ahi,$Ahi
	ldw	`8*4`($ctx),$t1
	add	$t2,$Blo,$Blo
	ldw	`9*4`($ctx),$t0
	addc	$t3,$Bhi,$Bhi
	ldw	`10*4`($ctx),$t3
	add	$a0,$Clo,$Clo
	ldw	`11*4`($ctx),$t2
	addc	$a1,$Chi,$Chi
	ldw	`12*4`($ctx),$a1
	add	$a2,$Dlo,$Dlo
	ldw	`13*4`($ctx),$a0
	addc	$a3,$Dhi,$Dhi
	ldw	`14*4`($ctx),$a3
	add	$t0,$Elo,$Elo
	ldw	`15*4`($ctx),$a2
	addc	$t1,$Ehi,$Ehi
	stw	$Ahi,`0*4`($ctx)
	add	$t2,$Flo,$Flo
	stw	$Alo,`1*4`($ctx)
	addc	$t3,$Fhi,$Fhi
	stw	$Bhi,`2*4`($ctx)
	add	$a0,$Glo,$Glo
	stw	$Blo,`3*4`($ctx)
	addc	$a1,$Ghi,$Ghi
	stw	$Chi,`4*4`($ctx)
	add	$a2,$Hlo,$Hlo
	stw	$Clo,`5*4`($ctx)
	addc	$a3,$Hhi,$Hhi
	stw	$Dhi,`6*4`($ctx)
	ldo	`16*$SZ`($inp),$inp	; advance $inp
	stw	$Dlo,`7*4`($ctx)
	stw	$Ehi,`8*4`($ctx)
	stw	$Elo,`9*4`($ctx)
	stw	$Fhi,`10*4`($ctx)
	stw	$Flo,`11*4`($ctx)
	stw	$Ghi,`12*4`($ctx)
	stw	$Glo,`13*4`($ctx)
	stw	$Hhi,`14*4`($ctx)
	comb,=	$inp,$num,L\$done
	stw	$Hlo,`15*4`($ctx)
	b	L\$oop_pa1
	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
L\$done
___
}}
# Function exit: reload the return pointer and %r4..%r18 from the frame,
# return through %r2 and pop the frame while restoring %r3 in the slot
# following the return branch.  The trailing string identifies the module
# in the object file; `64*$SZ` evaluates to 256 or 512.
$code.=<<___;
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
697
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...

# Encode "ldd<mod> disp(%rB),%rT" as a .WORD directive.
#
#   $mod    completer including the leading comma ("" if none); ",m..."
#           sets bit 3 and ",mb" additionally bit 2 of the opcode
#   $args   operand string
#
# Returns the .WORD line with the original instruction as a trailing
# comment, or the instruction unchanged when the operands do not have the
# expected form.
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
    {	my ($disp,$base,$target) = ($1,$2,$3);
	# displacement bits 3..12 go to opcode bits 4..13, bit 13 (the
	# sign of a negative displacement) goes to opcode bit 0
	my $opcode=(0x14<<26)|($base<<21)|($target<<16)|(($disp&0x1FF8)<<1)|(($disp>>13)&1);
	$opcode|=(1<<3) if ($mod =~ /^,m/);
	$opcode|=(1<<2) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
715
# Encode "std %rR,disp(%rB)" as a .WORD directive.
#
#   $mod    completer including the leading comma ("" if none); it is
#           only echoed in the comment and NOT encoded into the opcode,
#           which is sufficient for the plain "std" this module passes
#           through here
#   $args   operand string
#
# Returns the .WORD line with the original instruction as a trailing
# comment, or the instruction unchanged when the operands do not have the
# expected form.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    {	my ($source,$disp,$base) = ($1,$2,$3);
	# displacement bits 3..12 go to opcode bits 4..13, bit 13 (the
	# sign of a negative displacement) goes to opcode bit 0
	my $opcode=(0x1c<<26)|($base<<21)|($source<<16)|(($disp&0x1FF8)<<1)|(($disp>>13)&1);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
726
# Encode "extrd,u" as a .WORD directive.  Two operand forms are handled:
#
#   %rS,pos,len,%rT     fixed position             (format 15)
#   %rS,%sar,len,%rT    position taken from %sar   (format 12)
#
#   $mod    completer including the leading comma; ",u" is implied by the
#           encoding, and in the %sar form a "=" or "*=" condition sets
#           bit 13 of the opcode
#   $args   operand string
#
# Returns the .WORD line with the original instruction as a trailing
# comment, or the instruction unchanged when the operands match neither
# form.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my ($source,$pos,$bits,$target) = ($1,$2,$3,$4);
	my $opcode=(0x36<<26)|($source<<21)|($target<<16);
	my $len=32-$bits;
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my ($source,$bits,$target) = ($1,$2,$3);
	my $opcode=(0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
	my $len=32-$bits;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
748
# Encode "shrpd" as a .WORD directive.  Two operand forms are handled:
#
#   %rA,%rB,sa,%rT      fixed shift amount             (format 14)
#   %rA,%rB,%sar,%rT    shift amount taken from %sar   (format 11)
#
#   $mod    completer including the leading comma; only echoed in the
#           comment
#   $args   operand string
#
# Returns the .WORD line with the original instruction as a trailing
# comment, or the instruction unchanged when the operands match neither
# form.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my ($first,$second,$shift,$target) = ($1,$2,$3,$4);
	my $opcode=(0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
	my $cpos=63-$shift;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {	my ($first,$second,$target) = ($1,$2,$3);
	sprintf "\t.WORD\t0x%08x\t; %s",
		(0x34<<26)|($second<<21)|($first<<16)|(1<<9)|$target,$orig;
    }
    else { "\t".$orig; }
};
765
# Mnemonics for which a hand-written encoder exists.  An explicit table
# replaces looking the encoder up by evaluating "\$<mnemonic>" as Perl
# source for every line of output.
my %encoder = (ldd => $ldd, std => $std, extrd => $extrd, shrpd => $shrpd);

# Return the text to emit for one instruction.
#
#   $mnemonic   instruction name without completers
#   $mod        completers including the leading comma ("" if none)
#   $args       operand string
#
# Instructions with an encoder are returned as produced by that encoder;
# everything else is returned as "\t<mnemonic><mod>\t<args>".
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $opcode = $encoder{$mnemonic};

    ref($opcode) eq 'CODE' ? $opcode->($mod,$args) : "\t$mnemonic$mod\t$args";
}
772
# Ask the compiler driver named by $ENV{CC} to report its assembler
# version and remember in $gnuas whether that is the GNU assembler.  $CC
# may carry options of its own, so it is deliberately handed to the shell
# as is.  Without $CC there is nothing to ask; the command is then not
# run at all and $gnuas stays unset.
if (defined $ENV{CC} && $ENV{CC} ne ''
    && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler/) {
    $gnuas = 1;
}
777
# Post-process the generated text line by line and print it.
foreach (split("\n",$code)) {
	# evaluate the arithmetic embedded between back-ticks
	s/\`([^\`]*)\`/eval $1/ge;

	# At most one of the following rewrites applies to a line.  A shd
	# with a shift count above 31 is expressed by swapping the two
	# source registers and reducing the count by 32.
	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
	# translate made up instructions: _ror, _shr, _align, _shl
	s/_ror(\s+)(%r[0-9]+),/
		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or

	s/_shr(\s+%r[0-9]+),([0-9]+),/
		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or

	s/_align(\s+%r[0-9]+,%r[0-9]+),/
		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or

	s/_shl(\s+%r[0-9]+),([0-9]+),/
		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

	# 32-bit builds: replace the instructions that have an encoder
	# with .WORD directives
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($SIZE_T==4);

	# adapt directives and mnemonics to the assembler and ABI in use
	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
	s/cmpb,\*/comb,/ 		if ($SIZE_T==4);
	s/\bbv\b/bve/    		if ($SIZE_T==8);

	print $_,"\n";
}

# Buffered write errors only surface when the handle is closed.
close STDOUT or die "error closing STDOUT: $!";
811