# xref: /openssl/crypto/modes/asm/ghash-parisc.pl (revision 33388b44)
#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Send the generated assembly to $output when one was given.  The
# three-argument form keeps characters in the file name from being read as
# an open mode, and a failed open is fatal rather than silently leaving
# the output on the original STDOUT.
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# ABI-dependent parameters.  A flavour containing "64" selects the
# PA-RISC 2.0 wide (64-bit) settings; anything else (including no flavour
# at all) selects the 32-bit settings.
#   $LEVEL         argument of the .LEVEL directive
#   $SIZE_T        size in bytes of one saved-register slot
#   $FRAME_MARKER  bytes added to the frame on top of the register slots
#   $SAVED_RP      distance below %sp at which %r2 is stored on entry
#   $PUSH/$POP     store/load mnemonics for one slot; the MA/MB variants
#                  are used for the first store and the last load
#   $NREGS         value emitted as ENTRY_GR in .CALLINFO
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$NREGS		=6;
} else {
	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$NREGS		=11;
}

$FRAME=10*$SIZE_T+$FRAME_MARKER;# 10 register-sized slots + frame marker
				#                 [+ argument transfer]

# Symbolic register names interpolated into the assembly templates below.

################# volatile registers
$Xi="%r26";	# argument block
$Htbl="%r25";
$inp="%r24";
$len="%r23";
$Hhh=$Htbl;	# variables; $Hhh is the same register as $Htbl
$Hll="%r22";
$Zhh="%r21";
$Zll="%r20";
$cnt="%r19";
$rem_4bit="%r28";
$rem="%r29";
$mask0xf0="%r31";

################# preserved registers
$Thh="%r1";
$Tll="%r2";
$nlo="%r3";
$nhi="%r4";
$byte="%r5";
# The middle 32-bit words of Z, H and T are only named in the 32-bit
# build, where the PA-RISC 1.x code paths keep each value in four words.
if ($SIZE_T==4) {
	$Zhl="%r6";
	$Zlh="%r7";
	$Hhl="%r8";
	$Hlh="%r9";
	$Thl="%r10";
	$Tlh="%r11";
}
# $rem2 shares %r6 with $Zhl; the PA-RISC 2.0 paths use $rem2 and never
# reference $Zhl.
$rem2="%r6";	# used in PA-RISC 2.0 code

# gcm_gmult_4bit: assembler preamble, export and prologue.  The prologue
# stores %r2 at -$SAVED_RP(%sp), stores %r3 with the modifying store
# ($PUSHMA, displacement $FRAME) and then saves %r4-%r6 at offsets taken
# relative to the new %sp.
$code.=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# The 32-bit build additionally saves %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent address of the L$rem_4bit table: blr leaves a code
# address in $rem_4bit, andcm clears its two low-order bits (the mask 3 is
# loaded right after blr) and ldo adds the distance from L$pic_gmult to
# the table.  0xf0 is kept in $mask0xf0 to isolate the high nibble of a
# byte.  The addl result is not referenced again in the gmult code.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit build only: pick a code path at run time.  The extrd,u,*=
# instruction is emitted as a raw .WORD by the output loop at the end of
# this file and, per its inline note, also executes on PA-RISC 1.0.  The
# branch to L$parisc1_gmult is taken unless that instruction nullifies it
# -- presumably only on 2.0-capable CPUs; confirm against the manuals.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___

# PA-RISC 2.0 code path of gcm_gmult_4bit.  Xi is consumed one byte at a
# time starting with byte 15; the remaining bytes are fetched with
# ldbx $cnt($Xi), $cnt starting at 13.  Each byte yields two table
# offsets: $nhi (the byte masked with 0xf0) and $nlo (its low nibble
# deposited four bits up).  Both index the 64-bit halves of Htbl, $Hhh at
# offset 0 and $Hll at offset 8.  Between lookups Z ($Zhh:$Zll) is shifted
# right by four bits; the four bits shifted out, scaled by 8 in $rem,
# select the L$rem_4bit entry that is XORed into $Zhh.  The result is
# stored back to Xi.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv -1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___

# 32-bit build only.  The leading branch lets the PA-RISC 2.0 path above
# skip to the epilogue; L$parisc1_gmult is the PA-RISC 1.x path.  It does
# the same nibble-wise table walk, but Z and T are kept in four 32-bit
# registers each (hh, hl, lh, ll) and Htbl is addressed through four
# pointers at offsets 0, 4, 8 and 12.  Only one 32-bit word of each
# L$rem_4bit entry is loaded (ldwx) and XORed into $Zhh.
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit epilogue, part one: reload %r2 and %r4-%r6 from the slots
# written by the prologue.  %r3 is reloaded separately by the $POPMB load
# that is emitted together with the return.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
# The 32-bit build also reloads %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return from gcm_gmult_4bit (the instruction after bv reloads %r3 with
# $POPMB and displacement -$FRAME), then open gcm_ghash_4bit with the same
# prologue.  ENTRY_GR uses $NREGS, as gcm_gmult_4bit does: the 64-bit
# build saves only %r3-%r6 here as well, so the former hard-coded 11
# described registers that this prologue never stores.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# gcm_ghash_4bit prologue, continued: the 32-bit build additionally saves
# %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent address of the L$rem_4bit table (blr, clear the two
# low-order bits, add the distance from L$pic_ghash).  addl turns $len
# into $inp+$len, the end-of-input address that the outer loops compare
# $inp against.  0xf0 is kept in $mask0xf0 to isolate the high nibble of a
# byte.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit build only: run-time choice of code path, same sequence as in
# gcm_gmult_4bit.  The branch to L$parisc1_ghash is taken unless the
# preceding extrd,u,*= nullifies it -- presumably only on 2.0-capable
# CPUs; confirm against the manuals.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___

# PA-RISC 2.0 code path of gcm_ghash_4bit.  The outer loop handles one
# 16-byte block of input per iteration: every byte of Xi is XORed with the
# matching input byte before it is split into the $nlo/$nhi table offsets,
# then the same nibble-wise multiplication as in gcm_gmult_4bit is run.
# Two remainder registers ($rem and $rem2) are used alternately.  After
# the result is stored to Xi, $inp advances by 16 and the loop repeats
# until $inp equals $len (the end address).  The copy following the
# loop-closing branch leaves the new $Zll in $nlo, which the next
# iteration uses where the first iteration used byte 15 of Xi.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv -1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___

# 32-bit build only.  The leading branch lets the PA-RISC 2.0 path above
# skip to the epilogue; L$parisc1_ghash is the PA-RISC 1.x path.  It has
# the same outer loop (one 16-byte input block per iteration, each Xi byte
# XORed with the matching input byte), with Z and T kept in four 32-bit
# registers each and Htbl addressed through four pointers at offsets 0, 4,
# 8 and 12.  The loop repeats until $inp equals $len (the end address).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit epilogue, part one: reload %r2 and %r4-%r6 from the slots
# written by the prologue.  %r3 is reloaded separately by the $POPMB load
# that is emitted together with the return.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
# The 32-bit build also reloads %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return from gcm_ghash_4bit (the instruction after bv reloads %r3 with
# $POPMB and displacement -$FRAME), followed by the shared L$rem_4bit
# table: 16 entries of two .WORDs each (the 16-bit constant shifted into
# the upper half of the first word, then a zero word), i.e. 128 bytes.
# The identification string follows the table.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
	.ALIGN	64
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood the .ALLOW 2.0
# directive...

# Hand-encode an "ldd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic ("" or e.g. ",mb")
#   $args - operand text, e.g. "%r3(%r22),%r20" or "8(%r26),%r20"
# Returns one assembler line (leading tab, no newline).  The original
# instruction text is kept as a trailing comment.  Operands that match
# neither supported form (for instance a %sp base) are passed through as a
# plain "ldd" line.
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    # format 4: index(base),target
    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        my ($index, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    # format 5: displacement(base),target
    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        my ($disp, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        # encode offset: low four bits go to bits 17-20, bit 4 to bit 16
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);
        $opcode |= (1<<5)  if ($mod =~ /^,m/);
        $opcode |= (1<<13) if ($mod =~ /^,mb/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# Hand-encode a "std" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic (not encoded here)
#   $args - operand text of the form "%rS,disp(%rB)"
# Returns one assembler line (leading tab, no newline); operands of any
# other shape are passed through as a plain "std" line.
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    # format 3 suffices
    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {
        my ($source, $disp, $base) = ($1, $2, $3);
        my $opcode = (0x1c<<26)|($base<<21)|($source<<16)
                   | (($disp&0x1FF8)<<1)|(($disp>>13)&1);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# Hand-encode an "extrd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic, e.g. ",u" or ",u,*="
#   $args - operand text: "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
# Returns one assembler line (leading tab, no newline); operands of any
# other shape are passed through as a plain "extrd" line.
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...

    # format 15: fixed position
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x36<<26)|($source<<21)|($target<<16);
        my $len = 32-$width;
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    # format 12: position taken from %sar
    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        my ($source, $width, $target) = ($1, $2, $3);
        my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        my $len = 32-$width;
        $opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
        # a "=" condition (optionally written "*=") sets bit 13
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# Hand-encode a "shrpd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic (not encoded here)
#   $args - operand text: "%rA,%rB,sa,%rT" or "%rA,%rB,%sar,%rT"
# Returns one assembler line (leading tab, no newline); operands of any
# other shape are passed through as a plain "shrpd" line.
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    # format 14: fixed shift amount
    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
        my ($first, $second, $shift, $target) = ($1, $2, $3, $4);
        my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        my $cpos = 63-$shift;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    # format 11: shift amount taken from %sar
    if ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {
        my ($first, $second, $target) = ($1, $2, $3);
        return sprintf "\t.WORD\t0x%08x\t; %s",
            (0x34<<26)|($second<<21)|($first<<16)|(1<<9)|$target, $orig;
    }

    return "\t".$orig;
};

# Hand-encode a "depd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic (",z" in this file)
#   $args - operand text of the form "%rS,pos,len,%rT"
# Returns one assembler line (leading tab, no newline); operands of any
# other shape are passed through as a plain "depd" line.
my $depd = sub {
    my ($mod, $args) = @_;
    my $orig = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...

    # format 16
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x3c<<26)|($target<<21)|($source<<16);
        my $cpos = 63-$pos;
        my $len = 32-$width;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }

    return "\t".$orig;
};

# Translate one instruction for the 32-bit build.
#   $mnemonic - lower-case instruction name
#   $mod      - completer text glued to the mnemonic (may be empty)
#   $args     - operand text
# Instructions that have a hand-encoder ($ldd, $std, $extrd, $shrpd,
# $depd) are returned as whatever that encoder produces; everything else
# is returned re-joined as "\tmnemonic+mod\targs".  The encoders are
# looked up in an explicit table rather than by evaluating the mnemonic
# as a variable name.
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my %encoder_for = (
        ldd   => $ldd,
        std   => $std,
        extrd => $extrd,
        shrpd => $shrpd,
        depd  => $depd,
    );
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE'
        ? $encoder->($mod, $args)
        : "\t$mnemonic$mod\t$args";
}

# Ask the C compiler which assembler it drives; $gnuas is set when the
# answer mentions "GNU assembler".  The probe is skipped when CC is not
# set, where it could only have run a malformed shell command.
if (defined $ENV{CC}
    && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler/) {
    $gnuas = 1;
}

# Post-process the accumulated template line by line and print it.
foreach my $line (split("\n",$code)) {
	# evaluate the arithmetic written between backticks
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	if ($SIZE_T==4) {
		# 32-bit build: hand-encode 2.0-only instructions and drop the
		# 64-bit "*" condition markers
		$line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
		$line =~ s/cmpb,\*/comb,/;
		$line =~ s/,\*/,/;
	}

	# 64-bit build: directive spellings for GNU as, and bve for bv
	$line =~ s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
	$line =~ s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
	$line =~ s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
	$line =~ s/\bbv\b/bve/			if ($SIZE_T==8);

	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!";