xref: /openssl/crypto/modes/asm/ghash-ia64.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# March 2010
18#
19# The module implements "4-bit" GCM GHASH function and underlying
20# single multiplication operation in GF(2^128). "4-bit" means that it
21# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
22# GHASH performance was measured to be 6.67 cycles per processed byte
23# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. For comparison, the sha1-ia64.pl module processes one byte in
# 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per byte.
27
28# September 2010
29#
# It was originally thought that it would make less sense to implement
# the "528B" variant on Itanium 2, for the following reason: because the
# number of functional units is naturally limited, it appeared impossible
# to implement the "528B" loop in 4 cycles, only in 5. This would mean that
34# theoretically performance improvement couldn't be more than 20%.
35# But occasionally you prove yourself wrong:-) I figured out a way to
36# fold couple of instructions and having freed yet another instruction
37# slot by unrolling the loop... Resulting performance is 4.45 cycles
38# per processed byte and 50% better than "256B" version. On original
39# Itanium performance should remain the same as the "256B" version,
40# i.e. ~8.5 cycles.
41
# Command line: an optional trailing argument names the output file; the
# remaining arguments are compiler/assembler flags used to infer ABI and
# endianness.
$output=pop and (open STDOUT,'>',$output or die "can't open $output: $!");

# Under HP-UX the 32-bit ABI requires addp4 for pointer arithmetic; the
# 64-bit ABIs (+DD64 with HP cc, -mlp64 with gcc) use plain add.
# NOTE: the original pattern /[\+DD|\-mlp]64/ was a character class, i.e.
# "any one of +D|-mlp followed by 64"; replaced with the intended
# alternation, which still matches +DD64 and -mlp64.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
} else { $ADDP="add"; }

# Endianness: honour explicit -DB_ENDIAN/-DL_ENDIAN flags, otherwise fall
# back to detecting the byte order of the build host.
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }
52
# loop($label, $mask_inp) -- append the modulo-scheduled inner
# multiplication loop to the global $code, entered at $label.  When the
# second argument is true, the (p16)/(p17) instructions touching the
# input stream are instead predicated on p63 (never set by this rotation
# scheme), which turns the streamed loop body into the single-block
# gcm_gmult variant.
# NOTE: the original declared this as "sub loop()" with an empty
# prototype despite taking two arguments; that only worked because the
# caller used &loop(...), which bypasses prototype checking.  The bogus
# prototype is dropped.
sub loop {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi		};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4		}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4		}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label			};;
___
}
95
# gcm_gmult_4bit: one GF(2^128) multiplication, Xi = Xi * H, using the
# 256-byte 4-bit Htable.  The assembly below gives ABI registers symbolic
# names, sets up the rotating register groups in[]/xi[]/Hi[] and the loop
# machinery (ar.lc, ar.ec, pr.rot), then falls into the modulo-scheduled
# loop emitted by loop() above.
$code=<<___;
.explicit
.text

prevfs=r2;	prevlc=r3;	prevpr=r8;
mask0xf0=r21;
rem=r22;	rem_4bitp=r23;
Xi=r24;		Htbl=r25;
inp=r26;	end=r27;
Hhi=r28;	Hlo=r29;
Zhi=r30;	Zlo=r31;

.align	128
.skip	16					// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0			// &Xi[15]
	mov	rem_4bitp=ip		}
{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
					};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13		};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo		};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi		};;
___
	# Second argument 1: mask the (p16)/(p17) inp references out of the
	# loop body -- gcm_gmult has no input stream, only Xi.
	&loop	(".Loop1",1);
# Epilogue: fold the final modulo-scheduling stage, byte-reverse Z (the
# "mux1 @rev" is rewritten to nop.i for big-endian builds at the bottom
# of this file), store Z back into Xi[] and restore pr/ar.lc.
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev		};;
{ .mib;	mux1	Zhi=Zhi,\@rev		};;
{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___
157
158######################################################################
159# "528B" (well, "512B" actually) streamed GHASH
160#
# Symbolic names for gcm_ghash_4bit's incoming arguments (in0..in3) and
# the two locals allocated in its prologue.
($Xip,$Htbl,$inp,$len)=map { "in$_" } 0..3;
($rem_8bit,$mask0xff)=("loc0","loc1");
# Big-endian builds need no user-mask byte-order flipping, so the
# sum/rum toggles degenerate to nop.m.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
168
# load_htable(@setup) -- emit the 16 Htable[] loads (integer registers
# r17..r31 for the first half, FP registers f32..f47 for the second) and
# interleave exactly one line of caller-supplied setup code into each of
# the 8 bundles.  The ($i+$#_)==7 test holds on every iteration because
# one argument is shifted off per pass, so @setup must have 8 elements.
# NOTE: the original declared this as "sub load_htable()" with an empty
# prototype despite taking arguments; that only worked because the caller
# used &load_htable(...), which bypasses prototype checking.  The bogus
# prototype is dropped.
sub load_htable {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}
181
# gcm_ghash_4bit prologue: save pfs/sp, point r8-r11 at the two halves of
# Htable[], then (via load_htable) pull all 16 entries into registers
# while interleaving argument setup, carve a 512-byte aligned stack frame
# and fill it with a copy of Htable[] plus Hshr4[] (each entry >> 4).
$code.=<<___;
prevsp=r3;

.align	32
.skip	16					// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip		};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl		}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl		};;
___
	# Each of these 8 strings is spliced into one load bundle; note they
	# carry no trailing newline, so the "\t};;" closer lands on the same
	# output line.
	&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
# Switch the loads/stores to big-endian (sum 1<<1 sets UM.be; patched to
# nop.m on big-endian builds) and point r8-r11 at the on-stack tables.
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp		}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len		};;
___
# Store Htable[0..7] (and the FP-held [8..15]) to the stack copy while
# computing and storing Htable[i]>>4 into Hshr4[].
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
	st8	[r9]=$rhi,16			// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4		};;
{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
___
}
# Second half: reload Htable[8..15] from the stack copy (they were only
# in FP registers) and compute their >>4 images; also drop UM.mfh.
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
	ld8	r17=[r9],16		};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
	ld8	r19=[r9],16		}	// Htable[9].hi
{ .mmi;	rum	1<<5				// clear um.mfh
	shrp	r16=r17,r16,4		};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
# NOTE: the heredoc below deliberately relies on the package-global $i
# keeping its final value (6) from the loop above -- do not lexicalize
# the loop variable.
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___
261
# Scratch-register names for the streamed GHASH main loop.
$in="r15";                              # current input byte
@xi=map { "r$_" } 16..17;               # software-rotated Xi bytes
@rem=map { "r$_" } 18..19;              # software-rotated reduction values
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=map { "r$_" } 20..25;
($Atbl,$Btbl)=("r26","r27");            # Htable/Hshr4 entry pointers
267
# Software-pipeline prologue.  The main loop is modulo-scheduled over
# stages (p16)..(p19); the fragments below peel the first stages by hand.
# After each fragment the @xi/@rem name arrays are rotated, emulating in
# Perl what hardware register rotation does for the predicated stages.
$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Second peel; also the .LOOP entry point.  p6 is set at the bottom of
# the loop when more input remains; the predicated st8 flushes the
# previous block's Z.hi before Z is recomputed.
$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Third peel: first stage that actually touches the tables, but with the
# (p19) reduction instructions of the steady-state body removed.
$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
310
# Steady state: 13 copies of the fully-populated loop body, one per
# remaining byte of the 16-byte block, with all four pipeline stages
# (p16)..(p19) live.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1			//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}
342
# Pipeline epilogue: drain the remaining stages.  First fragment still
# consumes input/Xi bytes via (p17) but no longer issues (p16) loads.
$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Second drain fragment: last table lookup; note the final shift is by 4,
# not 8, because the very first nibble of the block is handled here.
$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

# Final reduction and loop control: decide whether more input remains
# (p6), store Z back to Xi, and if looping pre-load the first two bytes
# of the next block (the [p16]/[p17] annotations mark instructions that
# logically belong to the next iteration's early stages).  Otherwise fall
# through to the function epilogue: restore sp and byte order, return.
$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip		};;	//	&Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi		};;
{ .mib;	$rum	1<<1				// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
# Constant tables.  rem_4bit[] must remain 128-byte aligned: loop()
# forms its address with a 'dep' into the low bits of rem_4bitp, which
# assumes the low 7 address bits are zero.  rem_8bit[] is the 8-bit
# reduction table used by the streamed gcm_ghash_4bit path.
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
465
# On big-endian targets the byte-reversing mux1 instructions are not
# needed; rewrite them as nop.i, preserving the original whitespace.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
# Expand the `expr` placeholders (register numbers computed at
# generation time) left in the assembly text by the codegen loops above.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";
471