#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# POWER9 delivers 0.51 cpb.
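#
# The aggregated reduction rests on GHASH being a Horner evaluation
# in GF(2^128): for two ciphertext blocks C[i] and C[i+1],
#
#	X' = ((X + C[i])*H + C[i+1])*H
#	   = (X + C[i])*H^2 + C[i+1]*H
#
# so with H^2 (and H^3, H^4 for the 4x code path) pre-computed by
# gcm_init_p8 below, several un-reduced multiplications can share a
# single modular reduction.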

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
	$UCMP="cmpld";
	$SHRI="srdi";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
	$UCMP="cmplw";
	$SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";

$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
.align	5
.gcm_init_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H
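	# xC2 now holds the reduction constant 0xc2000...0001 used by
	# both reduction phases below, and IN holds H<<1 mod the GHASH
	# polynomial, i.e. the "twisted" H that every multiplication in
	# this module expects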

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
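	# vpmsumd performs two 64x64->128-bit carry-less multiplications
	# and XORs the products, so the three instructions above deliver
	# the full 256-bit square of twisted H: Xl and Xh are the outer
	# products, Xm the XOR of both 128-bit cross terms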

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1
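	# the two multiplications by xC2 above fold the 256-bit product
	# back to 128 bits modulo the GHASH polynomial; IN1 is now H^2
	# in the same twisted format as H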

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vsldoi		$t4,$Xm1,$zero,8
	 vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	 vxor		$Xl1,$Xl1,$t4
	 vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	 vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	 vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 vpmsumd	$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	 vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	 vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	 vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	 vsldoi		$H2l,$zero,$H2,8
	 vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	 stvx_u		$H2l,r8,r3		# save H^4
	 stvx_u		$H2,r9,r3
	 stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8
___
}
$code.=<<___;
.globl	.gcm_gmult_p8
.align	5
.gcm_gmult_p8:
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero
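	# on little-endian targets lemask is lvsl(0) xor 0x0707..07, a
	# vperm mask that byte-reverses each doubleword so the data
	# matches the big-endian lane order the arithmetic was written
	# for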

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
.align	5
.gcm_ghash_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	li		r8,0x40
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	li		r9,0x50
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	li		r10,0x60
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	${UCMP}i	$len,64
	bge		Lgcm_ghash_p8_4x
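	# input of 64 bytes or more takes the 4x-aggregated path below,
	# shorter input falls through to the 1x/2x code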

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	beq		Lshort

	lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,16
	lvx_u		$H2, r9,$Htbl
	add		r9,$inp,$len		# end of input
	lvx_u		$H2h,r10,$Htbl
	be?b		Loop_2x

.align	5
Loop_2x:
	lvx_u		$IN1,0,$inp
	le?vperm	$IN1,$IN1,$IN1,$lemask

	 subic		$len,$len,32
	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
	 subfe		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
	 add		$inp,$inp,r0
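	# subfe turns the borrow from the subtraction above into r0=-1
	# (fewer than two blocks left) or 0; masked with the updated len
	# and added to inp, it steps the pointer back by the shortfall
	# so that the trailing load below stays within the input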

	vxor		$Xl,$Xl,$Xl1
	vxor		$Xm,$Xm,$Xm1

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh1
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,r8,$inp
	 addi		$inp,$inp,32

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	$UCMP		r9,$inp
	bgt		Loop_2x			# done yet?

	cmplwi		$len,0
	bne		Leven

Lshort:
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh

Leven:
	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align	5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
	$STU		$sp,-$FRAME($sp)
	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	stvx		v20,r10,$sp
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	li		r10,0x60
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
	mtspr		256,r0			# preserve all AltiVec registers

	lvsl		$t0,0,r8		# 0x0001..0e0f
	#lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,0x70
	lvx_u		$H2, r9,$Htbl
	li		r9,0x80
	vspltisb	$t1,8			# 0x0808..0808
	#lvx_u		$H2h,r10,$Htbl
	li		r10,0x90
	lvx_u		$H3l,r8,$Htbl		# load H^3
	li		r8,0xa0
	lvx_u		$H3, r9,$Htbl
	li		r9,0xb0
	lvx_u		$H3h,r10,$Htbl
	li		r10,0xc0
	lvx_u		$H4l,r8,$Htbl		# load H^4
	li		r8,0x10
	lvx_u		$H4, r9,$Htbl
	li		r9,0x20
	lvx_u		$H4h,r10,$Htbl
	li		r10,0x30

	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
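	# hiperm gathers the two high doublewords of a vector pair into
	# one register and loperm the two low ones, so that a single
	# vpmsumd can multiply data for two blocks at once (see H21l
	# and H21h below)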

	$SHRI		$len,$len,4		# this allows using the sign
						# bit as carry
	lvx_u		$IN0,0,$inp		# load input
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,8
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask

	vxor		$Xh,$IN0,$Xl
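	# Xi is folded into the first input block, which therefore
	# carries the H^4 power in the multiplications that follow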

	 vpmsumd	$Xl1,$IN1,$H3l
	 vpmsumd	$Xm1,$IN1,$H3
	 vpmsumd	$Xh1,$IN1,$H3h

	 vperm		$H21l,$H2,$H,$hiperm
	 vperm		$t0,$IN2,$IN3,$loperm
	 vperm		$H21h,$H2,$H,$loperm
	 vperm		$t1,$IN2,$IN3,$hiperm
	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	 vxor		$Xm2,$Xm2,$Xm1
	 vxor		$Xl3,$Xl3,$Xl1
	 vxor		$Xm3,$Xm3,$Xm2
	 vxor		$Xh3,$Xh3,$Xh1

	blt		Ltail_4x

Loop_4x:
	lvx_u		$IN0,0,$inp
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,4
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
	 vpmsumd	$Xl1,$IN1,$H3l
	 vpmsumd	$Xm1,$IN1,$H3
	 vpmsumd	$Xh1,$IN1,$H3h

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3
	vxor		$Xh,$Xh,$Xh3
	 vperm		$t0,$IN2,$IN3,$loperm
	 vperm		$t1,$IN2,$IN3,$hiperm

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	vpmsumd		$Xl,$Xl,$xC2

	 vxor		$Xl3,$Xl3,$Xl1
	 vxor		$Xh3,$Xh3,$Xh1
	vxor		$Xh,$Xh,$IN0
	 vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xh,$Xh,$t1
	 vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh,$Xh,$Xl
	bge		Loop_4x

Ltail_4x:
	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh3
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	addic.		$len,$len,4
	beq		Ldone_4x
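	# 1..3 blocks remain; the dispatch below reuses Ltail_4x by
	# moving the matching power of H into the H4 slots, zeroing the
	# unused partial products in the single-block case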

	lvx_u		$IN0,0,$inp
	${UCMP}i	$len,2
	li		$len,-4
	blt		Lone
	lvx_u		$IN1,r8,$inp
	beq		Ltwo

Lthree:
	lvx_u		$IN2,r9,$inp
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask

	vxor		$Xh,$IN0,$Xl
	vmr		$H4l,$H3l
	vmr		$H4, $H3
	vmr		$H4h,$H3h

	vperm		$t0,$IN1,$IN2,$loperm
	vperm		$t1,$IN1,$IN2,$hiperm
	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	vxor		$Xm3,$Xm3,$Xm2
	b		Ltail_4x

.align	4
Ltwo:
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask

	vxor		$Xh,$IN0,$Xl
	vperm		$t0,$zero,$IN1,$loperm
	vperm		$t1,$zero,$IN1,$hiperm

	vsldoi		$H4l,$zero,$H2,8
	vmr		$H4, $H2
	vsldoi		$H4h,$H2,$zero,8

	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi

	b		Ltail_4x

.align	4
Lone:
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vsldoi		$H4l,$zero,$H,8
	vmr		$H4, $H
	vsldoi		$H4h,$H,$zero,8

	vxor		$Xh,$IN0,$Xl
	vxor		$Xl3,$Xl3,$Xl3
	vxor		$Xm3,$Xm3,$Xm3
	vxor		$Xh3,$Xh3,$Xh3

	b		Ltail_4x

Ldone_4x:
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,0,4,0
	.long		0
___
}
$code.=<<___;
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

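# Post-process the generated code: evaluate constant expressions in
# backticks and resolve the le?/be? instruction prefixes for the
# target endianness (the variant for the other endianness is kept as
# a comment rather than dropped).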
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o;
	} else {
	    s/le\?/#le#/o	or
	    s/be\?//o;
	}
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush