xref: /openssl/crypto/sha/asm/keccak1600p8-ppc.pl (revision 33388b44)
1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for PowerISA 2.07.
17#
18# June 2017.
19#
20# This is straightforward KECCAK_1X_ALT SIMD implementation, but with
21# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
22# POWER8 processor spends 9.8 cycles to process byte out of large
23# buffer for r=1088, which matches SHA3-256. This is 17% better than
24# scalar PPC64 code. It probably should be noted that if POWER8's
25# successor can achieve higher scalar instruction issue rate, then
# this module will lose... And it does on POWER9 with 12.0 vs. 9.4.
27
# The last command-line argument is taken as the output file when it
# looks like a file name (i.e. has an extension); the first argument is
# the perlasm "flavour" (e.g. linux64le) when it does not look like one.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Flavour selects ABI bitness and the matching load/store/compare
# mnemonics used throughout the assembly below.
if ($flavour =~ /64/) {
	($SIZE_T, $LRSAVE)         = (8, 16);		# LR slot is 2*$SIZE_T
	($UCMP, $STU, $POP, $PUSH) = ("cmpld", "stdu", "ld", "std");
} elsif ($flavour =~ /32/) {
	($SIZE_T, $LRSAVE)         = (4, 4);		# LR slot is $SIZE_T
	($UCMP, $STU, $POP, $PUSH) = ("cmplw", "stwu", "lwz", "stw");
} else {
	die "nonsense $flavour";
}

# Locate the ppc-xlate.pl translator next to this script or in the
# sibling perlasm directory, then pipe all generated code through it.
($dir) = $0 =~ m|(.*[/\\])[^/\\]+$|;
$xlate = "${dir}ppc-xlate.pl";
$xlate = "${dir}../../perlasm/ppc-xlate.pl"	unless -f $xlate;
-f $xlate or die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
56
# Stack frame: 6*$SIZE_T of linkage area plus 13 quadword slots used to
# offload the non-volatile vector registers v20-v31 (12 registers; the
# extra 16 bytes appear to be alignment slack for the `15+6*$SIZE_T`
# rounding of the stvx offsets in KeccakF1600 -- TODO confirm).
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp ="r1";			# stack pointer, per ABI

my $iotas = "r12";		# points at the iotas round-constant table

########################################################################
# Register layout:
#
# v0		A[0][0] A[1][0]
# v1		A[0][1] A[1][1]
# v2		A[0][2] A[1][2]
# v3		A[0][3] A[1][3]
# v4		A[0][4] A[1][4]
#
# v5		A[2][0] A[3][0]
# v6		A[2][1] A[3][1]
# v7		A[2][2] A[3][2]
# v8		A[2][3] A[3][3]
# v9		A[2][4] A[3][4]
#
# v10		A[4][0] A[4][1]
# v11		A[4][2] A[4][3]
# v12		A[4][4] A[4][4]
#
# v13..25	rhotates[][]
# v26..31	volatile
#
########################################################################
# KeccakF1600_int is the inner permutation: 24 rounds (ctr preloaded
# with 24) of Theta/Rho/Pi/Chi/Iota over the state held in v0-v12 in
# the layout documented above.  It expects the rotation counts already
# loaded in v13-v25 and $iotas (r12) pointing at the round-constant
# table; r0 is used as the running byte offset into that table.
# Clobbers r0, ctr and v26-v31; returns with A[4][4] re-broadcast into
# both halves of v12.
#
# KeccakF1600 is the ABI-visible one-shot wrapper: it spills v20-v31
# and vrsave to a fresh stack frame, loads the state from the buffer at
# r3, fetches rhotates via PICmeup, runs KeccakF1600_int once, and
# stores the state back to r3 before restoring the vector registers.
$code.=<<___;
.machine	"any"
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	li	r0,0
	b	.Loop

.align	4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor	v26,v0, v5		; A[0..1][0]^A[2..3][0]
	vxor	v27,v1, v6		; A[0..1][1]^A[2..3][1]
	vxor	v28,v2, v7		; A[0..1][2]^A[2..3][2]
	vxor	v29,v3, v8		; A[0..1][3]^A[2..3][3]
	vxor	v30,v4, v9		; A[0..1][4]^A[2..3][4]
	vpermdi	v31,v26,v27,0b00	; A[0][0..1]^A[2][0..1]
	vpermdi	v26,v26,v27,0b11	; A[1][0..1]^A[3][0..1]
	vpermdi	v27,v28,v29,0b00	; A[0][2..3]^A[2][2..3]
	vpermdi	v28,v28,v29,0b11	; A[1][2..3]^A[3][2..3]
	vpermdi	v29,v30,v30,0b10	; A[1..0][4]^A[3..2][4]
	vxor	v26,v26,v31		; C[0..1]
	vxor	v27,v27,v28		; C[2..3]
	vxor	v28,v29,v30		; C[4..4]
	vspltisb v31,1
	vxor	v26,v26,v10		; C[0..1] ^= A[4][0..1]
	vxor	v27,v27,v11		; C[2..3] ^= A[4][2..3]
	vxor	v28,v28,v12		; C[4..4] ^= A[4][4..4], low!

	vrld	v29,v26,v31		; ROL64(C[0..1],1)
	vrld	v30,v27,v31		; ROL64(C[2..3],1)
	vrld	v31,v28,v31		; ROL64(C[4..4],1)
	vpermdi	v31,v31,v29,0b10
	vxor	v26,v26,v30		; C[0..1] ^= ROL64(C[2..3],1)
	vxor	v27,v27,v31		; C[2..3] ^= ROL64(C[4..0],1)
	vxor	v28,v28,v29		; C[4..4] ^= ROL64(C[0..1],1), low!

	vpermdi	v29,v26,v26,0b00	; C[0..0]
	vpermdi	v30,v28,v26,0b10	; C[4..0]
	vpermdi	v31,v28,v28,0b11	; C[4..4]
	vxor	v1, v1, v29		; A[0..1][1] ^= C[0..0]
	vxor	v6, v6, v29		; A[2..3][1] ^= C[0..0]
	vxor	v10,v10,v30		; A[4][0..1] ^= C[4..0]
	vxor	v0, v0, v31		; A[0..1][0] ^= C[4..4]
	vxor	v5, v5, v31		; A[2..3][0] ^= C[4..4]

	vpermdi	v29,v27,v27,0b00	; C[2..2]
	vpermdi	v30,v26,v26,0b11	; C[1..1]
	vpermdi	v31,v26,v27,0b10	; C[1..2]
	vxor	v3, v3, v29		; A[0..1][3] ^= C[2..2]
	vxor	v8, v8, v29		; A[2..3][3] ^= C[2..2]
	vxor	v2, v2, v30		; A[0..1][2] ^= C[1..1]
	vxor	v7, v7, v30		; A[2..3][2] ^= C[1..1]
	vxor	v11,v11,v31		; A[4][2..3] ^= C[1..2]

	vpermdi	v29,v27,v27,0b11	; C[3..3]
	vxor	v4, v4, v29		; A[0..1][4] ^= C[3..3]
	vxor	v9, v9, v29		; A[2..3][4] ^= C[3..3]
	vxor	v12,v12,v29		; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	vrld	v26,v0, v13		; v0
	vrld	v1, v1, v14
	vrld	v27,v2, v15		; v2
	vrld	v28,v3, v16		; v3
	vrld	v4, v4, v17
	vrld	v5, v5, v18
	vrld	v6, v6, v19
	vrld	v29,v7, v20		; v7
	vrld	v8, v8, v21
	vrld	v9, v9, v22
	vrld	v10,v10,v23
	vrld	v30,v11,v24		; v11
	vrld	v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	vpermdi	v0, v26,v28,0b00	; [0][0] [1][0] < [0][0] [0][3]
	vpermdi	v2, v29,v5, 0b00	; [0][2] [1][2] < [2][2] [2][0]
	vpermdi	v11,v9, v5, 0b01	; [4][2] [4][3] < [2][4] [3][0]
	vpermdi	v5, v1, v4, 0b00	; [2][0] [3][0] < [0][1] [0][4]
	vpermdi	v1, v1, v4, 0b11	; [0][1] [1][1] < [1][1] [1][4]
	vpermdi	v3, v8, v6, 0b11	; [0][3] [1][3] < [3][3] [3][1]
	vpermdi	v4, v12,v30,0b10	; [0][4] [1][4] < [4][4] [4][2]
	vpermdi	v7, v8, v6, 0b00	; [2][2] [3][2] < [2][3] [2][1]
	vpermdi	v6, v27,v26,0b11	; [2][1] [3][1] < [1][2] [1][0]
	vpermdi	v8, v9, v29,0b11	; [2][3] [3][3] < [3][4] [3][2]
	vpermdi	v12,v10,v10,0b11	; [4][4] [4][4] < [4][1] [4][1]
	vpermdi	v9, v10,v30,0b01	; [2][4] [3][4] < [4][0] [4][3]
	vpermdi	v10,v27,v28,0b01	; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u	v31,$iotas,r0		; iotas[index]
	addic	r0,r0,16		; index++

	vandc	v26,v2, v1		; (~A[0..1][1] & A[0..1][2])
	vandc	v27,v3, v2		; (~A[0..1][2] & A[0..1][3])
	vandc	v28,v4, v3		; (~A[0..1][3] & A[0..1][4])
	vandc	v29,v0, v4		; (~A[0..1][4] & A[0..1][0])
	vandc	v30,v1, v0		; (~A[0..1][0] & A[0..1][1])
	vxor	v0, v0, v26		; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor	v1, v1, v27		; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor	v2, v2, v28		; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor	v3, v3, v29		; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor	v4, v4, v30		; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc	v26,v7, v6		; (~A[2..3][1] & A[2..3][2])
	vandc	v27,v8, v7		; (~A[2..3][2] & A[2..3][3])
	vandc	v28,v9, v8		; (~A[2..3][3] & A[2..3][4])
	vandc	v29,v5, v9		; (~A[2..3][4] & A[2..3][0])
	vandc	v30,v6, v5		; (~A[2..3][0] & A[2..3][1])
	vxor	v5, v5, v26		; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor	v6, v6, v27		; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor	v7, v7, v28		; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor	v8, v8, v29		; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor	v9, v9, v30		; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor	v0, v0, v31		; A[0][0] ^= iotas[index++]

	vpermdi	v26,v10,v11,0b10	; A[4][1..2]
	vpermdi	v27,v12,v10,0b00	; A[4][4..0]
	vpermdi	v28,v11,v12,0b10	; A[4][3..4]
	vpermdi	v29,v10,v10,0b10	; A[4][1..0]
	vandc	v26,v11,v26		; (~A[4][1..2] & A[4][2..3])
	vandc	v27,v27,v28		; (~A[4][3..4] & A[4][4..0])
	vandc	v28,v10,v29		; (~A[4][1..0] & A[4][0..1])
	vxor	v10,v10,v26		; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor	v11,v11,v27		; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor	v12,v12,v28		; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz	.Loop

	vpermdi	v12,v12,v12,0b11	; broadcast A[4][4]
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,r3			; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,r3
	addi	r11,r11,32
	lvx_4w	v2,r10,r3
	addi	r10,r10,32
	lvx_4w	v3,r11,r3
	addi	r11,r11,32
	lvx_4w	v4,r10,r3
	addi	r10,r10,32
	lvx_4w	v5,r11,r3
	addi	r11,r11,32
	lvx_4w	v6,r10,r3
	addi	r10,r10,32
	lvx_4w	v7,r11,r3
	addi	r11,r11,32
	lvx_4w	v8,r10,r3
	addi	r10,r10,32
	lvx_4w	v9,r11,r3
	addi	r11,r11,32
	lvx_4w	v10,r10,r3
	addi	r10,r10,32
	lvx_4w	v11,r11,r3
	lvx_splt v12,r10,r3

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	addi	r12,r12,`16*16`		; points at iotas

	bl	KeccakF1600_int

	li	r11,16
	stvx_4w	v0,0,r3			; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,r3
	addi	r11,r11,32
	stvx_4w	v2,r10,r3
	addi	r10,r10,32
	stvx_4w	v3,r11,r3
	addi	r11,r11,32
	stvx_4w	v4,r10,r3
	addi	r10,r10,32
	stvx_4w	v5,r11,r3
	addi	r11,r11,32
	stvx_4w	v6,r10,r3
	addi	r10,r10,32
	stvx_4w	v7,r11,r3
	addi	r11,r11,32
	stvx_4w	v8,r10,r3
	addi	r10,r10,32
	stvx_4w	v9,r11,r3
	addi	r11,r11,32
	stvx_4w	v10,r10,r3
	addi	r10,r10,32
	stvx_4w	v11,r11,r3
	stvdx_u v12,r10,r3

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
{
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));

########################################################################
# size_t SHA3_absorb(A_jagged, inp, len, bsz): XOR $len bytes from
# $inp into the state at $A_jagged in $bsz-byte blocks, running the
# permutation after each full block, and return (in r3) the residual
# byte count (< bsz) that was not absorbed.  Input lanes are read 8
# bytes at a time with lvdx_u; the v30/v31 permutation masks (loaded
# from just below the iotas table) route each lane into the correct
# vector half, with a `?`-prefixed byte-swap tweak that is only enabled
# for big-endian builds.  The ctr register counts remaining lanes of
# the current block so the chain of `bdz .Lprocess_block` exits at the
# rate boundary for any supported $bsz.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,$A_jagged		; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v11,r11,$A_jagged
	lvx_splt v12,r10,$A_jagged

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	li	r10,-32
	li	r11,-16
	addi	r12,r12,`16*16`		; points at iotas
	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	$len,$bsz		; len < bsz?
	blt	.Labsorbed

	sub	$len,$len,$bsz		; len -= bsz
	srwi	r0,$bsz,3
	mtctr	r0

	lvx_u	v30,r10,r12		; permutation masks
	lvx_u	v31,r11,r12
	?vspltisb v27,7			; prepare masks for byte swap
	?vxor	v30,v30,v27		; on big-endian
	?vxor	v31,v31,v27

	vxor	v27,v27,v27		; zero
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v12, v12, v26

.Lprocess_block:
	bl	KeccakF1600_int

	b	.Loop_absorb

.align	4
.Labsorbed:
	li	r11,16
	stvx_4w	v0,0,$A_jagged		; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v11,r11,$A_jagged
	stvdx_u v12,r10,$A_jagged

	mr	r3,$len			; return value
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r11,r11,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}
{
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));

########################################################################
# void SHA3_squeeze(A_jagged, out, len, bsz): emit $len output bytes
# from the state at $A_jagged, re-running KeccakF1600 each time a full
# $bsz-byte block has been squeezed.  Lanes are read as two 32-bit
# words (lo at $A_jagged+4+r11, hi at $A_jagged+r11) and written
# byte-by-byte with stbu, which keeps the routine endian-neutral.  The
# add/cmp/sub chain after `calculate jagged index` steps r11 through
# the non-linear ("jagged") in-memory lane order of the vector state;
# r9 preserves LR across the KeccakF1600 call.
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	mflr	r9			; r9 is not touched by KeccakF1600
	subi	$out,$out,1		; prepare for stbu
	addi	r8,$A_jagged,4		; prepare volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze
.align	4
.Loop_squeeze:
	lwzx	r7,r11,r8		; lo
	lwzx	r0,r11,$A_jagged	; hi
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stbu	r7,1($out)		; write lo
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	stbu	r0,1($out)		; write hi
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)

	subic.	$len,$len,8
	beqlr				; return if done

	subic.	r10,r10,8
	ble	.Loutput_expand

	addi	r11,r11,16		; calculate jagged index
	cmplwi	r11,`16*5`
	blt	.Loop_squeeze
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	cmplwi	r11,`16*5+8`
	subi	r11,r11,8
	beq	.Loop_squeeze
	addi	r11,r11,8
	cmplwi	r11,`16*10`
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	blt	.Loop_squeeze
	subi	r11,r11,8
	b	.Loop_squeeze

.align	4
.Loutput_expand:
	bl	KeccakF1600
	mtlr	r9

	addi	r8,$A_jagged,4		; restore volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
	subic.	$len,$len,4
	ble	.Loop_tail_lo
	li	r8,4
	mtctr	r8
.Loop_tail_lo:
	stbu	r7,1($out)
	srdi	r7,r7,8
	bdnz	.Loop_tail_lo
	ble	.Lsqueeze_done
	mtctr	$len
.Loop_tail_hi:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail_hi

.Lsqueeze_done:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
########################################################################
# PICmeup: position-independent address materialization.  The
# bcl/mflr pair captures the current PC in r12, to which the `64-8`
# displacement from the mflr instruction to the 64-byte-aligned data
# below is added, so r12 returns pointing at rhotates; iotas follows
# 16*16 bytes later (see the addi in the callers).  rhotates holds the
# Rho rotation counts laid out per the v13-v25 register pairing; the
# three .quad rows between the tables are the absorb permutation masks
# (identity and byte-swap variants); iotas is the 24-entry Keccak
# round-constant table consumed 16 bytes per round by KeccakF1600_int.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	rhotates,\@object
.align	6
rhotates:
	.quad	0,  36
	.quad	1,  44
	.quad	62,  6
	.quad	28, 55
	.quad	27, 20
	.quad	3,  41
	.quad	10, 45
	.quad	43, 15
	.quad	25, 21
	.quad	39,  8
	.quad	18,  2
	.quad	61, 56
	.quad	14, 14
.size	rhotates,.-rhotates
	.quad	0,0
	.quad	0x0001020304050607,0x1011121314151617
	.quad	0x1011121314151617,0x0001020304050607
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001,0
	.quad	0x0000000000008082,0
	.quad	0x800000000000808a,0
	.quad	0x8000000080008000,0
	.quad	0x000000000000808b,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008009,0
	.quad	0x000000000000008a,0
	.quad	0x0000000000000088,0
	.quad	0x0000000080008009,0
	.quad	0x000000008000000a,0
	.quad	0x000000008000808b,0
	.quad	0x800000000000008b,0
	.quad	0x8000000000008089,0
	.quad	0x8000000000008003,0
	.quad	0x8000000000008002,0
	.quad	0x8000000000000080,0
	.quad	0x000000000000800a,0
	.quad	0x800000008000000a,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008080,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008008,0
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___
841
# Post-process the accumulated assembly: constant-fold the `...`
# arithmetic and resolve the endian-conditional `?`-prefixed
# instructions, then feed each line to the ppc-xlate pipe on STDOUT.
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;	# evaluate `expr` in place

	# `?op` marks big-endian-only instructions: keep the mnemonic on
	# big-endian flavours, neutralize it with a `;` prefix on
	# little-endian ones (the file's pre-existing convention).
	my $prefix = ($flavour =~ /le$/) ? ";" : "";
	$line =~ s/\?([a-z]+)/$prefix$1/;

	print $line,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
855