#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PPC64.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation that works on
# *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and
# it's possible to achieve performance better than below, but that is
# naturally an option only for POWER8 and successors...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# PPC970/G5	14.0/+130%
# POWER7	9.7/+110%
# POWER8	10.6/+100%
# POWER9	8.2/+66%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
#	much better (but watch out for them generating code specific
#	to processor they execute on).

# Command line: [flavour] [output]
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Only flavours containing "64" are accepted; anything else (including a
# missing flavour) aborts generation.  The mnemonics chosen here are spliced
# into the assembly templates further down.
if ($flavour =~ /64/) {
	$SIZE_T	=8;		# size of one stack slot / saved register
	$LRSAVE	=2*$SIZE_T;	# offset of the LR save slot above the frame
	$UCMP	="cmpld";	# unsigned doubleword compare
	$STU	="stdu";	# store-with-update, used to allocate the frame
	$POP	="ld";		# reload a saved value
	$PUSH	="std";		# save a value
} else { die "nonsense $flavour"; }

# A flavour ending in "le" selects the little-endian code path.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;

# $DWORD_LE_LOAD is the instruction SHA3_absorb uses to fetch the next
# 64-bit input lane into r0 while advancing the input pointer in r3.
# $LE_LOAD_SIZE is the amount the input pointer is pre-decremented by so
# that the first update-form load (ldu, or the lbz/lbzu sequence inside
# dword_le_load) lands on the first input byte.
if ($LITTLE_ENDIAN) {
	# Little-endian: one update-form doubleword load per lane.
	$DWORD_LE_LOAD = "ldu	r0,8(r3)";
	$LE_LOAD_SIZE = "8";
} else {
	# Big-endian: assemble the lane byte by byte in a helper routine.
	$DWORD_LE_LOAD = "bl	dword_le_load";
	$LE_LOAD_SIZE = "1";
}

# Locate ppc-xlate.pl, first next to this script and then in the shared
# perlasm directory.  $dir is the directory part of $0 including the
# trailing separator, or empty when $0 has no directory component; it is
# taken from the match result so a stale capture is never picked up.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator,
# which receives the flavour and the output file name as its arguments.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# Stack frame used by KeccakF1600 and SHA3_absorb (byte offsets from the
# new $sp; with $SIZE_T == 8 the frame is 272 bytes):
#   [$LOCALS, $TEMP)             six $SIZE_T-sized local slots; slots 0..4
#                                hold the state pointer, inp, len, bsz and
#                                the iotas pointer, slot 5 is unused here
#   [$TEMP, $TEMP+32)            four 8-byte spill slots for A[0..3][4]
#   [$FRAME-18*$SIZE_T, $FRAME)  save area for r14..r31
# The bytes below $LOCALS are not touched by this file (presumably the ABI
# linkage area -- confirm against the target ABI).
$FRAME=24*$SIZE_T+6*$SIZE_T+32;
$LOCALS=6*$SIZE_T;
$TEMP=$LOCALS+6*$SIZE_T;

my $sp ="r1";

# Register allocation for the 5x5 state: row i starts at r7+5*i, so the
# whole state occupies r7..r31, except that the lane which would land in
# r13 is moved to r6.
my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
            (7, 12, 17, 22, 27));
   $A[1][1] = "r6"; # r13 is reserved

# Scratch registers; entries 4..7 are aliased to state registers
# temporarily inside the round body.
my @C = map("r$_", (0,3,4,5));

# Rotation count for each lane, indexed like @A and consumed by the
# Rho+Pi step.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# KeccakF1600_int: the 24-round permutation on the state held in the @A
# registers.  It expects the caller to have built a frame with the TEMP
# spill slots and to have stored (address of iotas - 8) in local slot 4;
# each round advances that stored pointer by 8 via ldu/std, so after a call
# it is 8*24 bytes further along.
#
# KeccakF1600: entry point taking the state address in r3; it sets up the
# frame, loads the 25 lanes, runs KeccakF1600_int and stores them back.
$code.=<<___;
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	b	.Loop
.align	4
.Loop:
	xor	$C[0],$A[0][0],$A[1][0]		; Theta
	std	$A[0][4],`$TEMP+0`($sp)
	xor	$C[1],$A[0][1],$A[1][1]
	std	$A[1][4],`$TEMP+8`($sp)
	xor	$C[2],$A[0][2],$A[1][2]
	std	$A[2][4],`$TEMP+16`($sp)
	xor	$C[3],$A[0][3],$A[1][3]
	std	$A[3][4],`$TEMP+24`($sp)
___
	# A[0..3][4] were just spilled to TEMP, so their registers can serve
	# as four extra scratch registers for the rest of Theta.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
	$C[6]=$A[2][4];
	$C[7]=$A[3][4];
$code.=<<___;
	xor	$C[4],$A[0][4],$A[1][4]
	xor	$C[0],$C[0],$A[2][0]
	xor	$C[1],$C[1],$A[2][1]
	xor	$C[2],$C[2],$A[2][2]
	xor	$C[3],$C[3],$A[2][3]
	xor	$C[4],$C[4],$A[2][4]
	xor	$C[0],$C[0],$A[3][0]
	xor	$C[1],$C[1],$A[3][1]
	xor	$C[2],$C[2],$A[3][2]
	xor	$C[3],$C[3],$A[3][3]
	xor	$C[4],$C[4],$A[3][4]
	xor	$C[0],$C[0],$A[4][0]
	xor	$C[2],$C[2],$A[4][2]
	xor	$C[1],$C[1],$A[4][1]
	xor	$C[3],$C[3],$A[4][3]
	rotldi	$C[5],$C[2],1
	xor	$C[4],$C[4],$A[4][4]
	rotldi	$C[6],$C[3],1
	xor	$C[5],$C[5],$C[0]
	rotldi	$C[7],$C[4],1

	xor	$A[0][1],$A[0][1],$C[5]
	xor	$A[1][1],$A[1][1],$C[5]
	xor	$A[2][1],$A[2][1],$C[5]
	xor	$A[3][1],$A[3][1],$C[5]
	xor	$A[4][1],$A[4][1],$C[5]

	rotldi	$C[5],$C[0],1
	xor	$C[6],$C[6],$C[1]
	xor	$C[2],$C[2],$C[7]
	rotldi	$C[7],$C[1],1
	xor	$C[3],$C[3],$C[5]
	xor	$C[4],$C[4],$C[7]

	xor	$C[1],   $A[0][2],$C[6]			;mr	$C[1],$A[0][2]
	xor	$A[1][2],$A[1][2],$C[6]
	xor	$A[2][2],$A[2][2],$C[6]
	xor	$A[3][2],$A[3][2],$C[6]
	xor	$A[4][2],$A[4][2],$C[6]

	xor	$A[0][0],$A[0][0],$C[4]
	xor	$A[1][0],$A[1][0],$C[4]
	xor	$A[2][0],$A[2][0],$C[4]
	xor	$A[3][0],$A[3][0],$C[4]
	xor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to be reloaded with A[0..3][4];
	# drop the aliases so they cannot be used as scratch by mistake.
	$C[4]=undef;
	$C[5]=undef;
	$C[6]=undef;
	$C[7]=undef;
$code.=<<___;
	ld	$A[0][4],`$TEMP+0`($sp)
	xor	$C[0],   $A[0][3],$C[2]			;mr	$C[0],$A[0][3]
	ld	$A[1][4],`$TEMP+8`($sp)
	xor	$A[1][3],$A[1][3],$C[2]
	ld	$A[2][4],`$TEMP+16`($sp)
	xor	$A[2][3],$A[2][3],$C[2]
	ld	$A[3][4],`$TEMP+24`($sp)
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[4][3],$A[4][3],$C[2]

	xor	$C[2],   $A[0][4],$C[3]			;mr	$C[2],$A[0][4]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[4][4],$A[4][4],$C[3]

	mr	$C[3],$A[0][1]				; Rho+Pi
	rotldi	$A[0][1],$A[1][1],$rhotates[1][1]
	;mr	$C[1],$A[0][2]
	rotldi	$A[0][2],$A[2][2],$rhotates[2][2]
	;mr	$C[0],$A[0][3]
	rotldi	$A[0][3],$A[3][3],$rhotates[3][3]
	;mr	$C[2],$A[0][4]
	rotldi	$A[0][4],$A[4][4],$rhotates[4][4]

	rotldi	$A[1][1],$A[1][4],$rhotates[1][4]
	rotldi	$A[2][2],$A[2][3],$rhotates[2][3]
	rotldi	$A[3][3],$A[3][2],$rhotates[3][2]
	rotldi	$A[4][4],$A[4][1],$rhotates[4][1]

	rotldi	$A[1][4],$A[4][2],$rhotates[4][2]
	rotldi	$A[2][3],$A[3][4],$rhotates[3][4]
	rotldi	$A[3][2],$A[2][1],$rhotates[2][1]
	rotldi	$A[4][1],$A[1][3],$rhotates[1][3]

	rotldi	$A[4][2],$A[2][4],$rhotates[2][4]
	rotldi	$A[3][4],$A[4][3],$rhotates[4][3]
	rotldi	$A[2][1],$A[1][2],$rhotates[1][2]
	rotldi	$A[1][3],$A[3][1],$rhotates[3][1]

	rotldi	$A[2][4],$A[4][0],$rhotates[4][0]
	rotldi	$A[4][3],$A[3][0],$rhotates[3][0]
	rotldi	$A[1][2],$A[2][0],$rhotates[2][0]
	rotldi	$A[3][1],$A[1][0],$rhotates[1][0]

	rotldi	$A[1][0],$C[0],$rhotates[0][3]
	rotldi	$A[2][0],$C[3],$rhotates[0][1]
	rotldi	$A[3][0],$C[2],$rhotates[0][4]
	rotldi	$A[4][0],$C[1],$rhotates[0][2]

	andc	$C[0],$A[0][2],$A[0][1]			; Chi+Iota
	andc	$C[1],$A[0][3],$A[0][2]
	andc	$C[2],$A[0][0],$A[0][4]
	andc	$C[3],$A[0][1],$A[0][0]
	xor	$A[0][0],$A[0][0],$C[0]
	andc	$C[0],$A[0][4],$A[0][3]
	xor	$A[0][1],$A[0][1],$C[1]
	 ld	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	xor	$A[0][3],$A[0][3],$C[2]
	xor	$A[0][4],$A[0][4],$C[3]
	xor	$A[0][2],$A[0][2],$C[0]
	 ldu	$C[3],8($C[1])				; Iota[i++]

	andc	$C[0],$A[1][2],$A[1][1]
	 std	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	andc	$C[1],$A[1][3],$A[1][2]
	andc	$C[2],$A[1][0],$A[1][4]
	 xor	$A[0][0],$A[0][0],$C[3]			; A[0][0] ^= Iota
	andc	$C[3],$A[1][1],$A[1][0]
	xor	$A[1][0],$A[1][0],$C[0]
	andc	$C[0],$A[1][4],$A[1][3]
	xor	$A[1][1],$A[1][1],$C[1]
	xor	$A[1][3],$A[1][3],$C[2]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[1][2],$A[1][2],$C[0]

	andc	$C[0],$A[2][2],$A[2][1]
	andc	$C[1],$A[2][3],$A[2][2]
	andc	$C[2],$A[2][0],$A[2][4]
	andc	$C[3],$A[2][1],$A[2][0]
	xor	$A[2][0],$A[2][0],$C[0]
	andc	$C[0],$A[2][4],$A[2][3]
	xor	$A[2][1],$A[2][1],$C[1]
	xor	$A[2][3],$A[2][3],$C[2]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[2][2],$A[2][2],$C[0]

	andc	$C[0],$A[3][2],$A[3][1]
	andc	$C[1],$A[3][3],$A[3][2]
	andc	$C[2],$A[3][0],$A[3][4]
	andc	$C[3],$A[3][1],$A[3][0]
	xor	$A[3][0],$A[3][0],$C[0]
	andc	$C[0],$A[3][4],$A[3][3]
	xor	$A[3][1],$A[3][1],$C[1]
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[3][2],$A[3][2],$C[0]

	andc	$C[0],$A[4][2],$A[4][1]
	andc	$C[1],$A[4][3],$A[4][2]
	andc	$C[2],$A[4][0],$A[4][4]
	andc	$C[3],$A[4][1],$A[4][0]
	xor	$A[4][0],$A[4][0],$C[0]
	andc	$C[0],$A[4][4],$A[4][3]
	xor	$A[4][1],$A[4][1],$C[1]
	xor	$A[4][3],$A[4][3],$C[2]
	xor	$A[4][4],$A[4][4],$C[3]
	xor	$A[4][2],$A[4][2],$C[0]

	bdnz	.Loop

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
	ld	$A[0][1],`8*1`(r3)
	ld	$A[0][2],`8*2`(r3)
	ld	$A[0][3],`8*3`(r3)
	ld	$A[0][4],`8*4`(r3)
	ld	$A[1][0],`8*5`(r3)
	ld	$A[1][1],`8*6`(r3)
	ld	$A[1][2],`8*7`(r3)
	ld	$A[1][3],`8*8`(r3)
	ld	$A[1][4],`8*9`(r3)
	ld	$A[2][0],`8*10`(r3)
	ld	$A[2][1],`8*11`(r3)
	ld	$A[2][2],`8*12`(r3)
	ld	$A[2][3],`8*13`(r3)
	ld	$A[2][4],`8*14`(r3)
	ld	$A[3][0],`8*15`(r3)
	ld	$A[3][1],`8*16`(r3)
	ld	$A[3][2],`8*17`(r3)
	ld	$A[3][3],`8*18`(r3)
	ld	$A[3][4],`8*19`(r3)
	ld	$A[4][0],`8*20`(r3)
	ld	$A[4][1],`8*21`(r3)
	ld	$A[4][2],`8*22`(r3)
	ld	$A[4][3],`8*23`(r3)
	ld	$A[4][4],`8*24`(r3)

	bl	KeccakF1600_int

	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
	std	$A[0][1],`8*1`(r3)
	std	$A[0][2],`8*2`(r3)
	std	$A[0][3],`8*3`(r3)
	std	$A[0][4],`8*4`(r3)
	std	$A[1][0],`8*5`(r3)
	std	$A[1][1],`8*6`(r3)
	std	$A[1][2],`8*7`(r3)
	std	$A[1][3],`8*8`(r3)
	std	$A[1][4],`8*9`(r3)
	std	$A[2][0],`8*10`(r3)
	std	$A[2][1],`8*11`(r3)
	std	$A[2][2],`8*12`(r3)
	std	$A[2][3],`8*13`(r3)
	std	$A[2][4],`8*14`(r3)
	std	$A[3][0],`8*15`(r3)
	std	$A[3][1],`8*16`(r3)
	std	$A[3][2],`8*17`(r3)
	std	$A[3][3],`8*18`(r3)
	std	$A[3][4],`8*19`(r3)
	std	$A[4][0],`8*20`(r3)
	std	$A[4][1],`8*21`(r3)
	std	$A[4][2],`8*22`(r3)
	std	$A[4][3],`8*23`(r3)
	std	$A[4][4],`8*24`(r3)

	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
# dword_le_load is emitted only for big-endian flavours (little-endian ones
# use a plain ldu instead).  It reads the eight bytes at r3+1 .. r3+8 and
# assembles them into r0 with the first byte least significant, advances r3
# by 8 through the final lbzu, and uses r4 and r5 as scratch.
if (!$LITTLE_ENDIAN) {
$code.=<<___;
.type	dword_le_load,\@function
.align	5
dword_le_load:
	lbz	r0,1(r3)
	lbz	r4,2(r3)
	lbz	r5,3(r3)
	insrdi	r0,r4,8,48
	lbz	r4,4(r3)
	insrdi	r0,r5,8,40
	lbz	r5,5(r3)
	insrdi	r0,r4,8,32
	lbz	r4,6(r3)
	insrdi	r0,r5,8,24
	lbz	r5,7(r3)
	insrdi	r0,r4,8,16
	lbzu	r4,8(r3)
	insrdi	r0,r5,8,8
	insrdi	r0,r4,8,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	dword_le_load,.-dword_le_load
___
}

# SHA3_absorb: r3 = state A[5][5], r4 = inp, r5 = len, r6 = bsz.
# While len >= bsz it XORs bsz/8 little-endian input lanes into the state
# and runs KeccakF1600_int; it returns the number of unprocessed bytes in r3.
# inp, len and bsz live in local slots 1..3 because the big-endian lane
# loader clobbers r4/r5 and the permutation uses r3..r6 as scratch/state.
# After each block the iotas pointer in slot 4 is rewound by 8*24 bytes.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r4,r4,$LE_LOAD_SIZE		; prepare for ldu or lbzu
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
	mr	r0,r6
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

	ld	$A[0][0],`8*0`(r3)		; load A[5][5]
	ld	$A[0][1],`8*1`(r3)
	ld	$A[0][2],`8*2`(r3)
	ld	$A[0][3],`8*3`(r3)
	ld	$A[0][4],`8*4`(r3)
	ld	$A[1][0],`8*5`(r3)
	ld	$A[1][1],`8*6`(r3)
	ld	$A[1][2],`8*7`(r3)
	ld	$A[1][3],`8*8`(r3)
	ld	$A[1][4],`8*9`(r3)
	ld	$A[2][0],`8*10`(r3)
	ld	$A[2][1],`8*11`(r3)
	ld	$A[2][2],`8*12`(r3)
	ld	$A[2][3],`8*13`(r3)
	ld	$A[2][4],`8*14`(r3)
	ld	$A[3][0],`8*15`(r3)
	ld	$A[3][1],`8*16`(r3)
	ld	$A[3][2],`8*17`(r3)
	ld	$A[3][3],`8*18`(r3)
	ld	$A[3][4],`8*19`(r3)
	ld	$A[4][0],`8*20`(r3)
	ld	$A[4][1],`8*21`(r3)
	ld	$A[4][2],`8*22`(r3)
	ld	$A[4][3],`8*23`(r3)
	ld	$A[4][4],`8*24`(r3)

	mr	r3,r4
	mr	r4,r5
	mr	r5,r0

	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	r4,r5				; len < bsz?
	blt	.Labsorbed

	sub	r4,r4,r5			; len -= bsz
	srwi	r5,r5,3
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
	mtctr	r5
	$DWORD_LE_LOAD				; *inp++
	xor	$A[0][0],$A[0][0],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[0][1],$A[0][1],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[0][2],$A[0][2],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[0][3],$A[0][3],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[0][4],$A[0][4],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[1][0],$A[1][0],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[1][1],$A[1][1],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[1][2],$A[1][2],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[1][3],$A[1][3],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[1][4],$A[1][4],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[2][0],$A[2][0],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[2][1],$A[2][1],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[2][2],$A[2][2],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[2][3],$A[2][3],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[2][4],$A[2][4],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[3][0],$A[3][0],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[3][1],$A[3][1],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[3][2],$A[3][2],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[3][3],$A[3][3],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[3][4],$A[3][4],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[4][0],$A[4][0],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[4][1],$A[4][1],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[4][2],$A[4][2],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[4][3],$A[4][3],r0
	bdz	.Lprocess_block
	$DWORD_LE_LOAD				; *inp++
	xor	$A[4][4],$A[4][4],r0

.Lprocess_block:
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp

	bl	KeccakF1600_int

	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
	addic	r0,r0,`-8*24`			; rewind iotas
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)

	b	.Loop_absorb

.align	4
.Labsorbed:
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
	std	$A[0][0],`8*0`(r3)		; return A[5][5]
	std	$A[0][1],`8*1`(r3)
	std	$A[0][2],`8*2`(r3)
	std	$A[0][3],`8*3`(r3)
	std	$A[0][4],`8*4`(r3)
	std	$A[1][0],`8*5`(r3)
	std	$A[1][1],`8*6`(r3)
	std	$A[1][2],`8*7`(r3)
	std	$A[1][3],`8*8`(r3)
	std	$A[1][4],`8*9`(r3)
	std	$A[2][0],`8*10`(r3)
	std	$A[2][1],`8*11`(r3)
	std	$A[2][2],`8*12`(r3)
	std	$A[2][3],`8*13`(r3)
	std	$A[2][4],`8*14`(r3)
	std	$A[3][0],`8*15`(r3)
	std	$A[3][1],`8*16`(r3)
	std	$A[3][2],`8*17`(r3)
	std	$A[3][3],`8*18`(r3)
	std	$A[3][4],`8*19`(r3)
	std	$A[4][0],`8*20`(r3)
	std	$A[4][1],`8*21`(r3)
	std	$A[4][2],`8*22`(r3)
	std	$A[4][3],`8*23`(r3)
	std	$A[4][4],`8*24`(r3)

	mr	r3,r4				; return value
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
# SHA3_squeeze: r3 = state A[5][5], r4 = out, r5 = len, r6 = bsz.
# Copies len bytes of the state to out, least-significant byte of each lane
# first, calling KeccakF1600 whenever bsz bytes have been consumed and more
# output is still required.  The arguments are kept in r28..r31, which this
# routine saves and restores itself, so they survive that call; r6 serves
# as the countdown of bytes left in the current block.
#
# A zero len returns immediately.  Without that check the code would fall
# into .Lsqueeze_tail with CTR = 0, and bdnz (which decrements before it
# tests) would then keep storing far past the output buffer.
{
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	$STU	$sp,`-10*$SIZE_T`($sp)
	mflr	r0
	$PUSH	r28,`6*$SIZE_T`($sp)
	$PUSH	r29,`7*$SIZE_T`($sp)
	$PUSH	r30,`8*$SIZE_T`($sp)
	$PUSH	r31,`9*$SIZE_T`($sp)
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)

	mr	$A_flat,r3
	subi	r3,r3,8			; prepare for ldu
	subi	$out,r4,1		; prepare for stbu
	mr	$len,r5
	mr	$bsz,r6
	${UCMP}i $len,0			; nothing to squeeze?
	beq	.Lsqueeze_done
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldu	r0,8(r3)
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stb	r0,1($out)
	srdi	r0,r0,8
	stb	r0,2($out)
	srdi	r0,r0,8
	stb	r0,3($out)
	srdi	r0,r0,8
	stb	r0,4($out)
	srdi	r0,r0,8
	stb	r0,5($out)
	srdi	r0,r0,8
	stb	r0,6($out)
	srdi	r0,r0,8
	stb	r0,7($out)
	srdi	r0,r0,8
	stbu	r0,8($out)

	subic.	$len,$len,8
	beq	.Lsqueeze_done

	subic.	r6,r6,8
	bgt	.Loop_squeeze

	mr	r3,$A_flat
	bl	KeccakF1600
	subi	r3,$A_flat,8		; prepare for ldu
	mr	r6,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
.Loop_tail:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail

.Lsqueeze_done:
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
	$POP	r28,`6*$SIZE_T`($sp)
	$POP	r29,`7*$SIZE_T`($sp)
	$POP	r30,`8*$SIZE_T`($sp)
	$POP	r31,`9*$SIZE_T`($sp)
	mtlr	r0
	addi	$sp,$sp,`10*$SIZE_T`
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# PICmeup returns the run-time address of the iotas table in r12 without
# relying on relocations: "bcl 20,31,$+4" puts the address of the following
# instruction (PICmeup+8) into LR, and adding 64-8 reaches PICmeup+64.  The
# routine is 64-byte aligned and padded by .space so that its code and
# traceback data occupy exactly 64 bytes (9 words = 36 bytes, plus 28 bytes
# of padding), which puts iotas exactly at that address.  The caller's LR is
# preserved through r0.  iotas holds the 24 round constants, one per round.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `...` span in the generated text with the value of the Perl
# expression it contains (this resolves the constant arithmetic written in
# the templates, such as frame offsets), then write the result to STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Check the close as well so that an error on the output stream is not
# silently ignored.
close STDOUT or die "error closing STDOUT: $!";