xref: /openssl/crypto/sha/asm/keccak1600-armv8.pl (revision f58d39fb)
1#!/usr/bin/env perl
2# Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv8.
17#
18# June 2017.
19#
20# This is a straightforward KECCAK_1X_ALT implementation. It makes no
21# sense to attempt a SIMD/NEON implementation for the following reason.
22# 64-bit lanes of vector registers can't be addressed as easily as in
23# 32-bit mode. This means that 64-bit NEON is bound to be slower than
24# 32-bit NEON, and this implementation is faster than 32-bit NEON on
25# same processor. Even though it takes more scalar xor's and andn's,
26# it gets compensated by availability of rotate. Not to forget that
27# most processors achieve higher issue rate with scalar instructions.
28#
29# February 2018.
30#
31# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
32# variant with register permutation/rotation twist that allows to
33# eliminate copies to temporary registers. If you look closely you'll
34# notice that it uses only one lane of vector registers. The new
35# instructions effectively facilitate parallel hashing, which we don't
36# support [yet?]. But lowest-level core procedure is prepared for it.
37# The inner round is 67 [vector] instructions, so it's not actually
38# obvious that it will provide performance improvement [in serial
39# hash] as long as vector instructions issue rate is limited to 1 per
40# cycle...
41#
42######################################################################
43# Numbers are cycles per processed byte.
44#
45#		r=1088(*)
46#
47# Cortex-A53	13
48# Cortex-A57	12
49# X-Gene	14
50# Mongoose	10
51# Kryo		12
52# Denver	7.8
53# Apple A7	7.2
54# ThunderX2	9.7
55#
56# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
57#	because they vary too much from compiler to compiler. Newer
58#	compiler does much better and improvement varies from 5% on
59#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
60#	compiler this code is at least 2x faster...
61
62# $output is the last argument if it looks like a file (it has an extension)
63# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl next to this script, or in ../../perlasm relative to
# it.  $dir is set to "" explicitly when $0 has no directory part instead of
# depending on the value of $1 after a failed match.
$dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The translator path is quoted in the same way as the interpreter and the
# output path, so that a source tree located under a directory whose name
# contains spaces still works.  $flavour is a bare token and stays unquoted.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
# Per-lane rotation amounts, indexed as $rhotates[row][column] to match the
# A[row][column] lane naming used by the code below.  The Rho+Pi steps use
# them as "#64-$rhotates[i][j]" operands of ror/xar, i.e. a left rotation by
# the listed amount written as a right rotation.  Entry [0][0] is 0 and is
# never emitted as an operand.
76my @rhotates = ([  0,  1, 62, 28, 27 ],
77                [ 36, 44,  6, 55, 20 ],
78                [  3, 10, 43, 25, 39 ],
79                [ 41, 45, 15, 21,  8 ],
80                [ 18,  2, 61, 56, 14 ]);
81
# Emit the file preamble and the round-constant table "iotas": 8 zero quads
# (64 bytes) of padding after the .align directive, followed by 24 64-bit
# constants (192 bytes), 256 bytes in total.  KeccakF1600_int depends on this
# layout: it post-increments a pointer through the table and leaves its loop
# once the low 8 bits of that pointer are zero ("tst ...,#255").
# NOTE(review): this presumes ".align 8" yields 256-byte alignment on the
# target assembler, as the 64+192 padding arithmetic suggests - confirm.
82$code.=<<___;
83#include "arm_arch.h"
84
85.text
86
87.align 8	// strategic alignment and padding that allows to use
88		// address value as loop termination condition...
89	.quad	0,0,0,0,0,0,0,0
90.type	iotas,%object
91iotas:
92	.quad	0x0000000000000001
93	.quad	0x0000000000008082
94	.quad	0x800000000000808a
95	.quad	0x8000000080008000
96	.quad	0x000000000000808b
97	.quad	0x0000000080000001
98	.quad	0x8000000080008081
99	.quad	0x8000000000008009
100	.quad	0x000000000000008a
101	.quad	0x0000000000000088
102	.quad	0x0000000080008009
103	.quad	0x000000008000000a
104	.quad	0x000000008000808b
105	.quad	0x800000000000008b
106	.quad	0x8000000000008089
107	.quad	0x8000000000008003
108	.quad	0x8000000000008002
109	.quad	0x8000000000000080
110	.quad	0x000000000000800a
111	.quad	0x800000008000000a
112	.quad	0x8000000080008081
113	.quad	0x8000000000008080
114	.quad	0x0000000080000001
115	.quad	0x8000000080008008
116.size	iotas,.-iotas
117___
118								{{{
# Register allocation for the scalar (general-purpose register) code.
# @A maps state lane A[i][j] to register x(5*i+j), except A[3][3], which is
# moved from x18 to x25.  @C names four scratch registers x26, x27, x28 and
# x30; the last one is the link register, which KeccakF1600_int stores on the
# stack before using it as scratch and reloads before returning.
119my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
120            (0, 5, 10, 15, 20));
121   $A[3][3] = "x25"; # x18 is reserved
122
123my @C = map("x$_", (26,27,28,30));
124
# KeccakF1600_int: the permutation proper, operating on the 25 lanes held in
# the @A registers.  It uses 32 bytes at the caller's [sp,#0..#31]:
#   [sp,#0]   A[0][4] and A[1][4], parked there during Theta
#   [sp,#16]  the running pointer into the iotas table
#   [sp,#24]  the return address (x30 doubles as scratch $C[3])
# Each pass of .Loop performs Theta, Rho+Pi and Chi+Iota for one round and
# repeats until the post-incremented iotas pointer has its low 8 bits clear.
#
# The text emitted after KeccakF1600_int in the same heredoc chain is
# KeccakF1600, a wrapper that saves x19-x28, loads the 25 lanes from the
# buffer passed in x0, calls KeccakF1600_int and stores the lanes back to the
# same buffer (its address is kept at [sp,#32] across the call).
125$code.=<<___;
126.type	KeccakF1600_int,%function
127.align	5
128KeccakF1600_int:
129	AARCH64_SIGN_LINK_REGISTER
130	adr	$C[2],iotas
131	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
132	b	.Loop
133.align	4
134.Loop:
135	////////////////////////////////////////// Theta
136	eor	$C[0],$A[0][0],$A[1][0]
137	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
138	eor	$C[1],$A[0][1],$A[1][1]
139	eor	$C[2],$A[0][2],$A[1][2]
140	eor	$C[3],$A[0][3],$A[1][3]
141___
# A[0][4] and A[1][4] have just been stored at [sp,#0] by the stp above, so
# from here until they are reloaded their registers are borrowed as two extra
# temporaries, named $C[4] and $C[5].
142	$C[4]=$A[0][4];
143	$C[5]=$A[1][4];
144$code.=<<___;
145	eor	$C[4],$A[0][4],$A[1][4]
146	eor	$C[0],$C[0],$A[2][0]
147	eor	$C[1],$C[1],$A[2][1]
148	eor	$C[2],$C[2],$A[2][2]
149	eor	$C[3],$C[3],$A[2][3]
150	eor	$C[4],$C[4],$A[2][4]
151	eor	$C[0],$C[0],$A[3][0]
152	eor	$C[1],$C[1],$A[3][1]
153	eor	$C[2],$C[2],$A[3][2]
154	eor	$C[3],$C[3],$A[3][3]
155	eor	$C[4],$C[4],$A[3][4]
156	eor	$C[0],$C[0],$A[4][0]
157	eor	$C[2],$C[2],$A[4][2]
158	eor	$C[1],$C[1],$A[4][1]
159	eor	$C[3],$C[3],$A[4][3]
160	eor	$C[4],$C[4],$A[4][4]
161
162	eor	$C[5],$C[0],$C[2],ror#63
163
164	eor	$A[0][1],$A[0][1],$C[5]
165	eor	$A[1][1],$A[1][1],$C[5]
166	eor	$A[2][1],$A[2][1],$C[5]
167	eor	$A[3][1],$A[3][1],$C[5]
168	eor	$A[4][1],$A[4][1],$C[5]
169
170	eor	$C[5],$C[1],$C[3],ror#63
171	eor	$C[2],$C[2],$C[4],ror#63
172	eor	$C[3],$C[3],$C[0],ror#63
173	eor	$C[4],$C[4],$C[1],ror#63
174
175	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
176	eor	$A[1][2],$A[1][2],$C[5]
177	eor	$A[2][2],$A[2][2],$C[5]
178	eor	$A[3][2],$A[3][2],$C[5]
179	eor	$A[4][2],$A[4][2],$C[5]
180
181	eor	$A[0][0],$A[0][0],$C[4]
182	eor	$A[1][0],$A[1][0],$C[4]
183	eor	$A[2][0],$A[2][0],$C[4]
184	eor	$A[3][0],$A[3][0],$C[4]
185	eor	$A[4][0],$A[4][0],$C[4]
186___
# The next emitted instruction reloads A[0][4]/A[1][4] from the stack, which
# ends the borrowing; the aliases are cleared so that any later reference to
# $C[4] or $C[5] would interpolate as an empty string instead of a register.
187	$C[4]=undef;
188	$C[5]=undef;
189$code.=<<___;
190	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
191	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
192	eor	$A[1][3],$A[1][3],$C[2]
193	eor	$A[2][3],$A[2][3],$C[2]
194	eor	$A[3][3],$A[3][3],$C[2]
195	eor	$A[4][3],$A[4][3],$C[2]
196
197	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
198	eor	$A[1][4],$A[1][4],$C[3]
199	eor	$A[2][4],$A[2][4],$C[3]
200	eor	$A[3][4],$A[3][4],$C[3]
201	eor	$A[4][4],$A[4][4],$C[3]
202
203	////////////////////////////////////////// Rho+Pi
204	mov	$C[3],$A[0][1]
205	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
206	//mov	$C[1],$A[0][2]
207	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
208	//mov	$C[0],$A[0][3]
209	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
210	//mov	$C[2],$A[0][4]
211	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]
212
213	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
214	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
215	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
216	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]
217
218	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
219	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
220	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
221	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]
222
223	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
224	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
225	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
226	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]
227
228	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
229	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
230	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
231	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]
232
233	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
234	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
235	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
236	ror	$A[4][0],$C[1],#64-$rhotates[0][2]
237
238	////////////////////////////////////////// Chi+Iota
239	bic	$C[0],$A[0][2],$A[0][1]
240	bic	$C[1],$A[0][3],$A[0][2]
241	bic	$C[2],$A[0][0],$A[0][4]
242	bic	$C[3],$A[0][1],$A[0][0]
243	eor	$A[0][0],$A[0][0],$C[0]
244	bic	$C[0],$A[0][4],$A[0][3]
245	eor	$A[0][1],$A[0][1],$C[1]
246	 ldr	$C[1],[sp,#16]
247	eor	$A[0][3],$A[0][3],$C[2]
248	eor	$A[0][4],$A[0][4],$C[3]
249	eor	$A[0][2],$A[0][2],$C[0]
250	 ldr	$C[3],[$C[1]],#8		// Iota[i++]
251
252	bic	$C[0],$A[1][2],$A[1][1]
253	 tst	$C[1],#255			// are we done?
254	 str	$C[1],[sp,#16]
255	bic	$C[1],$A[1][3],$A[1][2]
256	bic	$C[2],$A[1][0],$A[1][4]
257	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
258	bic	$C[3],$A[1][1],$A[1][0]
259	eor	$A[1][0],$A[1][0],$C[0]
260	bic	$C[0],$A[1][4],$A[1][3]
261	eor	$A[1][1],$A[1][1],$C[1]
262	eor	$A[1][3],$A[1][3],$C[2]
263	eor	$A[1][4],$A[1][4],$C[3]
264	eor	$A[1][2],$A[1][2],$C[0]
265
266	bic	$C[0],$A[2][2],$A[2][1]
267	bic	$C[1],$A[2][3],$A[2][2]
268	bic	$C[2],$A[2][0],$A[2][4]
269	bic	$C[3],$A[2][1],$A[2][0]
270	eor	$A[2][0],$A[2][0],$C[0]
271	bic	$C[0],$A[2][4],$A[2][3]
272	eor	$A[2][1],$A[2][1],$C[1]
273	eor	$A[2][3],$A[2][3],$C[2]
274	eor	$A[2][4],$A[2][4],$C[3]
275	eor	$A[2][2],$A[2][2],$C[0]
276
277	bic	$C[0],$A[3][2],$A[3][1]
278	bic	$C[1],$A[3][3],$A[3][2]
279	bic	$C[2],$A[3][0],$A[3][4]
280	bic	$C[3],$A[3][1],$A[3][0]
281	eor	$A[3][0],$A[3][0],$C[0]
282	bic	$C[0],$A[3][4],$A[3][3]
283	eor	$A[3][1],$A[3][1],$C[1]
284	eor	$A[3][3],$A[3][3],$C[2]
285	eor	$A[3][4],$A[3][4],$C[3]
286	eor	$A[3][2],$A[3][2],$C[0]
287
288	bic	$C[0],$A[4][2],$A[4][1]
289	bic	$C[1],$A[4][3],$A[4][2]
290	bic	$C[2],$A[4][0],$A[4][4]
291	bic	$C[3],$A[4][1],$A[4][0]
292	eor	$A[4][0],$A[4][0],$C[0]
293	bic	$C[0],$A[4][4],$A[4][3]
294	eor	$A[4][1],$A[4][1],$C[1]
295	eor	$A[4][3],$A[4][3],$C[2]
296	eor	$A[4][4],$A[4][4],$C[3]
297	eor	$A[4][2],$A[4][2],$C[0]
298
299	bne	.Loop
300
301	ldr	x30,[sp,#24]
302	AARCH64_VALIDATE_LINK_REGISTER
303	ret
304.size	KeccakF1600_int,.-KeccakF1600_int
305
306.type	KeccakF1600,%function
307.align	5
308KeccakF1600:
309	AARCH64_SIGN_LINK_REGISTER
310	stp	x29,x30,[sp,#-128]!
311	add	x29,sp,#0
312	stp	x19,x20,[sp,#16]
313	stp	x21,x22,[sp,#32]
314	stp	x23,x24,[sp,#48]
315	stp	x25,x26,[sp,#64]
316	stp	x27,x28,[sp,#80]
317	sub	sp,sp,#48
318
319	str	x0,[sp,#32]			// offload argument
320	mov	$C[0],x0
321	ldp	$A[0][0],$A[0][1],[x0,#16*0]
322	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
323	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
324	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
325	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
326	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
327	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
328	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
329	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
330	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
331	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
332	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
333	ldr	$A[4][4],[$C[0],#16*12]
334
335	bl	KeccakF1600_int
336
337	ldr	$C[0],[sp,#32]
338	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
339	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
340	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
341	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
342	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
343	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
344	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
345	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
346	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
347	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
348	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
349	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
350	str	$A[4][4],[$C[0],#16*12]
351
352	ldp	x19,x20,[x29,#16]
353	add	sp,sp,#48
354	ldp	x21,x22,[x29,#32]
355	ldp	x23,x24,[x29,#48]
356	ldp	x25,x26,[x29,#64]
357	ldp	x27,x28,[x29,#80]
358	ldp	x29,x30,[sp],#128
359	AARCH64_VALIDATE_LINK_REGISTER
360	ret
361.size	KeccakF1600,.-KeccakF1600
362
363.globl	SHA3_absorb
364.type	SHA3_absorb,%function
365.align	5
366SHA3_absorb:
367	AARCH64_SIGN_LINK_REGISTER
368	stp	x29,x30,[sp,#-128]!
369	add	x29,sp,#0
370	stp	x19,x20,[sp,#16]
371	stp	x21,x22,[sp,#32]
372	stp	x23,x24,[sp,#48]
373	stp	x25,x26,[sp,#64]
374	stp	x27,x28,[sp,#80]
375	sub	sp,sp,#64
376
377	stp	x0,x1,[sp,#32]			// offload arguments
378	stp	x2,x3,[sp,#48]
379
380	mov	$C[0],x0			// uint64_t A[5][5]
381	mov	$C[1],x1			// const void *inp
382	mov	$C[2],x2			// size_t len
383	mov	$C[3],x3			// size_t bsz
384	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
385	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
386	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
387	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
388	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
389	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
390	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
391	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
392	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
393	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
394	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
395	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
396	ldr	$A[4][4],[$C[0],#16*12]
397	b	.Loop_absorb
398
399.align	4
400.Loop_absorb:
401	subs	$C[0],$C[2],$C[3]		// len - bsz
402	blo	.Labsorbed
403
404	str	$C[0],[sp,#48]			// save len - bsz
405___
# SHA3_absorb(A, inp, len, bsz), block-absorption body.  The Perl loop below
# unrolls the XOR of input words into lanes 0..23, two lanes ($i and $i+1)
# per iteration.  After lane $i, bsz is compared with 8*($i+2): "blo" leaves
# for .Lprocess_block when the block ends after lane $i, and the "beq" after
# lane $i+1 reuses the same flags (none of the instructions in between set
# them) to leave when the block ends after lane $i+1.  On big-endian builds
# (__AARCH64EB__) every loaded word is byte-reversed before the XOR.
406for (my $i=0; $i<24; $i+=2) {
407my $j = $i+1;
408$code.=<<___;
409	ldr	$C[0],[$C[1]],#8		// *inp++
410#ifdef	__AARCH64EB__
411	rev	$C[0],$C[0]
412#endif
413	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
414	cmp	$C[3],#8*($i+2)
415	blo	.Lprocess_block
416	ldr	$C[0],[$C[1]],#8		// *inp++
417#ifdef	__AARCH64EB__
418	rev	$C[0],$C[0]
419#endif
420	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
421	beq	.Lprocess_block
422___
423}
# Lane 24 (A[4][4]) is absorbed outside the loop.  .Lprocess_block then saves
# the input pointer, runs KeccakF1600_int and reloads inp, the reduced len
# and bsz from the stack before looping.  .Labsorbed writes the state back to
# the buffer whose address was saved at [sp,#32] and returns in x0 the value
# len had when it was found to be smaller than bsz, i.e. the number of bytes
# left unprocessed.
424$code.=<<___;
425	ldr	$C[0],[$C[1]],#8		// *inp++
426#ifdef	__AARCH64EB__
427	rev	$C[0],$C[0]
428#endif
429	eor	$A[4][4],$A[4][4],$C[0]
430
431.Lprocess_block:
432	str	$C[1],[sp,#40]			// save inp
433
434	bl	KeccakF1600_int
435
436	ldr	$C[1],[sp,#40]			// restore arguments
437	ldp	$C[2],$C[3],[sp,#48]
438	b	.Loop_absorb
439
440.align	4
441.Labsorbed:
442	ldr	$C[1],[sp,#32]
443	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
444	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
445	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
446	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
447	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
448	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
449	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
450	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
451	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
452	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
453	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
454	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
455	str	$A[4][4],[$C[1],#16*12]
456
457	mov	x0,$C[2]			// return value
458	ldp	x19,x20,[x29,#16]
459	add	sp,sp,#64
460	ldp	x21,x22,[x29,#32]
461	ldp	x23,x24,[x29,#48]
462	ldp	x25,x26,[x29,#64]
463	ldp	x27,x28,[x29,#80]
464	ldp	x29,x30,[sp],#128
465	AARCH64_VALIDATE_LINK_REGISTER
466	ret
467.size	SHA3_absorb,.-SHA3_absorb
468___
469{
# SHA3_squeeze(A, out, len, bsz, next): copies len bytes of the state at A to
# out, 8 bytes per loop pass, and calls KeccakF1600 each time bsz bytes have
# been taken from the current state (x3 counts them down).  A non-zero fifth
# argument (w4) branches to .Lnext_block first, so the state is permuted
# before any byte is emitted.  The arguments are kept in x19-x22 across the
# call.  When fewer than 8 bytes remain, .Lsqueeze_tail stores them one byte
# at a time, least-significant byte of the loaded lane first.
# NOTE(review): len == 0 on entry is not special-cased - the first compare
# would branch to the tail, which stores before testing len.  Presumably
# callers never pass 0; confirm against the C callers.
470my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
471$code.=<<___;
472.globl	SHA3_squeeze
473.type	SHA3_squeeze,%function
474.align	5
475SHA3_squeeze:
476	AARCH64_SIGN_LINK_REGISTER
477	stp	x29,x30,[sp,#-48]!
478	add	x29,sp,#0
479	stp	x19,x20,[sp,#16]
480	stp	x21,x22,[sp,#32]
481
482	mov	$A_flat,x0			// put aside arguments
483	mov	$out,x1
484	mov	$len,x2
485	mov	$bsz,x3
486	cmp	w4, #0				// w4 = 'next' argument
487	bne	.Lnext_block
488
489.Loop_squeeze:
490	ldr	x4,[x0],#8
491	cmp	$len,#8
492	blo	.Lsqueeze_tail
493#ifdef	__AARCH64EB__
494	rev	x4,x4
495#endif
496	str	x4,[$out],#8
497	subs	$len,$len,#8
498	beq	.Lsqueeze_done
499
500	subs	x3,x3,#8
501	bhi	.Loop_squeeze
502.Lnext_block:
503	mov	x0,$A_flat
504	bl	KeccakF1600
505	mov	x0,$A_flat
506	mov	x3,$bsz
507	b	.Loop_squeeze
508
509.align	4
510.Lsqueeze_tail:
511	strb	w4,[$out],#1
512	lsr	x4,x4,#8
513	subs	$len,$len,#1
514	beq	.Lsqueeze_done
515	strb	w4,[$out],#1
516	lsr	x4,x4,#8
517	subs	$len,$len,#1
518	beq	.Lsqueeze_done
519	strb	w4,[$out],#1
520	lsr	x4,x4,#8
521	subs	$len,$len,#1
522	beq	.Lsqueeze_done
523	strb	w4,[$out],#1
524	lsr	x4,x4,#8
525	subs	$len,$len,#1
526	beq	.Lsqueeze_done
527	strb	w4,[$out],#1
528	lsr	x4,x4,#8
529	subs	$len,$len,#1
530	beq	.Lsqueeze_done
531	strb	w4,[$out],#1
532	lsr	x4,x4,#8
533	subs	$len,$len,#1
534	beq	.Lsqueeze_done
535	strb	w4,[$out],#1
536
537.Lsqueeze_done:
538	ldp	x19,x20,[sp,#16]
539	ldp	x21,x22,[sp,#32]
540	ldp	x29,x30,[sp],#48
541	AARCH64_VALIDATE_LINK_REGISTER
542	ret
543.size	SHA3_squeeze,.-SHA3_squeeze
544___
545}								}}}
546								{{{
# Register allocation for the SHA3-extension (vector) code.  Lane A[i][j]
# lives in vector register v(5*i+j), written with the .16b arrangement.
# @C names v25-v31 as temporaries.  @D is a reordered view of five of them
# (C[4],C[5],C[6],C[2],C[3]) so that $D[0]..$D[4] name the registers into
# which the rax1 sequence of KeccakF1600_ce writes D[0]..D[4].
547my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
548                             "v".($_+3).".16b", "v".($_+4).".16b" ],
549            (0, 5, 10, 15, 20));
550
551my @C = map("v$_.16b", (25..31));
552my @D = @C[4,5,6,2,3];
553
# KeccakF1600_ce: runs 24 rounds (counter in x9) over the state held in the
# @A vector registers, loading one round constant per round through x10 with
# a post-incremented ld1r.  It sets up no stack frame and returns with a
# plain ret; x9, x10 and the @C temporaries are overwritten.
#
# The eor3/rax1/xar/bcax lines emitted here do not reach the output as
# mnemonics: the post-processing loop at the end of this script replaces each
# with a ".inst" word built by unsha3(), and rewrites ".16b" to ".2d" on the
# ld1r line.
554$code.=<<___;
555.type	KeccakF1600_ce,%function
556.align	5
557KeccakF1600_ce:
558	mov	x9,#24
559	adr	x10,iotas
560	b	.Loop_ce
561.align	4
562.Loop_ce:
563	////////////////////////////////////////////////// Theta
564	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
565	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
566	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
567	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
568	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
569	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
570	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
571	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
572	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
573	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]
574
575	rax1	$C[5],$C[0],$C[2]			// D[1]
576	rax1	$C[6],$C[1],$C[3]			// D[2]
577	rax1	$C[2],$C[2],$C[4]			// D[3]
578	rax1	$C[3],$C[3],$C[0]			// D[4]
579	rax1	$C[4],$C[4],$C[1]			// D[0]
580
581	////////////////////////////////////////////////// Theta+Rho+Pi
582	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]
583
584	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
585	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
586	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
587	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
588	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]
589
590	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]
591
592	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
593	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
594	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
595	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
596	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]
597
598	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]
599
600	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
601	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
602	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
603	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
604	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]
605
606	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]
607
608	eor	$A[0][0],$A[0][0],$D[0]
609
610	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
611	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
612	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
613	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
614	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]
615
616	////////////////////////////////////////////////// Chi+Iota
617	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
618	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
619	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
620	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
621	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]
622
623	ld1r	{$C[1]},[x10],#8
624
625	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
626	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
627	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
628	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
629	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]
630
631	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
632	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
633	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
634	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
635	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]
636
637	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
638	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
639	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
640	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
641	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]
642
643	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
644	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
645	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
646	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
647	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]
648
649	eor	$A[0][0],$A[0][0],$C[1]
650
651	subs	x9,x9,#1
652	bne	.Loop_ce
653
654	ret
655.size	KeccakF1600_ce,.-KeccakF1600_ce
656
657.type	KeccakF1600_cext,%function
658.align	5
659KeccakF1600_cext:
660	AARCH64_SIGN_LINK_REGISTER
661	stp	x29,x30,[sp,#-80]!
662	add	x29,sp,#0
663	stp	d8,d9,[sp,#16]		// per ABI requirement
664	stp	d10,d11,[sp,#32]
665	stp	d12,d13,[sp,#48]
666	stp	d14,d15,[sp,#64]
667___
# KeccakF1600_cext: wrapper around KeccakF1600_ce for a state stored in
# memory at [x0].  The prologue above saves x29/x30 and d8-d15.  The loop
# below emits twelve ldp instructions loading lanes 0..23 into d0..d23; it
# uses the package variable $i (no "my"), which is left at 24 afterwards, so
# the "#8*$i" in the following heredoc addresses lane 24.
668for($i=0; $i<24; $i+=2) {		# load A[5][5]
669my $j=$i+1;
670$code.=<<___;
671	ldp	d$i,d$j,[x0,#8*$i]
672___
673}
# "bl" overwrites x30, so the return address is reloaded from the frame
# right after the call.
674$code.=<<___;
675	ldr	d24,[x0,#8*$i]
676	bl	KeccakF1600_ce
677	ldr	x30,[sp,#8]
678___
# Mirror image of the load loop: store lanes 0..23, then lane 24.
679for($i=0; $i<24; $i+=2) {		# store A[5][5]
680my $j=$i+1;
681$code.=<<___;
682	stp	d$i,d$j,[x0,#8*$i]
683___
684}
# x30 was already reloaded above, so the epilogue pops only x29 while
# releasing the 80-byte frame.
685$code.=<<___;
686	str	d24,[x0,#8*$i]
687
688	ldp	d8,d9,[sp,#16]
689	ldp	d10,d11,[sp,#32]
690	ldp	d12,d13,[sp,#48]
691	ldp	d14,d15,[sp,#64]
692	ldr	x29,[sp],#80
693	AARCH64_VALIDATE_LINK_REGISTER
694	ret
695.size	KeccakF1600_cext,.-KeccakF1600_cext
696___
697
698{
# SHA3_absorb_cext(ctx, inp, len, bsz): absorbs whole bsz-byte blocks from
# inp into the state at ctx using the vector registers, calling
# KeccakF1600_ce after each block.  The arguments stay in x0-x3 throughout.
699my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
700
701$code.=<<___;
702.globl	SHA3_absorb_cext
703.type	SHA3_absorb_cext,%function
704.align	5
705SHA3_absorb_cext:
706	AARCH64_SIGN_LINK_REGISTER
707	stp	x29,x30,[sp,#-80]!
708	add	x29,sp,#0
709	stp	d8,d9,[sp,#16]		// per ABI requirement
710	stp	d10,d11,[sp,#32]
711	stp	d12,d13,[sp,#48]
712	stp	d14,d15,[sp,#64]
713___
# Load lanes 0..23 into d0..d23; the package variable $i ends at 24 and is
# used by the next heredoc to address lane 24.
714for($i=0; $i<24; $i+=2) {		# load A[5][5]
715my $j=$i+1;
716$code.=<<___;
717	ldp	d$i,d$j,[x0,#8*$i]
718___
719}
# len is reduced by bsz at the top of every pass; the loop is left for
# .Labsorbed_ce as soon as that subtraction borrows.
720$code.=<<___;
721	ldr	d24,[x0,#8*$i]
722	b	.Loop_absorb_ce
723
724.align	4
725.Loop_absorb_ce:
726	subs	$len,$len,$bsz		// len - bsz
727	blo	.Labsorbed_ce
728___
# Unrolled XOR of input words into lanes 0..23, two lanes per iteration,
# with v31 as the staging register.  After lane $i, bsz is compared with
# 8*($i+2): "blo" exits when the block ends after lane $i, and the "beq"
# after lane $i+1 reuses the same flags to exit when it ends after lane
# $i+1.  This loop uses a lexical $i, separate from the one used by the
# load/store loops.
729for (my $i=0; $i<24; $i+=2) {
730my $j = $i+1;
731$code.=<<___;
732	ldr	d31,[$inp],#8		// *inp++
733#ifdef	__AARCH64EB__
734	rev64	v31.16b,v31.16b
735#endif
736	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
737	cmp	$bsz,#8*($i+2)
738	blo	.Lprocess_block_ce
739	ldr	d31,[$inp],#8		// *inp++
740#ifdef	__AARCH64EB__
741	rev64	v31.16b,v31.16b
742#endif
743	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
744	beq	.Lprocess_block_ce
745___
746}
# Lane 24 is absorbed outside the loop, then the block is permuted.
747$code.=<<___;
748	ldr	d31,[$inp],#8		// *inp++
749#ifdef	__AARCH64EB__
750	rev64	v31.16b,v31.16b
751#endif
752	eor	$A[4][4],$A[4][4],v31.16b
753
754.Lprocess_block_ce:
755
756	bl	KeccakF1600_ce
757
758	b	.Loop_absorb_ce
759
760.align	4
761.Labsorbed_ce:
762___
# Store lanes 0..23 back to the state; $i is again left at 24.
763for($i=0; $i<24; $i+=2) {		# store A[5][5]
764my $j=$i+1;
765$code.=<<___;
766	stp	d$i,d$j,[x0,#8*$i]
767___
768}
# On arrival here len has already had bsz subtracted once too often, so
# adding bsz back yields the number of unprocessed bytes, which is returned
# in x0.  x30, overwritten by the calls above, is restored by the final ldp.
769$code.=<<___;
770	str	d24,[x0,#8*$i]
771	add	x0,$len,$bsz		// return value
772
773	ldp	d8,d9,[sp,#16]
774	ldp	d10,d11,[sp,#32]
775	ldp	d12,d13,[sp,#48]
776	ldp	d14,d15,[sp,#64]
777	ldp	x29,x30,[sp],#80
778	AARCH64_VALIDATE_LINK_REGISTER
779	ret
780.size	SHA3_absorb_cext,.-SHA3_absorb_cext
781___
782}
783{
# SHA3_squeeze_cext(ctx, out, len, bsz): copies len bytes of the state at ctx
# to out.  x9 walks the state and x10 counts down the bytes left in the
# current block; both are re-initialised after every KeccakF1600_cext call,
# and x30 is reloaded from the frame after the call.  The "beq" following the
# 8-byte store still sees the flags of "cmp len,#8", i.e. it exits when
# exactly 8 bytes were left.  x4 serves as the data scratch register; no
# fifth argument is examined here.  When fewer than 8 bytes remain they are
# stored one at a time, least-significant byte of the loaded lane first.
# NOTE(review): len == 0 on entry is not special-cased - the first compare
# would branch to the tail, which stores before testing len.  Presumably
# callers never pass 0; confirm against the C callers.
784my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
785$code.=<<___;
786.globl	SHA3_squeeze_cext
787.type	SHA3_squeeze_cext,%function
788.align	5
789SHA3_squeeze_cext:
790	AARCH64_SIGN_LINK_REGISTER
791	stp	x29,x30,[sp,#-16]!
792	add	x29,sp,#0
793	mov	x9,$ctx
794	mov	x10,$bsz
795
796.Loop_squeeze_ce:
797	ldr	x4,[x9],#8
798	cmp	$len,#8
799	blo	.Lsqueeze_tail_ce
800#ifdef	__AARCH64EB__
801	rev	x4,x4
802#endif
803	str	x4,[$out],#8
804	beq	.Lsqueeze_done_ce
805
806	sub	$len,$len,#8
807	subs	x10,x10,#8
808	bhi	.Loop_squeeze_ce
809
810	bl	KeccakF1600_cext
811	ldr	x30,[sp,#8]
812	mov	x9,$ctx
813	mov	x10,$bsz
814	b	.Loop_squeeze_ce
815
816.align	4
817.Lsqueeze_tail_ce:
818	strb	w4,[$out],#1
819	lsr	x4,x4,#8
820	subs	$len,$len,#1
821	beq	.Lsqueeze_done_ce
822	strb	w4,[$out],#1
823	lsr	x4,x4,#8
824	subs	$len,$len,#1
825	beq	.Lsqueeze_done_ce
826	strb	w4,[$out],#1
827	lsr	x4,x4,#8
828	subs	$len,$len,#1
829	beq	.Lsqueeze_done_ce
830	strb	w4,[$out],#1
831	lsr	x4,x4,#8
832	subs	$len,$len,#1
833	beq	.Lsqueeze_done_ce
834	strb	w4,[$out],#1
835	lsr	x4,x4,#8
836	subs	$len,$len,#1
837	beq	.Lsqueeze_done_ce
838	strb	w4,[$out],#1
839	lsr	x4,x4,#8
840	subs	$len,$len,#1
841	beq	.Lsqueeze_done_ce
842	strb	w4,[$out],#1
843
844.Lsqueeze_done_ce:
845	ldr	x29,[sp],#16
846	AARCH64_VALIDATE_LINK_REGISTER
847	ret
848.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
849___
850}								}}}
# Append an identification string to the generated assembly.  The "\@" keeps
# Perl from treating "@openssl" as an array to interpolate in the heredoc.
851$code.=<<___;
852.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
853___
854
{   my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3(MNEMONIC, OPERANDS)
    #
    # Hand-assembles one SHA3-extension instruction so that the output
    # builds with assemblers that do not know the mnemonic.  MNEMONIC is
    # one of the keys of %opcode, OPERANDS is the rest of the source line
    # (a trailing comment is allowed and is carried over).
    #
    # Returns ".inst\t0x<encoding>\t//<mnemonic> <operands>".  If OPERANDS
    # cannot be parsed, the instruction is returned as "<mnemonic>\t<operands>"
    # so the caller's substitution leaves it in the output for the assembler
    # to accept or reject, rather than replacing it with an empty string.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	# Fields: destination, first source, optional second source, and an
	# optional fourth operand that is either a register number
	# (eor3/bcax) or an immediate expression such as "64-21" (xar).
	my ($rd,$rn,$rm,$ra) =
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or return "$mnemonic\t$arg";

	$rm = 0 unless defined $rm;
	# The capture admits only digits and '-', so eval sees nothing but
	# simple integer arithmetic.
	$ra = defined $ra ? eval($ra) : 0;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
			$mnemonic,$arg;
    }
}
869
# Final pass over the accumulated assembly, one line at a time; whatever is
# printed goes to STDOUT as set up at the top of the script.
foreach (split("\n",$code)) {

	# Replace every `...` span by the result of evaluating its contents.
	$_ =~ s/\`([^\`]*)\`/eval($1)/ge;

	if ($_ =~ m/\bld1r\b/ && $_ =~ s/\.16b/.2d/g) {
		# ld1r line: the arrangement was switched to .2d; such a line
		# is not examined for SHA3-extension mnemonics.
	} else {
		# Turn eor3/rax1/xar/bcax into hand-encoded .inst words.
		$_ =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
	}

	print $_,"\n";
}

# Closing the handle is where buffered write errors surface.
close STDOUT or die "error closing STDOUT: $!";
881