xref: /openssl/crypto/sha/asm/keccak1600-armv4.pl (revision 69d4d528)
1#!/usr/bin/env perl
2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11# project. The module is, however, dual licensed under OpenSSL and
12# CRYPTOGAMS licenses depending on where you obtain it. For further
13# details see http://www.openssl.org/~appro/cryptogams/.
14# ====================================================================
15#
16# Keccak-1600 for ARMv4.
17#
18# June 2017.
19#
20# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21# interleaving. How does it compare to Keccak Code Package? It's as
22# fast, but several times smaller, and is endian- and ISA-neutral. ISA
23# neutrality means that minimum ISA requirement is ARMv4, yet it can
24# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25# register layout taken from Keccak Code Package. It's also as fast,
26# in fact faster by 10-15% on some processors, and endian-neutral.
27#
28# August 2017.
29#
30# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31# of rotate instructions with logical ones. This resulted in ~10%
32# improvement on most processors. Switch to KECCAK_2X effectively
33# minimizes re-loads from temporary storage, and merged rotates just
34# eliminate corresponding instructions. As for latter. When examining
35# code you'll notice commented ror instructions. These are eliminated
36# ones, and you should trace destination register below to see what's
37# going on. Just in case, why not all rotates are eliminated. Trouble
38# is that you have operations that require both inputs to be rotated,
39# e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41# that takes 'a' as input. And thing is that this next operation can
42# be in next round. It's totally possible to "carry" rotate "factors"
43# to the next round, but it makes code more complex. And the last word
44# is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
45# time being]...
46#
47# Reduce per-round instruction count in Thumb-2 case by 16%. This is
48# achieved by folding ldr/str pairs to their double-word counterparts.
49# Theoretically this should have improved performance on single-issue
50# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
51# usual...
52#
53########################################################################
54# Numbers are cycles per processed byte. Non-NEON results account even
55# for input bit interleaving.
56#
57#		r=1088(*)   Thumb-2(**) NEON
58#
59# ARM11xx	82/+150%
60# Cortex-A5	88/+160%,   86,         36
61# Cortex-A7	78/+160%,   68,         34
62# Cortex-A8	51/+230%,   57,         30
63# Cortex-A9	53/+210%,   51,         26
64# Cortex-A15	42/+160%,   38,         18
65# Snapdragon S4	43/+210%,   38,         24
66#
67# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
68#	over compiler-generated KECCAK_2X reference code.
69# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71#	processors are presented mostly for reference purposes.
72
# Command-line convention shared by the perlasm generators:
#   $output  - the last argument, but only when it looks like a file
#              name (i.e. has an extension);
#   $flavour - the first argument, but only when it does NOT look like
#              a file name (e.g. "linux32", "ios32", "void").
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop(@ARGV) : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift(@ARGV) : undef;

if ($flavour && $flavour ne "void") {
    # Pipe everything we print through arm-xlate.pl, which adapts the
    # "universal" assembly below to the requested flavour.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ($xlate) = grep { -f $_ }
               ("${dir}arm-xlate.pl", "${dir}../../perlasm/arm-xlate.pl");
    $xlate or die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # No translator requested: emit straight to $output when given.
    open STDOUT,">$output" if $output;
}
89
my @C = map { "r$_" } 0..9;		# primary scratch registers r0-r9
my @E = map { "r$_" } 10..12, 14;	# r10-r12 plus lr (r14)

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...

# Byte offsets (relative to sp) of the 64-bit lanes of A, D and T in the
# layout pictured above: A starts at sp+0, D at sp+200, T at sp+240.
my (@A, @T);
for my $row (0..4) {
    $A[$row] = [ map { 8*(     5*$row + $_) } 0..4 ];
    $T[$row] = [ map { 8*(30 + 5*$row + $_) } 0..4 ];
}
my @D = map { 8*$_ } 25..29;
114
# File preamble plus the KeccakF1600_int inner routine.
#
# iotas32 holds the 24 Keccak-f[1600] round constants, each stored as two
# 32-bit words in the bit-interleaved representation used throughout the
# non-NEON code path (see "bit interleaving" in the header above).
#
# KeccakF1600_int expects the state A[5][5] already laid out on the stack
# (per the stack layout above).  It saves lr at sp+440, zeroes the round
# counter at sp+444 and falls into the two-rounds-per-iteration loop.
# KeccakF1600_enter is an alternative entry point used when the caller has
# already pre-loaded A[4][2..4] into @C[4]-@C[9].
$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___
# Round(@src_rows, @dst_rows): emit the assembly for ONE Keccak-f[1600]
# round that reads the state lanes at the stack offsets given by the five
# @src row-refs and writes the permuted lanes to the offsets given by the
# five @dst row-refs (bound to @R below).  The KECCAK_2X scheme calls this
# twice per loop iteration — Round(@A,@T) then Round(@T,@A) — so the state
# ping-pongs between the A and T stack areas and no copy-back is needed.
# Note the commented-out "@ ror" instructions: those rotates are merged
# into a later instruction's shifted operand, as explained in the header.
sub Round {
# @_ carries ten array refs: first five are the source rows, rest the
# destination rows.
my (@A,@R); (@A[0..4],@R) = @_;

$code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	 add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	 add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	 add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	 add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	 add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	 add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	 add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	 add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	 ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	 ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	 str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	 str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] =       A[0][0] ^ C[0]; /* rotate by 0 */  /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	 ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	 adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	 add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___
# First round of the unrolled pair (A -> T): fetch iotas[i] as-is; the
# counter is left untouched here.
$code.=<<___	if ($A[0][0] != $T[0][0]);
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
# Second round of the pair (T -> A): fetch iotas[i+1] (offset 8), advance
# the counter by two table entries (16 bytes) and compare against
# 24*8 = 192 so the caller-emitted "blo .Lround2x" keeps looping.
$code.=<<___	if ($A[0][0] == $T[0][0]);
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
$code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	 ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]);   /* D[1] */
	 ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	 add	@E[3],sp,#$D[3]
	 ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	 ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	 add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	 eor	@E[2],@E[2],@C[8]
	 eor	@E[3],@E[3],@C[9]
	 ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	 ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	 add	@E[2],sp,#$D[1]
	 ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	 ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	 ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	 ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	 add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]

	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@ [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	 add	@E[3],sp,#$D[2]
	 ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	 ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	 add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	 add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}
	# Emit the two-rounds-per-iteration loop body: A->T then T->A, so
	# the state is back in the A area after each .Lround2x iteration.
	Round(@A,@T);
	Round(@T,@A);
# Tail of KeccakF1600_int (loop branch on the counter compare emitted by
# the second Round) and the KeccakF1600 wrapper, which copies the caller's
# A[5][5] (r0) to the stack, runs the permutation and copies it back.
$code.=<<___;
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0,    {@C[0]-@C[9]}		@ copy A[5][5] to stack
	stmia	sp,    {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]		@ restore pointer to A
	ldmia	sp,    {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}		@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
___
# SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
#             size_t bsz): absorb full bsz-byte blocks of inp into the
# state, bit-interleaving each 8 input bytes before XOR-ing them into the
# on-stack copy of A, and running KeccakF1600_int after each block.
# Returns (in r0) the number of bytes left over, i.e. len modulo the
# block size progression; partial trailing blocks are left unabsorbed.
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....

$code.=<<___;
.global	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stmdb	sp!,{r0-r12,lr}
	sub	sp,sp,#456+16

	add	$A_flat,r0,#$A[1][0]
	@ mov	$inp,r1
	mov	$len,r2
	mov	$bsz,r3
	cmp	r2,r3
	blo	.Labsorb_abort

	add	$inp,sp,#0
	ldmia	r0,      {@C[0]-@C[9]}	@ copy A[5][5] to stack
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp!,   {@C[0]-@C[9]}
	ldmia	$A_flat!,{@C[0]-@C[9]}
	stmia	$inp,    {@C[0]-@C[9]}

	ldr	$inp,[sp,#476]		@ restore $inp
#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	str	r9,[sp,#468]
	str	r8,[sp,#464]
	str	r7,[sp,#460]
	str	r6,[sp,#456]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	r0,$len,$bsz
	blo	.Labsorbed
	add	$A_flat,sp,#0
	str	r0,[sp,#480]		@ save len - bsz

.align	4
.Loop_block:
	ldrb	r0,[$inp],#1
	ldrb	r1,[$inp],#1
	ldrb	r2,[$inp],#1
	ldrb	r3,[$inp],#1
	ldrb	r4,[$inp],#1
	orr	r0,r0,r1,lsl#8
	ldrb	r1,[$inp],#1
	orr	r0,r0,r2,lsl#16
	ldrb	r2,[$inp],#1
	orr	r0,r0,r3,lsl#24		@ lo
	ldrb	r3,[$inp],#1
	orr	r1,r4,r1,lsl#8
	orr	r1,r1,r2,lsl#16
	orr	r1,r1,r3,lsl#24		@ hi

	and	r2,r0,r6		@ &=0x55555555
	and	r0,r0,r6,lsl#1		@ &=0xaaaaaaaa
	and	r3,r1,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa
	orr	r2,r2,r2,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r1,r1,r1,lsl#1
	and	r2,r2,r7		@ &=0x33333333
	and	r0,r0,r7,lsl#2		@ &=0xcccccccc
	and	r3,r3,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r1,r1,r1,lsl#2
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r0,r0,r8,lsl#4		@ &=0xf0f0f0f0
	and	r3,r3,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	ldmia	$A_flat,{r4-r5}		@ A_flat[i]
	orr	r2,r2,r2,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r1,r1,r1,lsl#4
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r0,r0,r9,lsl#8		@ &=0xff00ff00
	and	r3,r3,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r1,r1,r1,lsl#8

	lsl	r2,r2,#16
	lsr	r1,r1,#16
	eor	r4,r4,r3,lsl#16
	eor	r5,r5,r0,lsr#16
	eor	r4,r4,r2,lsr#16
	eor	r5,r5,r1,lsl#16
	stmia	$A_flat!,{r4-r5}	@ A_flat[i++] ^= BitInterleave(inp[0..7])

	subs	$bsz,$bsz,#8
	bhi	.Loop_block

	str	$inp,[sp,#476]

	bl	KeccakF1600_int

	add	r14,sp,#456
	ldmia	r14,{r6-r12,r14}	@ restore constants and variables
	b	.Loop_absorb

.align	4
.Labsorbed:
	add	$inp,sp,#$A[1][0]
	ldmia	sp,      {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}	@ return A[5][5]
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp!,   {@C[0]-@C[9]}
	stmia	$A_flat!,{@C[0]-@C[9]}
	ldmia	$inp,    {@C[0]-@C[9]}
	stmia	$A_flat, {@C[0]-@C[9]}

.Labsorb_abort:
	add	sp,sp,#456+32
	mov	r0,$len			@ return value
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_absorb,.-SHA3_absorb
___
}
935
{ my ($out,$len,$A_flat,$bsz,$next) = map("r$_", (4,5,10,12,0));
# Register aliases for the non-NEON squeeze routine: $out=r4 (output
# pointer), $len=r5 (bytes still requested), $A_flat=r10 (pointer into
# the state), $bsz=r12 (bytes left in the current block), $next=r0
# (flag: permute the state before the first extraction).

# void SHA3_squeeze(uint64_t A[5][5],
#                   unsigned char *out, size_t len, size_t r, int next)
#
# The first 4 parameters are passed in via r0..r3,
# next is passed on the stack [sp, #0]
#
# Each 64-bit lane of the state is kept bit-interleaved (even/odd bits
# split across the two 32-bit halves), so the shift-and-mask ladder in
# .Loop_squeeze below de-interleaves a lane before its bytes are stored
# out one at a time (endian-neutral).

$code.=<<___;
.global	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stmdb	sp!,{r0,r3-r10,lr}

	mov	$A_flat,r0
	mov	$out,r1
	mov	$len,r2
	mov	$bsz,r3
	ldr	$next, [sp, #40]  @ next is after the 10 pushed registers (10*4)

#ifdef	__thumb2__
	mov	r9,#0x00ff00ff
	mov	r8,#0x0f0f0f0f
	mov	r7,#0x33333333
	mov	r6,#0x55555555
#else
	mov	r6,#0x11		@ compose constants
	mov	r8,#0x0f
	mov	r9,#0xff
	orr	r6,r6,r6,lsl#8
	orr	r8,r8,r8,lsl#8
	orr	r6,r6,r6,lsl#16		@ 0x11111111
	orr	r9,r9,r9,lsl#16		@ 0x00ff00ff
	orr	r8,r8,r8,lsl#16		@ 0x0f0f0f0f
	orr	r7,r6,r6,lsl#1		@ 0x33333333
	orr	r6,r6,r6,lsl#2		@ 0x55555555
#endif
	stmdb	sp!,{r6-r9}

	mov	r14,$A_flat
	cmp	$next, #1
	beq	.Lnext_block
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldmia	$A_flat!,{r0,r1}	@ A_flat[i++]

	lsl	r2,r0,#16
	lsl	r3,r1,#16		@ r3 = r1 << 16
	lsr	r2,r2,#16		@ r2 = r0 & 0x0000ffff
	lsr	r1,r1,#16
	lsr	r0,r0,#16		@ r0 = r0 >> 16
	lsl	r1,r1,#16		@ r1 = r1 & 0xffff0000

	orr	r2,r2,r2,lsl#8
	orr	r3,r3,r3,lsr#8
	orr	r0,r0,r0,lsl#8
	orr	r1,r1,r1,lsr#8
	and	r2,r2,r9		@ &=0x00ff00ff
	and	r3,r3,r9,lsl#8		@ &=0xff00ff00
	and	r0,r0,r9		@ &=0x00ff00ff
	and	r1,r1,r9,lsl#8		@ &=0xff00ff00
	orr	r2,r2,r2,lsl#4
	orr	r3,r3,r3,lsr#4
	orr	r0,r0,r0,lsl#4
	orr	r1,r1,r1,lsr#4
	and	r2,r2,r8		@ &=0x0f0f0f0f
	and	r3,r3,r8,lsl#4		@ &=0xf0f0f0f0
	and	r0,r0,r8		@ &=0x0f0f0f0f
	and	r1,r1,r8,lsl#4		@ &=0xf0f0f0f0
	orr	r2,r2,r2,lsl#2
	orr	r3,r3,r3,lsr#2
	orr	r0,r0,r0,lsl#2
	orr	r1,r1,r1,lsr#2
	and	r2,r2,r7		@ &=0x33333333
	and	r3,r3,r7,lsl#2		@ &=0xcccccccc
	and	r0,r0,r7		@ &=0x33333333
	and	r1,r1,r7,lsl#2		@ &=0xcccccccc
	orr	r2,r2,r2,lsl#1
	orr	r3,r3,r3,lsr#1
	orr	r0,r0,r0,lsl#1
	orr	r1,r1,r1,lsr#1
	and	r2,r2,r6		@ &=0x55555555
	and	r3,r3,r6,lsl#1		@ &=0xaaaaaaaa
	and	r0,r0,r6		@ &=0x55555555
	and	r1,r1,r6,lsl#1		@ &=0xaaaaaaaa

	orr	r2,r2,r3
	orr	r0,r0,r1

	cmp	$len,#8
	blo	.Lsqueeze_tail
	lsr	r1,r2,#8
	strb	r2,[$out],#1
	lsr	r3,r2,#16
	strb	r1,[$out],#1
	lsr	r2,r2,#24
	strb	r3,[$out],#1
	strb	r2,[$out],#1

	lsr	r1,r0,#8
	strb	r0,[$out],#1
	lsr	r3,r0,#16
	strb	r1,[$out],#1
	lsr	r0,r0,#24
	strb	r3,[$out],#1
	strb	r0,[$out],#1
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	$bsz,$bsz,#8		@ bsz -= 8
	bhi	.Loop_squeeze
.Lnext_block:
	mov	r0,r14			@ original $A_flat

	bl	KeccakF1600

	ldmia	sp,{r6-r10,r12}		@ restore constants and variables
	mov	r14,$A_flat
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	lsr	r2,r2,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r2,[$out],#1
	subs	$len,$len,#1
	beq	.Lsqueeze_done

	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out],#1
	lsr	r0,r0,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	r0,[$out]
	b	.Lsqueeze_done

.align	4
.Lsqueeze_done:
	add	sp,sp,#24
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr		@ be binary compatible with V4, yet
	bx	lr		@ interoperable with Thumb ISA:-)
#endif
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
1103
# NEON code path (guarded by __ARM_MAX_ARCH__>=7), comprising:
#   - iotas64:           the 24 round constants as 64-bit .quad data;
#   - KeccakF1600_neon:  the permutation, with the 25 lanes held in
#                        d0-d24 and d31 used as scratch by the callers
#                        (register layout per the header comment);
#   - SHA3_absorb_neon / SHA3_squeeze_neon: absorb/squeeze wrappers
#                        using endian-neutral vld1.8/vst1.8 byte I/O.
# NOTE(review): the "ret" below is rewritten to "bx lr" by the
# post-processing loop at the end of this file.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	iotas64, %object
.align 5
iotas64:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas64,.-iotas64

.type	KeccakF1600_neon, %function
.align	5
KeccakF1600_neon:
	add	r1, r0, #16
	adr	r2, iotas64
	mov	r3, #24			@ loop counter
	b	.Loop_neon

.align	4
.Loop_neon:
	@ Theta
	vst1.64		{q4},  [r0,:64]		@ offload A[0..1][4]
	veor		q13, q0,  q5		@ A[0..1][0]^A[2..3][0]
	vst1.64		{d18}, [r1,:64]		@ offload A[2][4]
	veor		q14, q1,  q6		@ A[0..1][1]^A[2..3][1]
	veor		q15, q2,  q7		@ A[0..1][2]^A[2..3][2]
	veor		d26, d26, d27		@ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
	veor		d27, d28, d29		@ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
	veor		q14, q3,  q8		@ A[0..1][3]^A[2..3][3]
	veor		q4,  q4,  q9		@ A[0..1][4]^A[2..3][4]
	veor		d30, d30, d31		@ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
	veor		d31, d28, d29		@ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
	veor		d25, d8,  d9		@ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
	veor		q13, q13, q10		@ C[0..1]^=A[4][0..1]
	veor		q14, q15, q11		@ C[2..3]^=A[4][2..3]
	veor		d25, d25, d24		@ C[4]^=A[4][4]

	vadd.u64	q4,  q13, q13		@ C[0..1]<<1
	vadd.u64	q15, q14, q14		@ C[2..3]<<1
	vadd.u64	d18, d25, d25		@ C[4]<<1
	vsri.u64	q4,  q13, #63		@ ROL64(C[0..1],1)
	vsri.u64	q15, q14, #63		@ ROL64(C[2..3],1)
	vsri.u64	d18, d25, #63		@ ROL64(C[4],1)
	veor		d25, d25, d9		@ D[0] = C[4] ^= ROL64(C[1],1)
	veor		q13, q13, q15		@ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
	veor		d28, d28, d18		@ D[3] = C[2] ^= ROL64(C[4],1)
	veor		d29, d29, d8		@ D[4] = C[3] ^= ROL64(C[0],1)

	veor		d0,  d0,  d25		@ A[0][0] ^= C[4]
	veor		d1,  d1,  d25		@ A[1][0] ^= C[4]
	veor		d10, d10, d25		@ A[2][0] ^= C[4]
	veor		d11, d11, d25		@ A[3][0] ^= C[4]
	veor		d20, d20, d25		@ A[4][0] ^= C[4]

	veor		d2,  d2,  d26		@ A[0][1] ^= D[1]
	veor		d3,  d3,  d26		@ A[1][1] ^= D[1]
	veor		d12, d12, d26		@ A[2][1] ^= D[1]
	veor		d13, d13, d26		@ A[3][1] ^= D[1]
	veor		d21, d21, d26		@ A[4][1] ^= D[1]
	vmov		d26, d27

	veor		d6,  d6,  d28		@ A[0][3] ^= C[2]
	veor		d7,  d7,  d28		@ A[1][3] ^= C[2]
	veor		d16, d16, d28		@ A[2][3] ^= C[2]
	veor		d17, d17, d28		@ A[3][3] ^= C[2]
	veor		d23, d23, d28		@ A[4][3] ^= C[2]
	vld1.64		{q4},  [r0,:64]		@ restore A[0..1][4]
	vmov		d28, d29

	vld1.64		{d18}, [r1,:64]		@ restore A[2][4]
	veor		q2,  q2,  q13		@ A[0..1][2] ^= D[2]
	veor		q7,  q7,  q13		@ A[2..3][2] ^= D[2]
	veor		d22, d22, d27		@ A[4][2]    ^= D[2]

	veor		q4,  q4,  q14		@ A[0..1][4] ^= C[3]
	veor		q9,  q9,  q14		@ A[2..3][4] ^= C[3]
	veor		d24, d24, d29		@ A[4][4]    ^= C[3]

	@ Rho + Pi
	vmov		d26, d2			@ C[1] = A[0][1]
	vshl.u64	d2,  d3,  #44
	vmov		d27, d4			@ C[2] = A[0][2]
	vshl.u64	d4,  d14, #43
	vmov		d28, d6			@ C[3] = A[0][3]
	vshl.u64	d6,  d17, #21
	vmov		d29, d8			@ C[4] = A[0][4]
	vshl.u64	d8,  d24, #14
	vsri.u64	d2,  d3,  #64-44	@ A[0][1] = ROL64(A[1][1], rhotates[1][1])
	vsri.u64	d4,  d14, #64-43	@ A[0][2] = ROL64(A[2][2], rhotates[2][2])
	vsri.u64	d6,  d17, #64-21	@ A[0][3] = ROL64(A[3][3], rhotates[3][3])
	vsri.u64	d8,  d24, #64-14	@ A[0][4] = ROL64(A[4][4], rhotates[4][4])

	vshl.u64	d3,  d9,  #20
	vshl.u64	d14, d16, #25
	vshl.u64	d17, d15, #15
	vshl.u64	d24, d21, #2
	vsri.u64	d3,  d9,  #64-20	@ A[1][1] = ROL64(A[1][4], rhotates[1][4])
	vsri.u64	d14, d16, #64-25	@ A[2][2] = ROL64(A[2][3], rhotates[2][3])
	vsri.u64	d17, d15, #64-15	@ A[3][3] = ROL64(A[3][2], rhotates[3][2])
	vsri.u64	d24, d21, #64-2		@ A[4][4] = ROL64(A[4][1], rhotates[4][1])

	vshl.u64	d9,  d22, #61
	@ vshl.u64	d16, d19, #8
	vshl.u64	d15, d12, #10
	vshl.u64	d21, d7,  #55
	vsri.u64	d9,  d22, #64-61	@ A[1][4] = ROL64(A[4][2], rhotates[4][2])
	vext.8		d16, d19, d19, #8-1	@ A[2][3] = ROL64(A[3][4], rhotates[3][4])
	vsri.u64	d15, d12, #64-10	@ A[3][2] = ROL64(A[2][1], rhotates[2][1])
	vsri.u64	d21, d7,  #64-55	@ A[4][1] = ROL64(A[1][3], rhotates[1][3])

	vshl.u64	d22, d18, #39
	@ vshl.u64	d19, d23, #56
	vshl.u64	d12, d5,  #6
	vshl.u64	d7,  d13, #45
	vsri.u64	d22, d18, #64-39	@ A[4][2] = ROL64(A[2][4], rhotates[2][4])
	vext.8		d19, d23, d23, #8-7	@ A[3][4] = ROL64(A[4][3], rhotates[4][3])
	vsri.u64	d12, d5,  #64-6		@ A[2][1] = ROL64(A[1][2], rhotates[1][2])
	vsri.u64	d7,  d13, #64-45	@ A[1][3] = ROL64(A[3][1], rhotates[3][1])

	vshl.u64	d18, d20, #18
	vshl.u64	d23, d11, #41
	vshl.u64	d5,  d10, #3
	vshl.u64	d13, d1,  #36
	vsri.u64	d18, d20, #64-18	@ A[2][4] = ROL64(A[4][0], rhotates[4][0])
	vsri.u64	d23, d11, #64-41	@ A[4][3] = ROL64(A[3][0], rhotates[3][0])
	vsri.u64	d5,  d10, #64-3		@ A[1][2] = ROL64(A[2][0], rhotates[2][0])
	vsri.u64	d13, d1,  #64-36	@ A[3][1] = ROL64(A[1][0], rhotates[1][0])

	vshl.u64	d1,  d28, #28
	vshl.u64	d10, d26, #1
	vshl.u64	d11, d29, #27
	vshl.u64	d20, d27, #62
	vsri.u64	d1,  d28, #64-28	@ A[1][0] = ROL64(C[3],    rhotates[0][3])
	vsri.u64	d10, d26, #64-1		@ A[2][0] = ROL64(C[1],    rhotates[0][1])
	vsri.u64	d11, d29, #64-27	@ A[3][0] = ROL64(C[4],    rhotates[0][4])
	vsri.u64	d20, d27, #64-62	@ A[4][0] = ROL64(C[2],    rhotates[0][2])

	@ Chi + Iota
	vbic		q13, q2,  q1
	vbic		q14, q3,  q2
	vbic		q15, q4,  q3
	veor		q13, q13, q0		@ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
	veor		q14, q14, q1		@ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
	veor		q2,  q2,  q15		@ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vst1.64		{q13}, [r0,:64]		@ offload A[0..1][0]
	vbic		q13, q0,  q4
	vbic		q15, q1,  q0
	vmov		q1,  q14		@ A[0..1][1]
	veor		q3,  q3,  q13		@ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	veor		q4,  q4,  q15		@ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vbic		q13, q7,  q6
	vmov		q0,  q5			@ A[2..3][0]
	vbic		q14, q8,  q7
	vmov		q15, q6			@ A[2..3][1]
	veor		q5,  q5,  q13		@ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vbic		q13, q9,  q8
	veor		q6,  q6,  q14		@ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vbic		q14, q0,  q9
	veor		q7,  q7,  q13		@ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic		q13, q15, q0
	veor		q8,  q8,  q14		@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov		q14, q10		@ A[4][0..1]
	veor		q9,  q9,  q13		@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64		d25, [r2,:64]!		@ Iota[i++]
	vbic		d26, d22, d21
	vbic		d27, d23, d22
	vld1.64		{q0}, [r0,:64]		@ restore A[0..1][0]
	veor		d20, d20, d26		@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic		d26, d24, d23
	veor		d21, d21, d27		@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic		d27, d28, d24
	veor		d22, d22, d26		@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic		d26, d29, d28
	veor		d23, d23, d27		@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor		d0,  d0,  d25		@ A[0][0] ^= Iota[i]
	veor		d24, d24, d26		@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs	r3, r3, #1
	bne	.Loop_neon

	ret
.size	KeccakF1600_neon,.-KeccakF1600_neon

.global	SHA3_absorb_neon
.type	SHA3_absorb_neon, %function
.align	5
SHA3_absorb_neon:
	stmdb	sp!, {r4-r6,lr}
	vstmdb	sp!, {d8-d15}

	mov	r4, r1			@ inp
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz

	vld1.32	{d0}, [r0,:64]!		@ A[0][0]
	vld1.32	{d2}, [r0,:64]!		@ A[0][1]
	vld1.32	{d4}, [r0,:64]!		@ A[0][2]
	vld1.32	{d6}, [r0,:64]!		@ A[0][3]
	vld1.32	{d8}, [r0,:64]!		@ A[0][4]

	vld1.32	{d1}, [r0,:64]!		@ A[1][0]
	vld1.32	{d3}, [r0,:64]!		@ A[1][1]
	vld1.32	{d5}, [r0,:64]!		@ A[1][2]
	vld1.32	{d7}, [r0,:64]!		@ A[1][3]
	vld1.32	{d9}, [r0,:64]!		@ A[1][4]

	vld1.32	{d10}, [r0,:64]!		@ A[2][0]
	vld1.32	{d12}, [r0,:64]!		@ A[2][1]
	vld1.32	{d14}, [r0,:64]!		@ A[2][2]
	vld1.32	{d16}, [r0,:64]!		@ A[2][3]
	vld1.32	{d18}, [r0,:64]!		@ A[2][4]

	vld1.32	{d11}, [r0,:64]!		@ A[3][0]
	vld1.32	{d13}, [r0,:64]!		@ A[3][1]
	vld1.32	{d15}, [r0,:64]!		@ A[3][2]
	vld1.32	{d17}, [r0,:64]!		@ A[3][3]
	vld1.32	{d19}, [r0,:64]!		@ A[3][4]

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32	{d24}, [r0,:64]		@ A[4][4]
	sub	r0, r0, #24*8		@ rewind
	b	.Loop_absorb_neon

.align	4
.Loop_absorb_neon:
	subs	r12, r5, r6		@ len - bsz
	blo	.Labsorbed_neon
	mov	r5, r12

	vld1.8	{d31}, [r4]!		@ endian-neutral loads...
	cmp	r6, #8*2
	veor	d0, d0, d31		@ A[0][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d2, d2, d31		@ A[0][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*4
	veor	d4, d4, d31		@ A[0][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d6, d6, d31		@ A[0][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31},[r4]!
	cmp	r6, #8*6
	veor	d8, d8, d31		@ A[0][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d1, d1, d31		@ A[1][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*8
	veor	d3, d3, d31		@ A[1][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d5, d5, d31		@ A[1][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*10
	veor	d7, d7, d31		@ A[1][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d9, d9, d31		@ A[1][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*12
	veor	d10, d10, d31		@ A[2][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d12, d12, d31		@ A[2][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*14
	veor	d14, d14, d31		@ A[2][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d16, d16, d31		@ A[2][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*16
	veor	d18, d18, d31		@ A[2][4] ^= *inp++
	blo	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	veor	d11, d11, d31		@ A[3][0] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*18
	veor	d13, d13, d31		@ A[3][1] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d15, d15, d31		@ A[3][2] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*20
	veor	d17, d17, d31		@ A[3][3] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d19, d19, d31		@ A[3][4] ^= *inp++
	beq	.Lprocess_neon

	vld1.8	{d31}, [r4]!
	cmp	r6, #8*22
	veor	d20, d20, d31		@ A[4][0] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d21, d21, d31		@ A[4][1] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	cmp	r6, #8*24
	veor	d22, d22, d31		@ A[4][2] ^= *inp++
	blo	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d23, d23, d31		@ A[4][3] ^= *inp++
	beq	.Lprocess_neon
	vld1.8	{d31}, [r4]!
	veor	d24, d24, d31		@ A[4][4] ^= *inp++

.Lprocess_neon:
	bl	KeccakF1600_neon
	b 	.Loop_absorb_neon

.align	4
.Labsorbed_neon:
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32	{d24}, [r0,:64]

	mov	r0, r5			@ return value
	vldmia	sp!, {d8-d15}
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_absorb_neon,.-SHA3_absorb_neon

.global	SHA3_squeeze_neon
.type	SHA3_squeeze_neon, %function
.align	5
SHA3_squeeze_neon:
	stmdb	sp!, {r4-r6,lr}

	mov	r4, r1			@ out
	mov	r5, r2			@ len
	mov	r6, r3			@ bsz
	mov	r12, r0			@ A_flat
	mov	r14, r3			@ bsz
	b	.Loop_squeeze_neon

.align	4
.Loop_squeeze_neon:
	cmp	r5, #8
	blo	.Lsqueeze_neon_tail
	vld1.32	{d0}, [r12]!
	vst1.8	{d0}, [r4]!		@ endian-neutral store

	subs	r5, r5, #8		@ len -= 8
	beq	.Lsqueeze_neon_done

	subs	r14, r14, #8		@ bsz -= 8
	bhi	.Loop_squeeze_neon

	vstmdb	sp!,  {d8-d15}

	vld1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vld1.32	{d2}, [r0,:64]!
	vld1.32	{d4}, [r0,:64]!
	vld1.32	{d6}, [r0,:64]!
	vld1.32	{d8}, [r0,:64]!

	vld1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vld1.32	{d3}, [r0,:64]!
	vld1.32	{d5}, [r0,:64]!
	vld1.32	{d7}, [r0,:64]!
	vld1.32	{d9}, [r0,:64]!

	vld1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vld1.32	{d12}, [r0,:64]!
	vld1.32	{d14}, [r0,:64]!
	vld1.32	{d16}, [r0,:64]!
	vld1.32	{d18}, [r0,:64]!

	vld1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vld1.32	{d13}, [r0,:64]!
	vld1.32	{d15}, [r0,:64]!
	vld1.32	{d17}, [r0,:64]!
	vld1.32	{d19}, [r0,:64]!

	vld1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32	{d24}, [r0,:64]
	sub	r0, r0, #24*8		@ rewind

	bl	KeccakF1600_neon

	mov	r12, r0			@ A_flat
	vst1.32	{d0}, [r0,:64]!		@ A[0][0..4]
	vst1.32	{d2}, [r0,:64]!
	vst1.32	{d4}, [r0,:64]!
	vst1.32	{d6}, [r0,:64]!
	vst1.32	{d8}, [r0,:64]!

	vst1.32	{d1}, [r0,:64]!		@ A[1][0..4]
	vst1.32	{d3}, [r0,:64]!
	vst1.32	{d5}, [r0,:64]!
	vst1.32	{d7}, [r0,:64]!
	vst1.32	{d9}, [r0,:64]!

	vst1.32	{d10}, [r0,:64]!		@ A[2][0..4]
	vst1.32	{d12}, [r0,:64]!
	vst1.32	{d14}, [r0,:64]!
	vst1.32	{d16}, [r0,:64]!
	vst1.32	{d18}, [r0,:64]!

	vst1.32	{d11}, [r0,:64]!		@ A[3][0..4]
	vst1.32	{d13}, [r0,:64]!
	vst1.32	{d15}, [r0,:64]!
	vst1.32	{d17}, [r0,:64]!
	vst1.32	{d19}, [r0,:64]!

	vst1.32	{d20-d23}, [r0,:64]!	@ A[4][0..4]
	mov	r14, r6			@ bsz
	vst1.32	{d24}, [r0,:64]
	mov	r0,  r12		@ rewind

	vldmia	sp!, {d8-d15}
	b	.Loop_squeeze_neon

.align	4
.Lsqueeze_neon_tail:
	ldmia	r12, {r2,r3}
	cmp	r5, #2
	strb	r2, [r4],#1		@ endian-neutral store
	lsr	r2, r2, #8
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	beq	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	lsr	r2, r2, #8
	cmp	r5, #4
	blo	.Lsqueeze_neon_done
	strb	r2, [r4], #1
	beq	.Lsqueeze_neon_done

	strb	r3, [r4], #1
	lsr	r3, r3, #8
	cmp	r5, #6
	blo	.Lsqueeze_neon_done
	strb	r3, [r4], #1
	lsr	r3, r3, #8
	beq	.Lsqueeze_neon_done
	strb	r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia	sp!, {r4-r6,pc}
.size	SHA3_squeeze_neon,.-SHA3_squeeze_neon
#endif
.asciz	"Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
1611
1612{
1613    my %ldr, %str;
1614
1615    sub ldrd {
1616	my ($mnemonic,$half,$reg,$ea) = @_;
1617	my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
1618
1619	if ($half eq "l") {
1620	    $$op{reg} = $reg;
1621	    $$op{ea}  = $ea;
1622	    sprintf "#ifndef	__thumb2__\n"	.
1623		    "	%s\t%s,%s\n"		.
1624		    "#endif", $mnemonic,$reg,$ea;
1625	} else {
1626	    sprintf "#ifndef	__thumb2__\n"	.
1627		    "	%s\t%s,%s\n"		.
1628		    "#else\n"			.
1629		    "	%sd\t%s,%s,%s\n"	.
1630		    "#endif",	$mnemonic,$reg,$ea,
1631				$mnemonic,$$op{reg},$reg,$$op{ea};
1632	}
1633    }
1634}
1635
# Final pass: post-process and print the generated assembly line by
# line.  The substitutions are chained with "or", so the first rule
# that matches a line short-circuits the remaining ones -- the order
# is significant.
foreach (split($/,$code)) {
	s/\`([^\`]*)\`/eval $1/ge;	# expand `...` as Perl expressions

	# 1) fold a recorded ldr.l/str.l low half together with its
	#    ldr.h/str.h high half into ldrd/strd for Thumb-2 (see ldrd()
	#    above);
	# 2) rewrite standalone ror/lsr/lsl into the canonical
	#    mov-with-shifted-operand form;
	# 3) "ret" -> "bx lr", which 4) encodes as a raw .word so the
	#    result still assembles with -march=armv4.
	s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
	s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov	$2$1#/g or
	s/\bret\b/bx	lr/g		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
1648