#! /usr/bin/env perl
# Copyright 2016-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# April 2019
#
# Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the
# fastest option on most(*), but not all, processors, yet 6+2 is
# retained. This is because its penalties are considered tolerable in
# comparison to the improvement on processors where 6+2 helps. Most
# notably +37% on ThunderX2, a server-oriented processor which has to
# serve as many requests as possible. The others are mostly client
# processors, where performance doesn't have to be absolute top-notch,
# just fast enough, as the majority of time is spent "entertaining" a
# relatively slow human.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	4xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	2.72		1.60
# Cortex-A53		8.40/+80%	4.06		4.45(*)
# Cortex-A57		8.06/+43%	4.15		4.40(*)
# Denver		4.50/+82%	2.30		2.70(*)
# X-Gene		9.50/+46%	8.20		8.90(*)
# Mongoose		8.00/+44%	2.74		3.12(*)
# Kryo			8.17/+50%	4.47		4.65(*)
# ThunderX2		7.22/+48%	5.64		4.10
#
# (*)	slower than 4+1 :-(

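# For reference, a minimal pure-Perl sketch of the ChaCha20 quarter-round
# that all of the code paths below implement (rotates by 16, 12, 8 and 7
# per RFC 7539); illustrative only, not used by this module:
sub quarter_round_ref {
	my ($a,$b,$c,$d) = @_;
	my $rotl = sub { my ($x,$n) = @_;
			 (($x<<$n) | ($x>>(32-$n))) & 0xffffffff; };
	$a = ($a+$b) & 0xffffffff; $d = $rotl->($d^$a,16);
	$c = ($c+$d) & 0xffffffff; $b = $rotl->($b^$c,12);
	$a = ($a+$b) & 0xffffffff; $d = $rotl->($d^$a,8);
	$c = ($c+$d) & 0xffffffff; $b = $rotl->($b^$c,7);
	return ($a,$b,$c,$d);
}
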
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
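# A typical invocation passes the perlasm flavour and then the output file,
# e.g. (output file name here is just for illustration):
#	perl chacha-armv8.pl linux64 chacha-armv8.S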

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
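# For example, &ror_32("x5","x5",16) appends "\tror.32\tx5,x5,#16" to $code;
# the post-processing loop at the bottom of this file then drops the ".32"
# suffix and renames the registers, emitting "ror	w5,w5,#16".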

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
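# ($_&~3)+(($_+1)&3) keeps each index within its aligned group of four while
# rotating the low two bits, so ROUND(0,4,8,12) expands to the four column
# quarter-rounds (0,4,8,12)...(3,7,11,15) and ROUND(0,5,10,15) to the four
# diagonal ones, with the four instances interleaved below for ILP.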

    (
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	 "&ror_32	(@x[$d1],@x[$d1],16)",
	  "&ror_32	(@x[$d2],@x[$d2],16)",
	   "&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	 "&ror_32	(@x[$b1],@x[$b1],20)",
	  "&ror_32	(@x[$b2],@x[$b2],20)",
	   "&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	 "&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	  "&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	   "&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	 "&ror_32	(@x[$d1],@x[$d1],24)",
	  "&ror_32	(@x[$d2],@x[$d2],24)",
	   "&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	 "&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	  "&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	   "&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	 "&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	  "&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	   "&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	 "&ror_32	(@x[$b1],@x[$b1],25)",
	  "&ror_32	(@x[$b2],@x[$b2],25)",
	   "&ror_32	(@x[$b3],@x[$b3],25)"
    );
}

$code.=<<___;
#include "arm_arch.h"
#ifndef	__KERNEL__
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.extern ChaCha20_ctr32_sve
#endif

.text

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,2,3,4
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
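// .Lrot24 above is a tbl byte-shuffle mask that rotates each 32-bit lane
// right by 24 (i.e. left by 8), a one-instruction alternative to the
// ushr+sli pair for this byte-aligned rotate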
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"

.globl	ChaCha20_ctr32_dflt
.type	ChaCha20_ctr32_dflt,%function
.align	5
ChaCha20_ctr32_dflt:
	AARCH64_SIGN_LINK_REGISTER
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
.Lcheck_neon:
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__AARCH64EB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
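	# one pass of column rounds plus one of diagonal rounds is a ChaCha
	# "double round"; .Loop runs it 10 times for the full 20 rounds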
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_SIGN_LINK_REGISTER
	cbz	$len,.Labort
	cmp	$len,#192
	b.lo	.Lshort
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV8_SVE
	b.eq	.Lcheck_neon
	stp	x29,x30,[sp,#-16]!
	sub	sp,sp,#16
	// SVE handling will inevitably increment the counter, and the
	// NEON/scalar code that follows to process the tail data needs
	// the updated value. Unfortunately the input counter buffer
	// pointed to by ctr is meant to be read-only per the API
	// contract, so copy it to the stack where SVE can update it.
	ldp	x5,x6,[$ctr]
	stp	x5,x6,[sp]
	mov	$ctr,sp
	bl	ChaCha20_ctr32_sve
	cbz	$len,1f
	bl	ChaCha20_ctr32_dflt
1:
	add	sp,sp,#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
	b	.Lshort
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my @K = map("v$_.4s",(0..3));
my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;
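# In this layout every one of the 16 ChaCha state words occupies its own
# vector register, with four consecutive 64-byte blocks carried in the four
# 32-bit lanes, so NEON_lane_ROUND below works lane-wise on 4 blocks at once.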

sub NEON_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("'$_'",@X);

	(
	"&add		(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	 "&add		(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	  "&add		(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	   "&add	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&eor		(@x[$d0],@x[$d0],@x[$a0])",
	 "&eor		(@x[$d1],@x[$d1],@x[$a1])",
	  "&eor		(@x[$d2],@x[$d2],@x[$a2])",
	   "&eor	(@x[$d3],@x[$d3],@x[$a3])",
	"&rev32_16	(@x[$d0],@x[$d0])",
	 "&rev32_16	(@x[$d1],@x[$d1])",
	  "&rev32_16	(@x[$d2],@x[$d2])",
	   "&rev32_16	(@x[$d3],@x[$d3])",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	 "&add		(@x[$c1],@x[$c1],@x[$d1])",
	  "&add		(@x[$c2],@x[$c2],@x[$d2])",
	   "&add	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	 "&eor		('$xt1',@x[$b1],@x[$c1])",
	  "&eor		('$xt2',@x[$b2],@x[$c2])",
	   "&eor	('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',20)",
	 "&ushr		(@x[$b1],'$xt1',20)",
	  "&ushr	(@x[$b2],'$xt2',20)",
	   "&ushr	(@x[$b3],'$xt3',20)",
	"&sli		(@x[$b0],'$xt0',12)",
	 "&sli		(@x[$b1],'$xt1',12)",
	  "&sli		(@x[$b2],'$xt2',12)",
	   "&sli	(@x[$b3],'$xt3',12)",

	"&add		(@x[$a0],@x[$a0],@x[$b0])",
	 "&add		(@x[$a1],@x[$a1],@x[$b1])",
	  "&add		(@x[$a2],@x[$a2],@x[$b2])",
	   "&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor		('$xt0',@x[$d0],@x[$a0])",
	 "&eor		('$xt1',@x[$d1],@x[$a1])",
	  "&eor		('$xt2',@x[$d2],@x[$a2])",
	   "&eor	('$xt3',@x[$d3],@x[$a3])",
	"&tbl		(@x[$d0],'{$xt0}','$ROT24')",
	 "&tbl		(@x[$d1],'{$xt1}','$ROT24')",
	  "&tbl		(@x[$d2],'{$xt2}','$ROT24')",
	   "&tbl	(@x[$d3],'{$xt3}','$ROT24')",

	"&add		(@x[$c0],@x[$c0],@x[$d0])",
	 "&add		(@x[$c1],@x[$c1],@x[$d1])",
	  "&add		(@x[$c2],@x[$c2],@x[$d2])",
	   "&add	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor		('$xt0',@x[$b0],@x[$c0])",
	 "&eor		('$xt1',@x[$b1],@x[$c1])",
	  "&eor		('$xt2',@x[$b2],@x[$c2])",
	   "&eor	('$xt3',@x[$b3],@x[$c3])",
	"&ushr		(@x[$b0],'$xt0',25)",
	 "&ushr		(@x[$b1],'$xt1',25)",
	  "&ushr	(@x[$b2],'$xt2',25)",
	   "&ushr	(@x[$b3],'$xt3',25)",
	"&sli		(@x[$b0],'$xt0',7)",
	 "&sli		(@x[$b1],'$xt1',7)",
	  "&sli		(@x[$b2],'$xt2',7)",
	   "&sli	(@x[$b3],'$xt3',7)"
	);
}
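# NEON has no vector rotate, so the 12- and 7-bit rotates are synthesized
# as ushr+sli pairs, while the byte-aligned 16- and 24-bit rotates are done
# with single rev32 (on .8h) and tbl (with .Lrot24) instructions.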

$code.=<<___;

#ifdef	__KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{$CTR,$ROT24},[@x[0]]
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer_neon:
	dup	$xa0,@{K[0]}[0]			// unpack key block
	 mov.32	@x[0],@d[0]
	dup	$xa1,@{K[0]}[1]
	 lsr	@x[1],@d[0],#32
	dup	$xa2,@{K[0]}[2]
	 mov.32	@x[2],@d[1]
	dup	$xa3,@{K[0]}[3]
	 lsr	@x[3],@d[1],#32
	dup	$xb0,@{K[1]}[0]
	 mov.32	@x[4],@d[2]
	dup	$xb1,@{K[1]}[1]
	 lsr	@x[5],@d[2],#32
	dup	$xb2,@{K[1]}[2]
	 mov.32	@x[6],@d[3]
	dup	$xb3,@{K[1]}[3]
	 lsr	@x[7],@d[3],#32
	dup	$xd0,@{K[3]}[0]
	 mov.32	@x[8],@d[4]
	dup	$xd1,@{K[3]}[1]
	 lsr	@x[9],@d[4],#32
	dup	$xd2,@{K[3]}[2]
	 mov.32	@x[10],@d[5]
	dup	$xd3,@{K[3]}[3]
	 lsr	@x[11],@d[5],#32
	add	$xd0,$xd0,$CTR
	 mov.32	@x[12],@d[6]
	dup	$xc0,@{K[2]}[0]
	 lsr	@x[13],@d[6],#32
	dup	$xc1,@{K[2]}[1]
	 mov.32	@x[14],@d[7]
	dup	$xc2,@{K[2]}[2]
	 lsr	@x[15],@d[7],#32
	dup	$xc3,@{K[2]}[3]

	mov	$ctr,#10
	subs	$len,$len,#320
.Loop_neon:
	sub	$ctr,$ctr,#1
___
	my @plus_one=&ROUND(0,4,8,12);
	foreach (&NEON_lane_ROUND(0,4,8,12))  { eval; eval(shift(@plus_one)); }

	@plus_one=&ROUND(0,5,10,15);
	foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
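	# one scalar instruction from ROUND is emitted per NEON instruction,
	# interleaving the 4-lane vector work with a 5th block on the IALU:
	# this is the "4xNEON+1xIALU" path from the performance table above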
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add	$xd0,$xd0,$CTR

	zip1	$xt0,$xa0,$xa1			// transpose data
	zip1	$xt1,$xa2,$xa3
	zip2	$xt2,$xa0,$xa1
	zip2	$xt3,$xa2,$xa3
	zip1.64	$xa0,$xt0,$xt1
	zip2.64	$xa1,$xt0,$xt1
	zip1.64	$xa2,$xt2,$xt3
	zip2.64	$xa3,$xt2,$xt3

	zip1	$xt0,$xb0,$xb1
	zip1	$xt1,$xb2,$xb3
	zip2	$xt2,$xb0,$xb1
	zip2	$xt3,$xb2,$xb3
	zip1.64	$xb0,$xt0,$xt1
	zip2.64	$xb1,$xt0,$xt1
	zip1.64	$xb2,$xt2,$xt3
	zip2.64	$xb3,$xt2,$xt3

	zip1	$xt0,$xc0,$xc1
	 add.32	@x[0],@x[0],@d[0]		// accumulate key block
	zip1	$xt1,$xc2,$xc3
	 add	@x[1],@x[1],@d[0],lsr#32
	zip2	$xt2,$xc0,$xc1
	 add.32	@x[2],@x[2],@d[1]
	zip2	$xt3,$xc2,$xc3
	 add	@x[3],@x[3],@d[1],lsr#32
	zip1.64	$xc0,$xt0,$xt1
	 add.32	@x[4],@x[4],@d[2]
	zip2.64	$xc1,$xt0,$xt1
	 add	@x[5],@x[5],@d[2],lsr#32
	zip1.64	$xc2,$xt2,$xt3
	 add.32	@x[6],@x[6],@d[3]
	zip2.64	$xc3,$xt2,$xt3
	 add	@x[7],@x[7],@d[3],lsr#32

	zip1	$xt0,$xd0,$xd1
	 add.32	@x[8],@x[8],@d[4]
	zip1	$xt1,$xd2,$xd3
	 add	@x[9],@x[9],@d[4],lsr#32
	zip2	$xt2,$xd0,$xd1
	 add.32	@x[10],@x[10],@d[5]
	zip2	$xt3,$xd2,$xd3
	 add	@x[11],@x[11],@d[5],lsr#32
	zip1.64	$xd0,$xt0,$xt1
	 add.32	@x[12],@x[12],@d[6]
	zip2.64	$xd1,$xt0,$xt1
	 add	@x[13],@x[13],@d[6],lsr#32
	zip1.64	$xd2,$xt2,$xt3
	 add.32	@x[14],@x[14],@d[7]
	zip2.64	$xd3,$xt2,$xt3
	 add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$xa0,$xa0,@K[0]			// accumulate key block
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$xb0,$xb0,@K[1]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$xc0,$xc0,@K[2]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$xd0,$xd0,@K[3]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	 add	$xa1,$xa1,@K[0]
	eor	@x[2],@x[2],@x[3]
	 add	$xb1,$xb1,@K[1]
	eor	@x[4],@x[4],@x[5]
	 add	$xc1,$xc1,@K[2]
	eor	@x[6],@x[6],@x[7]
	 add	$xd1,$xd1,@K[3]
	eor	@x[8],@x[8],@x[9]
	 eor	$xa0,$xa0,$xt0
	 movi	$xt0,#5
	eor	@x[10],@x[10],@x[11]
	 eor	$xb0,$xb0,$xt1
	eor	@x[12],@x[12],@x[13]
	 eor	$xc0,$xc0,$xt2
	eor	@x[14],@x[14],@x[15]
	 eor	$xd0,$xd0,$xt3
	 add	$CTR,$CTR,$xt0			// += 5
	 ld1.8	{$xt0-$xt3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#5			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	st1.8	{$xa0-$xd0},[$out],#64
	 add	$xa2,$xa2,@K[0]
	 add	$xb2,$xb2,@K[1]
	 add	$xc2,$xc2,@K[2]
	 add	$xd2,$xd2,@K[3]
	ld1.8	{$xa0-$xd0},[$inp],#64

	eor	$xa1,$xa1,$xt0
	eor	$xb1,$xb1,$xt1
	eor	$xc1,$xc1,$xt2
	eor	$xd1,$xd1,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	 add	$xa3,$xa3,@K[0]
	 add	$xb3,$xb3,@K[1]
	 add	$xc3,$xc3,@K[2]
	 add	$xd3,$xd3,@K[3]
	ld1.8	{$xa1-$xd1},[$inp],#64

	eor	$xa2,$xa2,$xa0
	eor	$xb2,$xb2,$xb0
	eor	$xc2,$xc2,$xc0
	eor	$xd2,$xd2,$xd0
	st1.8	{$xa2-$xd2},[$out],#64

	eor	$xa3,$xa3,$xa1
	eor	$xb3,$xb3,$xb1
	eor	$xc3,$xc3,$xc1
	eor	$xd3,$xd3,$xd1
	st1.8	{$xa3-$xd3},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail_neon:
	add	$len,$len,#320
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	$xa0,$xa0,@K[0]			// accumulate key block
	stp	@x[4],@x[6],[$out,#16]
	 add	$xb0,$xb0,@K[1]
	stp	@x[8],@x[10],[$out,#32]
	 add	$xc0,$xc0,@K[2]
	stp	@x[12],@x[14],[$out,#48]
	 add	$xd0,$xd0,@K[3]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa0,$xa0,$xt0
	eor	$xb0,$xb0,$xt1
	eor	$xc0,$xc0,$xt2
	eor	$xd0,$xd0,$xt3
	st1.8	{$xa0-$xd0},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa1,@K[0]
	add	$xb0,$xb1,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc1,@K[2]
	cmp	$len,#64
	add	$xd0,$xd1,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa1,$xa0,$xt0
	eor	$xb1,$xb0,$xt1
	eor	$xc1,$xc0,$xt2
	eor	$xd1,$xd0,$xt3
	st1.8	{$xa1-$xd1},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa2,@K[0]
	add	$xb0,$xb2,@K[1]
	sub	$len,$len,#64
	add	$xc0,$xc2,@K[2]
	cmp	$len,#64
	add	$xd0,$xd2,@K[3]
	b.lo	.Last_neon

	ld1.8	{$xt0-$xt3},[$inp],#64
	eor	$xa2,$xa0,$xt0
	eor	$xb2,$xb0,$xt1
	eor	$xc2,$xc0,$xt2
	eor	$xd2,$xd0,$xt3
	st1.8	{$xa2-$xd2},[$out],#64
	b.eq	.Ldone_neon

	add	$xa0,$xa3,@K[0]
	add	$xb0,$xb3,@K[1]
	add	$xc0,$xc3,@K[2]
	add	$xd0,$xd3,@K[3]
	sub	$len,$len,#64

.Last_neon:
	st1.8	{$xa0-$xd0},[sp]

	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my @K = map("v$_.4s",(0..6));
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
my $rot24 = @K[6];
my $ONE = "v7.4s";

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&tbl		('$d','{$d}','$rot24')",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
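# Unlike NEON_lane_ROUND, NEONROUND keeps one whole 64-byte block per group
# of four registers and uses ext to rotate the b, c and d rows between the
# column ($odd==0) and diagonal ($odd==1) halves of a double round.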

$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	eor	$ONE,$ONE,$ONE
	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE}[0],[@x[0]]
	add	$key,@x[0],#16			// .Lrot24
#ifdef	__AARCH64EB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	 mov	$A0,@K[0]
	 mov	$A1,@K[0]
	 mov	$A2,@K[0]
	 mov	$A3,@K[0]
	 mov	$A4,@K[0]
	 mov	$A5,@K[0]
	 mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	 mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	 mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	 mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	 mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	 mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	 mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	 mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	 mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	 mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	 mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	 mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	 add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	 add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	 mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	 mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	 mov	$C4,@K[2]
	 stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	 mov	$C5,@K[2]
	 stp	@K[5],@K[6],[sp,#80]

	mov	$ctr,#5
	ld1	{$rot24},[$key]
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
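	# six one-block vector "threads" run five double rounds per half while
	# the scalar block does all ten, so each .Loop_outer_512_neon iteration
	# yields 6 NEON + 2 IALU blocks = 512 bytes: the "6xNEON+2xIALU" path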

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	 stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	 stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	 stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	 stp	@x[12],@x[14],[$out,#48]
	 add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval;			eval(shift(@thread67));
		eval(shift(@thread1));	eval(shift(@thread67));
		eval(shift(@thread2));	eval(shift(@thread67));
		eval(shift(@thread3));	eval(shift(@thread67));
		eval(shift(@thread4));	eval(shift(@thread67));
		eval(shift(@thread5));	eval(shift(@thread67));
	}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	 ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	 ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	 ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	 ldr	@K[6],[sp,#96]
	 add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	 add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	 add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	 add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	 add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	 add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	 add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	 add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	 add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	 add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	 add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	 add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	 add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	 add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	 add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	 add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	 add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	 add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	 add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	 add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	 add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	 add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	 add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	 add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	 add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	 add	$B5,$B5,@K[1]

#ifdef	__AARCH64EB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	 eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	 eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	 eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	 eor	$D0,$D0,$T3
	 ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	 add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	 ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	 ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$ONE,$ONE,#1			// 4 -> 2

	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],@K[0],[sp,#0]		// wipe off-load area
	stp	@K[0],@K[0],[sp,#32]
	stp	@K[0],@K[0],[sp,#64]

	b.eq	.Ldone_512_neon

	sub	$key,$key,#16			// .Lone
	cmp	$len,#192
	add	sp,sp,#128
	sub	@K[3],@K[3],$ONE		// -= 2
	ld1	{$CTR,$ROT24},[$key]
	b.hs	.Loop_outer_neon

	ldp	d8,d9,[sp,#0]			// meet ABI requirements
	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}

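# Post-process $code, translating the shorthand used above into real AArch64
# syntax: ".32" ops move onto w-registers; eor/ext/mov/tbl use .16b
# arrangements; "ld1.8"/"st1.8" lose the suffix and use .16b; ldr/ldp/str/stp
# of vector registers take the q-form; dup/ld1 lane selectors become .s[n];
# "zip1.64"/"zip2.64" use .2d; and "rev32.16" becomes rev32 on .8h.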
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1))	or
	(s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush