#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4    1xNEON      3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%    21.8        14.1
# Cortex-A8		10.5(*)/+160%   13.9        6.35
# Cortex-A9		12.9(**)/+110%  14.3        6.50
# Cortex-A15		11.0/+40%       16.0        5.00
# Snapdragon S4		11.5/+125%      13.6        4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
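
# Example invocation (illustrative only; flavour names such as "linux32"
# are assumptions carried over from other perlasm scripts, the build
# system being the authority):
#
#	perl chacha-armv4.pl linux32 chacha-armv4.S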

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
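# For example, a call such as &vadd_i32("q0","q0","q1") resolves here:
# the package prefix is stripped, the first underscore becomes a dot,
# and the line
#
#	vadd.i32	q0,q0,q1
#
# is appended to $code. A numeric last argument, as in &vshr_u32($b,$t,20),
# is given a '#' prefix to form an immediate operand.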

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which variables are addressed by their
	# index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of the 'd's are maintained in memory.
	# If you observe the 'c' column, you'll notice that the pair of
	# 'c's in use is invariant between rounds. This means that they
	# have to be reloaded only once per round, in the middle; hence
	# the bunch of 'c' stores and loads there, and none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for offloading to memory, to make the
	# loads count more.
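	#
	# For reference, each quarter-round generated below follows the
	# standard ChaCha definition (RFC 7539), two quarter-rounds being
	# interleaved at a time:
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<= 8;
	#	c += d; b ^= c; b <<<= 7;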
							push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')"		);
							push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')"		) if ($odd);
							push @ret,(
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')"		);
							push @ret,(
	 "&str	($xd_,'[sp,#4*(16+$d1)]')",
	 "&ldr	($xd_,'[sp,#4*(16+$d3)]')"		) if (!$odd);
							push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')"	);

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);
							push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	 "&str	($xc_,'[sp,#4*(16+$c1)]')",
	 "&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	 "&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	 "&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')"	);

	@ret;
}
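
# A note on the rotations emitted by ROUND: ARM can only rotate right, so
# "x = (x ^ y) <<< n" is rendered as the pair
#
#	mov	x,x,ror#(32-n)
#	eor	x,x,y,ror#(32-n)
#
# which is valid because ror(x,k) ^ ror(y,k) == ror(x ^ y,k) and a right
# rotation by 32-n equals a left rotation by n; the ror#20 pairs above,
# for instance, implement the <<<12 step.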

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.text

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
# endif
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
# if !defined(_WIN32)
	ldr	r4,[r14,r4]
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12,  [sp,#4*(32+1)]	@ save inp
	str	r14,  [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have the first half of the 512-bit result in
	@ @x[0-7] and the second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}		@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]		@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]		@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]		@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1			@ next counter value
	strhi	@t[0],[sp,#4*(12)]		@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]		@ zero or ...
	ldrhsb	@t[0],[r12],#16			@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]		@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]		@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16		@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]		@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]		@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}		@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]		@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64			@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}
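
# NEON has no vector rotate, so each "(x ^ y) <<< n" above is built from
# a shift pair: vshr.u32 extracts the bits that wrap around and vsli.32
# shifts the value left, inserting them below, e.g. (register numbers
# purely illustrative):
#
#	vshr.u32	q1,q4,#20
#	vsli.32		q1,q4,#12	@ q1 = q4 <<< 12
#
# The <<<16 case is cheaper: vrev32.16 swaps the 16-bit halves of each
# lane. The trailing vext.8 lines rotate the b/c/d rows so that the same
# column arithmetic computes the diagonal round.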

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr		r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb		sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr		r14,.Lsigma
	vstmdb		sp!,{d8-d15}		@ ABI spec says so
	stmdb		sp!,{r0-r3}

	vld1.32		{$b0-$c0},[r3]		@ load key
	ldmia		r3,{r4-r11}		@ load key

	sub		sp,sp,#4*(16+16)
	vld1.32		{$d0},[r12]		@ load counter and nonce
	add		r12,sp,#4*8
	ldmia		r14,{r0-r3}		@ load sigma
	vld1.32		{$a0},[r14]!		@ load sigma
	vld1.32		{$t0},[r14]		@ one
	vst1.32		{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32		{$a0-$b0},[sp]		@ copy sigma|1/2key

	str		r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str		r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr		$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr		$t1#lo,[sp,#4*(16+2)]
	vmov		$a1,$a0
	vstr		$t2#lo,[sp,#4*(16+4)]
	vmov		$a2,$a0
	vmov		$b1,$b0
	vmov		$b2,$b0
	b		.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia		sp,{r0-r9}		@ load key material
	cmp		@t[3],#64*2		@ if len<=64*2
	bls		.Lbreak_neon		@ switch to integer-only
	vmov		$a1,$a0
	str		@t[3],[sp,#4*(32+2)]	@ save len
	vmov		$a2,$a0
	str		r12,  [sp,#4*(32+1)]	@ save inp
	vmov		$b1,$b0
	str		r14,  [sp,#4*(32+0)]	@ save out
	vmov		$b2,$b0
.Loop_neon_enter:
	ldr		@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr		@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov		$c1,$c0
	ldr		@t[2], [sp,#4*(13)]
	vmov		$c2,$c0
	ldr		@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0		@ counter+2
	str		@t[3], [sp,#4*(16+15)]
	mov		@t[3],#10
	add		@x[12],@x[12],#3	@ counter+3
	b		.Loop_neon

.align	4
.Loop_neon:
	subs		@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
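
	# This realizes the "3xNEON+1xIALU" scheme from the table at the
	# top of the file: three blocks are processed in NEON registers
	# while a fourth goes through the integer-only ROUND, the two
	# instruction streams interleaved so that the scalar and NEON
	# units can proceed concurrently.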

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne		.Loop_neon

	add		@t[3],sp,#32
	vld1.32		{$t0-$t1},[sp]		@ load key material
	vld1.32		{$t2-$t3},[@t[3]]

	ldr		@t[3],[sp,#4*(32+2)]	@ load len

	str		@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str		@t[1], [sp,#4*(16+9)]
	str		@x[12],[sp,#4*(16+12)]
	str		@t[2], [sp,#4*(16+13)]
	str		@x[14],[sp,#4*(16+14)]

	@ at this point we have the first half of the 512-bit result in
	@ @x[0-7] and the second half at sp+4*(16+8)

	ldr		r12,[sp,#4*(32+1)]	@ load inp
	ldr		r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0		@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp		@t[3],#64*4
	blo		.Ltail_neon

	vld1.8		{$t0-$t1},[r12]!	@ load input
	 mov		@t[3],sp
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0		@ xor with input
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	 vst1.8		{$a0-$b0},[r14]!	@ store output
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	 veor		$t0#hi,$t0#hi,$t0#hi
	 vldr		$t0#lo,[sp,#4*(16+4)]	@ four
	veor		$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor		$c2,$c2,$t2
	 vst1.8		{$a1-$b1},[r14]!
	veor		$d2,$d2,$t3
	 vst1.8		{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	 vst1.8		{$a2-$b2},[r14]!
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
	 vst1.8		{$c2-$d2},[r14]!
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]	@ xor with input
	 add		@t[0],sp,#4*(4)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[5],@x[5],@t[1]
	ldr		@t[1],[r12,#-12]
	add		@x[6],@x[6],@t[2]
	ldr		@t[2],[r12,#-8]
	add		@x[7],@x[7],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
	 add		@t[0],sp,#4*(8)
	eor		@x[5],@x[5],@t[1]
	str		@x[4],[r14],#16		@ store output
	eor		@x[6],@x[6],@t[2]
	str		@x[5],[r14,#-12]
	eor		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[6],[r14,#-8]
	 add		@x[0],sp,#4*(16+8)
	str		@x[7],[r14,#-4]

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	ldr		@t[0],[r12],#16		@ load input
	add		@x[1],@x[1],@t[1]
	ldr		@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add		@x[2],@x[2],@t[2]
	ldr		@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi		@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add		@x[3],@x[3],@t[3]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
# endif
	eor		@x[0],@x[0],@t[0]
	 add		@t[0],sp,#4*(12)
	eor		@x[1],@x[1],@t[1]
	str		@x[0],[r14],#16		@ store output
	eor		@x[2],@x[2],@t[2]
	str		@x[1],[r14,#-12]
	eor		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
	str		@x[2],[r14,#-8]
	str		@x[3],[r14,#-4]

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],@t[0],#4		@ next counter value
	add		@x[5],@x[5],@t[1]
	 str		@t[0],[sp,#4*(12)]	@ save next counter value
	ldr		@t[0],[r12],#16		@ load input
	add		@x[6],@x[6],@t[2]
	 add		@x[4],@x[4],#3		@ counter+3
	ldr		@t[1],[r12,#-12]
	add		@x[7],@x[7],@t[3]
	ldr		@t[2],[r12,#-8]
	ldr		@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	eor		@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi		@t[0],[sp,#4*(32+2)]	@ re-load len
	eor		@x[5],@x[5],@t[1]
	eor		@x[6],@x[6],@t[2]
	str		@x[4],[r14],#16		@ store output
	eor		@x[7],@x[7],@t[3]
	str		@x[5],[r14,#-12]
	 sub		@t[3],@t[0],#64*4	@ len-=64*4
	str		@x[6],[r14,#-8]
	str		@x[7],[r14,#-4]
	bhi		.Loop_neon_outer

	b		.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str		@t[3], [sp,#4*(20+32+2)]	@ save len
	 add		@t[3],sp,#4*(32+4)
	str		r12,   [sp,#4*(20+32+1)]	@ save inp
	str		r14,   [sp,#4*(20+32+0)]	@ save out

	ldr		@x[12],[sp,#4*(16+10)]
	ldr		@x[14],[sp,#4*(16+11)]
	 vldmia		@t[3],{d8-d15}			@ fulfill ABI requirement
	str		@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str		@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr		@t[3], [sp,#4*(15)]
	ldr		@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr		@t[2], [sp,#4*(13)]
	ldr		@x[14],[sp,#4*(14)]
	str		@t[3], [sp,#4*(20+16+15)]
	add		@t[3],sp,#4*(20)
	vst1.32		{$a0-$b0},[@t[3]]!		@ copy key
	add		sp,sp,#4*(20)			@ switch frame
	vst1.32		{$c0-$d0},[@t[3]]
	mov		@t[3],#10
	b		.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp		@t[3],#64*3
	bhs		.L192_or_more_neon
	cmp		@t[3],#64*2
	bhs		.L128_or_more_neon
	cmp		@t[3],#64*1
	bhs		.L64_or_more_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a0-$b0},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c0-$d0},[@t[0]]
	b		.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vst1.8		{$a0-$b0},[r14]!
	vst1.8		{$c0-$d0},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a1-$b1},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c1-$d1},[@t[0]]
	sub		@t[3],@t[3],#64*1	@ len-=64*1
	b		.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	 vst1.8		{$a0-$b0},[r14]!
	veor		$c1,$c1,$t2
	 vst1.8		{$c0-$d0},[r14]!
	veor		$d1,$d1,$t3
	vst1.8		{$a1-$b1},[r14]!
	vst1.8		{$c1-$d1},[r14]!

	beq		.Ldone_neon

	add		@t[0],sp,#4*(8)
	vst1.8		{$a2-$b2},[sp]
	add		@t[2],sp,#4*(0)
	vst1.8		{$c2-$d2},[@t[0]]
	sub		@t[3],@t[3],#64*2	@ len-=64*2
	b		.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8		{$t0-$t1},[r12]!
	vld1.8		{$t2-$t3},[r12]!
	veor		$a0,$a0,$t0
	veor		$b0,$b0,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c0,$c0,$t2
	veor		$d0,$d0,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a1,$a1,$t0
	veor		$b1,$b1,$t1
	vld1.8		{$t0-$t1},[r12]!
	veor		$c1,$c1,$t2
	 vst1.8		{$a0-$b0},[r14]!
	veor		$d1,$d1,$t3
	vld1.8		{$t2-$t3},[r12]!

	veor		$a2,$a2,$t0
	 vst1.8		{$c0-$d0},[r14]!
	veor		$b2,$b2,$t1
	 vst1.8		{$a1-$b1},[r14]!
	veor		$c2,$c2,$t2
	 vst1.8		{$c1-$d1},[r14]!
	veor		$d2,$d2,$t3
	vst1.8		{$a2-$b2},[r14]!
	vst1.8		{$c2-$d2},[r14]!

	beq		.Ldone_neon

	ldmia		sp,{@t[0]-@t[3]}	@ load key material
	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(4)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		sp,{@x[0]-@x[7]}
	 add		@x[0],sp,#4*(16+8)

	ldmia		@x[0],{@x[0]-@x[7]}	@ load second half

	add		@x[0],@x[0],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(12)
	add		@x[1],@x[1],@t[1]
	add		@x[2],@x[2],@t[2]
	add		@x[3],@x[3],@t[3]
	 ldmia		@t[0],{@t[0]-@t[3]}	@ load key material

	add		@x[4],@x[4],@t[0]	@ accumulate key material
	 add		@t[0],sp,#4*(8)
	add		@x[5],@x[5],@t[1]
	 add		@x[4],@x[4],#3		@ counter+3
	add		@x[6],@x[6],@t[2]
	add		@x[7],@x[7],@t[3]
	 ldr		@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev		@x[0],@x[0]
	rev		@x[1],@x[1]
	rev		@x[2],@x[2]
	rev		@x[3],@x[3]
	rev		@x[4],@x[4]
	rev		@x[5],@x[5]
	rev		@x[6],@x[6]
	rev		@x[7],@x[7]
# endif
	stmia		@t[0],{@x[0]-@x[7]}
	 add		@t[2],sp,#4*(0)
	 sub		@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb		@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb		@t[1],[r12],#1		@ read input
	subs		@t[3],@t[3],#1
	eor		@t[0],@t[0],@t[1]
	strb		@t[0],[r14],#1		@ store output
	bne		.Loop_tail_neon

.Ldone_neon:
	add		sp,sp,#4*(32+4)
	vldmia		sp,{d8-d15}
	add		sp,sp,#4*(16+3)
	ldmia		sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P
#endif
___
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";