#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with a bunch of ldrb instructions
#	loading the data;
# (**)	these are trade-off results; they can be improved by ~8%, but at
#	the cost of a 15/12% regression on Cortex-A5/A7; it's even possible
#	to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}
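
# The build system normally drives this script; as an illustrative manual
# invocation (the flavour and output file name below are examples only):
#
#   perl poly1305-armv4.pl linux32 poly1305-armv4.S
#   perl poly1305-armv4.pl void		# emit untranslated source to stdout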

($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
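# As a rough guide (the authoritative prototypes live in the C caller), these
# argument registers correspond to:
#
#   poly1305_init(ctx, key[16], func[2])     - $inp doubles as the key pointer
#   poly1305_blocks(ctx, inp, len, padbit)
#   poly1305_emit(ctx, mac[16], nonce[4])    - remapped to r0-r2 further down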

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ is_base2_26
	add	$ctx,$ctx,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1	@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);
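# Note: $s1-$s3 alias $r1-$r3; the block loop overwrites them with
# r_i + (r_i>>2), i.e. 5*r_i/4 (key clamping guarantees the low two bits of
# r1..r3 are zero).  These are the factors used to fold high-order product
# terms back down, since 2^130 == 5 (mod 2^130-5).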

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	cmp	$padbit,#0
	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

	ldmia	$ctx,{$h0-$r3}		@ load context

	str	$ctx,[sp,#12]		@ offload stuff
	mov	lr,$inp
	str	$len,[sp,#16]
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2

	umull	r2,r3,$h1,$r0
	 adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	 mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	 eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmia	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$h4;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}
.Lpoly1305_emit_enter:

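	@ Final reduction: compute h+5 and, if that sum reaches 2^130
	@ (i.e. h >= 2^130-5), select it instead of h; then add the 128-bit
	@ nonce and store the low 128 bits little-endian.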
	ldmia	$ctx,{$h0-$h4}
	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

#ifdef	__thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

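# NEON register roles, informally: $R0-$R4 and $S1-$S4 hold powers of r (and
# 5*r) in base 2^26, one power per 32-bit lane; $D0-$D4 are the five 2x64-bit
# limb accumulators; $H0-$H4 carry the 2^26-radix limbs of up to four input
# blocks, two blocks per 64-bit register half.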
$code.=<<___;
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[$ctx,#20]		@ load key base 2^32
	ldr	r5,[$ctx,#24]
	ldr	r6,[$ctx,#28]
	ldr	r7,[$ctx,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	$R0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	$R1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	$S1,r2
	vdup.32	$R2,r4
	add	r4,r5,r5,lsl#2
	vdup.32	$S2,r3
	vdup.32	$R3,r5
	add	r5,r6,r6,lsl#2
	vdup.32	$S3,r4
	vdup.32	$R4,r6
	vdup.32	$S4,r5

	mov	$zeros,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
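	@ (the 5* factors fold the limbs of weight 2^130 and above back in,
	@ since 2^130 == 5 mod 2^130-5)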

	vmull.u32	$D0,$R0,${R0}[1]
	vmull.u32	$D1,$R1,${R0}[1]
	vmull.u32	$D2,$R2,${R0}[1]
	vmull.u32	$D3,$R3,${R0}[1]
	vmull.u32	$D4,$R4,${R0}[1]

	vmlal.u32	$D0,$R4,${S1}[1]
	vmlal.u32	$D1,$R0,${R1}[1]
	vmlal.u32	$D2,$R1,${R1}[1]
	vmlal.u32	$D3,$R2,${R1}[1]
	vmlal.u32	$D4,$R3,${R1}[1]

	vmlal.u32	$D0,$R3,${S2}[1]
	vmlal.u32	$D1,$R4,${S2}[1]
	vmlal.u32	$D3,$R1,${R2}[1]
	vmlal.u32	$D2,$R0,${R2}[1]
	vmlal.u32	$D4,$R2,${R2}[1]

	vmlal.u32	$D0,$R2,${S3}[1]
	vmlal.u32	$D3,$R0,${R3}[1]
	vmlal.u32	$D1,$R3,${S3}[1]
	vmlal.u32	$D2,$R4,${S3}[1]
	vmlal.u32	$D4,$R1,${R3}[1]

	vmlal.u32	$D3,$R4,${S4}[1]
	vmlal.u32	$D0,$R1,${S4}[1]
	vmlal.u32	$D1,$R2,${S4}[1]
	vmlal.u32	$D2,$R3,${S4}[1]
	vmlal.u32	$D4,$R0,${R4}[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
	@ an m-bit number multiplied by 2^n is still only n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
	@ is n+2, and so is the sum of four. The sum of 2^m (n-m)-bit
	@ numbers and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0*R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
	@ hashing the input, H0 is limited by (5*4+1)*3 addends, or 58
	@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant?
	@ The vmlal.u32 instruction accepts 2x32-bit input and writes a
	@ 2x64-bit result. This means that the result of reduction has to
	@ be compressed upon loop wrap-around. This can be done in the
	@ process of reduction to minimize the number of instructions [as
	@ well as the number of 128-bit instructions, which benefits
	@ low-end processors], but one has to watch for H2 (which is
	@ narrower than H0) and 5*H4 not being wider than 58 bits, so that
	@ the result of the right shift by 26 bits fits in 32 bits. This
	@ is also useful on x86, because it allows paddd to be used in
	@ place of paddq, which benefits Atom, where paddq is ridiculously
	@ slow.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	 vshr.u64	$T1,$D0,#26
	 vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	 vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	 vshr.u64	$T1,$D1,#26
	 vmovn.i64	$D1#lo,$D1
	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vbic.i32	$D4#lo,#0xfc000000
	 vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	 vshrn.u64	$T1#lo,$D2,#26
	 vmovn.i64	$D2#lo,$D2
	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	 vbic.i32	$D2#lo,#0xfc000000

	vshr.u32	$T0#lo,$D0#lo,#26
	vbic.i32	$D0#lo,#0xfc000000
	 vshr.u32	$T1#lo,$D3#lo,#26
	 vbic.i32	$D3#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4

	subs		$zeros,$zeros,#1
	beq		.Lsquare_break_neon

	add		$tbl0,$ctx,#(48+0*9*4)
	add		$tbl1,$ctx,#(48+1*9*4)

	vtrn.32		$R0,$D0#lo		@ r^2:r^1
	vtrn.32		$R2,$D2#lo
	vtrn.32		$R3,$D3#lo
	vtrn.32		$R1,$D1#lo
	vtrn.32		$R4,$D4#lo

	vshl.u32	$S2,$R2,#2		@ *5
	vshl.u32	$S3,$R3,#2
	vshl.u32	$S1,$R1,#2
	vshl.u32	$S4,$R4,#2
	vadd.i32	$S2,$S2,$R2
	vadd.i32	$S1,$S1,$R1
	vadd.i32	$S3,$S3,$R3
	vadd.i32	$S4,$S4,$R4

	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32		{${S4}[0]},[$tbl0,:32]
	vst1.32		{${S4}[1]},[$tbl1,:32]

	b		.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add		$tbl0,$ctx,#(48+2*4*9)
	add		$tbl1,$ctx,#(48+3*4*9)

	vmov		$R0,$D0#lo		@ r^4:r^3
	vshl.u32	$S1,$D1#lo,#2		@ *5
	vmov		$R1,$D1#lo
	vshl.u32	$S2,$D2#lo,#2
	vmov		$R2,$D2#lo
	vshl.u32	$S3,$D3#lo,#2
	vmov		$R3,$D3#lo
	vshl.u32	$S4,$D4#lo,#2
	vmov		$R4,$D4#lo
	vadd.i32	$S1,$S1,$D1#lo
	vadd.i32	$S2,$S2,$D2#lo
	vadd.i32	$S3,$S3,$D3#lo
	vadd.i32	$S4,$S4,$D4#lo

	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32		{${S4}[0]},[$tbl0]
	vst1.32		{${S4}[1]},[$tbl1]

	ret				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26
	ands	$len,$len,#-16
	beq	.Lno_data_neon

	cmp	$len,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4-r7}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1-r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
	ldr	r5,[$ctx,#4]
	ldr	r6,[$ctx,#8]
	ldr	r7,[$ctx,#12]
	ldr	ip,[$ctx,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	 veor	$D0#lo,$D0#lo,$D0#lo
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	 veor	$D1#lo,$D1#lo,$D1#lo
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	 veor	$D2#lo,$D2#lo,$D2#lo
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	 veor	$D3#lo,$D3#lo,$D3#lo
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	 veor	$D4#lo,$D4#lo,$D4#lo
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[$ctx,#36]		@ is_base2_26

	vmov.32	$D0#lo[0],r2
	vmov.32	$D1#lo[0],r3
	vmov.32	$D2#lo[0],r4
	vmov.32	$D3#lo[0],r5
	vmov.32	$D4#lo[0],r6
	adr	$zeros,.Lzeros

	ldmia	sp!,{r1-r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor		$D0#lo,$D0#lo,$D0#lo
	veor		$D1#lo,$D1#lo,$D1#lo
	veor		$D2#lo,$D2#lo,$D2#lo
	veor		$D3#lo,$D3#lo,$D3#lo
	veor		$D4#lo,$D4#lo,$D4#lo
	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr		$zeros,.Lzeros
	vld1.32		{$D4#lo[0]},[$ctx]
	sub		$ctx,$ctx,#16		@ rewind

.Lbase2_32_neon:
	add		$in2,$inp,#32
	mov		$padbit,$padbit,lsl#24
	tst		$len,#31
	beq		.Leven

	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32		$H4#lo[0],$padbit
	sub		$len,$len,#16
	add		$in2,$inp,#32

# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
	vshl.u32	$H3#lo,$H3#lo,#18

	vsri.u32	$H3#lo,$H2#lo,#14
	vshl.u32	$H2#lo,$H2#lo,#12
	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi

	vbic.i32	$H3#lo,#0xfc000000
	vsri.u32	$H2#lo,$H1#lo,#20
	vshl.u32	$H1#lo,$H1#lo,#6

	vbic.i32	$H2#lo,#0xfc000000
	vsri.u32	$H1#lo,$H0#lo,#26
	vadd.i32	$H3#hi,$H3#lo,$D3#lo

	vbic.i32	$H0#lo,#0xfc000000
	vbic.i32	$H1#lo,#0xfc000000
	vadd.i32	$H2#hi,$H2#lo,$D2#lo

	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo

	mov		$tbl1,$zeros
	add		$tbl0,$ctx,#48

	cmp		$len,$len
	b		.Long_tail

.align	4
.Leven:
	subs		$len,$len,#64
	it		lo
	movlo		$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add		$inp,$inp,#64
	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add		$in2,$in2,#64
	itt		hi
	addhi		$tbl1,$ctx,#(48+1*9*4)
	addhi		$tbl0,$ctx,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vshl.u32	$H3,$H3,#18

	vsri.u32	$H3,$H2,#14
	vshl.u32	$H2,$H2,#12

	vbic.i32	$H3,#0xfc000000
	vsri.u32	$H2,$H1,#20
	vshl.u32	$H1,$H1,#6

	vbic.i32	$H2,#0xfc000000
	vsri.u32	$H1,$H0,#26

	vbic.i32	$H0,#0xfc000000
	vbic.i32	$H1,#0xfc000000

	bls		.Lskip_loop

	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	b		.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction from the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
	vmull.u32	$D2,$H2#hi,${R0}[1]
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,${R0}[1]
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,${R0}[1]
	vmlal.u32	$D2,$H1#hi,${R1}[1]
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,${R0}[1]

	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs		$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
	it		lo
	movlo		$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32		${S4}[1],[$tbl1,:32]
	vmlal.u32	$D1,$H0#hi,${R1}[1]
	vmlal.u32	$D4,$H3#hi,${R1}[1]

	vmlal.u32	$D0,$H3#hi,${S2}[1]
	vmlal.u32	$D3,$H1#hi,${R2}[1]
	vmlal.u32	$D4,$H2#hi,${R2}[1]
	vmlal.u32	$D1,$H4#hi,${S2}[1]
	vmlal.u32	$D2,$H0#hi,${R2}[1]

	vmlal.u32	$D3,$H0#hi,${R3}[1]
	vmlal.u32	$D0,$H2#hi,${S3}[1]
	vmlal.u32	$D4,$H1#hi,${R3}[1]
	vmlal.u32	$D1,$H3#hi,${S3}[1]
	vmlal.u32	$D2,$H4#hi,${S3}[1]

	vmlal.u32	$D3,$H4#hi,${S4}[1]
	vmlal.u32	$D0,$H1#hi,${S4}[1]
	vmlal.u32	$D4,$H0#hi,${R4}[1]
	vmlal.u32	$D1,$H2#hi,${S4}[1]
	vmlal.u32	$D2,$H3#hi,${S4}[1]

	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add		$in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	$D3,$H3#lo,${R0}[0]
	vmlal.u32	$D0,$H0#lo,${R0}[0]
	vmlal.u32	$D4,$H4#lo,${R0}[0]
	vmlal.u32	$D1,$H1#lo,${R0}[0]
	vmlal.u32	$D2,$H2#lo,${R0}[0]
	vld1.32		${S4}[0],[$tbl0,:32]

	vmlal.u32	$D3,$H2#lo,${R1}[0]
	vmlal.u32	$D0,$H4#lo,${S1}[0]
	vmlal.u32	$D4,$H3#lo,${R1}[0]
	vmlal.u32	$D1,$H0#lo,${R1}[0]
	vmlal.u32	$D2,$H1#lo,${R1}[0]

	vmlal.u32	$D3,$H1#lo,${R2}[0]
	vmlal.u32	$D0,$H3#lo,${S2}[0]
	vmlal.u32	$D4,$H2#lo,${R2}[0]
	vmlal.u32	$D1,$H4#lo,${S2}[0]
	vmlal.u32	$D2,$H0#lo,${R2}[0]

	vmlal.u32	$D3,$H0#lo,${R3}[0]
	vmlal.u32	$D0,$H2#lo,${S3}[0]
	vmlal.u32	$D4,$H1#lo,${R3}[0]
	vmlal.u32	$D1,$H3#lo,${S3}[0]
	vmlal.u32	$D3,$H4#lo,${S4}[0]

	vmlal.u32	$D2,$H4#lo,${S3}[0]
	vmlal.u32	$D0,$H1#lo,${S4}[0]
	vmlal.u32	$D4,$H0#lo,${R4}[0]
	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vmlal.u32	$D1,$H2#lo,${S4}[0]
	vmlal.u32	$D2,$H3#lo,${S4}[0]

	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add		$inp,$inp,#64
# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
	vrev32.8	$H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	 vshr.u64	$T1,$D0,#26
	 vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000
	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	  vshl.u32	$H3,$H3,#18
	 vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	 vshr.u64	$T1,$D1,#26
	 vmovn.i64	$D1#lo,$D1
	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	  vsri.u32	$H3,$H2,#14
	vbic.i32	$D4#lo,#0xfc000000
	  vshl.u32	$H2,$H2,#12
	 vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	  vbic.i32	$H3,#0xfc000000
	 vshrn.u64	$T1#lo,$D2,#26
	 vmovn.i64	$D2#lo,$D2
	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
	  vsri.u32	$H2,$H1,#20
	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	  vshl.u32	$H1,$H1,#6
	 vbic.i32	$D2#lo,#0xfc000000
	  vbic.i32	$H2,#0xfc000000

	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
	vmovn.i64	$D0#lo,$D0
	  vsri.u32	$H1,$H0,#26
	  vbic.i32	$H0,#0xfc000000
	 vshr.u32	$T1#lo,$D3#lo,#26
	 vbic.i32	$D3#lo,#0xfc000000
	vbic.i32	$D0#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
	  vbic.i32	$H1,#0xfc000000

	bhi		.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add		$tbl1,$ctx,#(48+0*9*4)
	add		$tbl0,$ctx,#(48+1*9*4)
	adds		$len,$len,#32
	it		ne
	movne		$len,#0
	bne		.Long_tail

	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H3#hi,$H3#lo,$D3#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo
	vadd.i32	$H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
	vmull.u32	$D2,$H2#hi,$R0
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,$R0
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,$R0
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,$R0
	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,$R0

	vmlal.u32	$D0,$H4#hi,$S1
	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#hi,$R1
	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#hi,$R1
	vmlal.u32	$D4,$H3#hi,$R1
	vmlal.u32	$D2,$H1#hi,$R1

	vmlal.u32	$D3,$H1#hi,$R2
	vld1.32		${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#hi,$S2
	vld1.32		${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#hi,$R2
	vmlal.u32	$D1,$H4#hi,$S2
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
	 it		ne
	 addne		$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
	 it		ne
	 addne		$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
	vmlal.u32	$D2,$H4#hi,$S3

	vmlal.u32	$D3,$H4#hi,$S4
	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
	vmlal.u32	$D0,$H1#hi,$S4
	 vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#hi,$R4
	vmlal.u32	$D1,$H2#hi,$S4
	vmlal.u32	$D2,$H3#hi,$S4

	beq		.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4

	vmlal.u32	$D2,$H2#lo,$R0
	vmlal.u32	$D0,$H0#lo,$R0
	vmlal.u32	$D3,$H3#lo,$R0
	vmlal.u32	$D1,$H1#lo,$R0
	vmlal.u32	$D4,$H4#lo,$R0

	vmlal.u32	$D0,$H4#lo,$S1
	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#lo,$R1
	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#lo,$R1
	vmlal.u32	$D4,$H3#lo,$R1
	vmlal.u32	$D2,$H1#lo,$R1

	vmlal.u32	$D3,$H1#lo,$R2
	vld1.32		${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#lo,$S2
	vld1.32		${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#lo,$R2
	vmlal.u32	$D1,$H4#lo,$S2
	vmlal.u32	$D2,$H0#lo,$R2

	vmlal.u32	$D3,$H0#lo,$R3
	vmlal.u32	$D0,$H2#lo,$S3
	vmlal.u32	$D4,$H1#lo,$R3
	vmlal.u32	$D1,$H3#lo,$S3
	vmlal.u32	$D2,$H4#lo,$S3

	vmlal.u32	$D3,$H4#lo,$S4
	 vorn		$MASK,$MASK,$MASK	@ all-ones
	vmlal.u32	$D0,$H1#lo,$S4
	 vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#lo,$R4
	vmlal.u32	$D1,$H2#lo,$S4
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition
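	@ (the two 64-bit lanes of each $D hold the partial sums of the two
	@ interleaved block streams; fold them into a single scalar each)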

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	$T0,$D3,#26
	vand.i64	$D3,$D3,$MASK
	 vshr.u64	$T1,$D0,#26
	 vand.i64	$D0,$D0,$MASK
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1

	vshr.u64	$T0,$D4,#26
	vand.i64	$D4,$D4,$MASK
	 vshr.u64	$T1,$D1,#26
	 vand.i64	$D1,$D1,$MASK
	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2

	vadd.i64	$D0,$D0,$T0
	vshl.u64	$T0,$T0,#2
	 vshr.u64	$T1,$D2,#26
	 vand.i64	$D2,$D2,$MASK
	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3

	vshr.u64	$T0,$D0,#26
	vand.i64	$D0,$D0,$MASK
	 vshr.u64	$T1,$D3,#26
	 vand.i64	$D3,$D3,$MASK
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	cmp		$len,#0
	bne		.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32		{$D4#lo[0]},[$ctx]

	vldmia	sp!,{d8-d15}			@ epilogue
	ldmia	sp!,{r4-r7}
.Lno_data_neon:
	ret					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	stmdb	sp!,{r4-r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	$ctx,{$h0-$h4}
	eor	$g0,$g0,$g0

	adds	$h0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$h1,$h1,lsr#6
	adcs	$h1,$h1,$h2,lsl#20
	mov	$h2,$h2,lsr#12
	adcs	$h2,$h2,$h3,lsl#14
	mov	$h3,$h3,lsr#18
	adcs	$h3,$h3,$h4,lsl#8
	adc	$h4,$g0,$h4,lsr#24	@ can be partially reduced ...

	and	$g0,$h4,#-4		@ ... so reduce
	and	$h4,$h4,#3
	add	$g0,$g0,$g0,lsr#2	@ *= 5
	adds	$h0,$h0,$g0
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?

	it	ne
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
	it	ne
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
	it	ne
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
	it	ne
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0		@ accumulate nonce
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]		@ store the result
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]

	ldmia	sp!,{r4-r11}
	ret				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
___
}	}
$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if	__ARM_MAX_ARCH__>=7
.extern   OPENSSL_armcap_P
.hidden   OPENSSL_armcap_P
#endif
___

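# Post-process the generated code: expand the qN#lo/qN#hi shorthand used
# above into the underlying d-registers (qN maps to d(2N)/d(2N+1)), rewrite
# "ret" as "bx lr", and encode literal "bx lr" instructions as a raw .word
# so that the result can still be assembled with -march=armv4.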
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
