xref: /openssl/crypto/aes/asm/aesv8-armx.pl (revision b6461792)
1#! /usr/bin/env perl
2# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. As does it support both 32- and 64-bit modes
20# of operation. Latter is achieved by limiting amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44#		CBC enc		CBC dec		CTR
45# Apple A7	2.39		1.20		1.20
46# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
47# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
48# Cortex-A72	1.33		0.85/0.88	0.92/0.96
49# Denver	1.96		0.65/0.86	0.76/0.80
50# Mongoose	1.33		1.23/1.20	1.30/1.20
51# Kryo		1.26		0.87/0.94	1.00/1.00
52# ThunderX2	5.95		1.25		1.30
53#
54# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
55#	and are still same even for updated module;
56# (**)	numbers after slash are for 32-bit code, which is 3x-
57#	interleaved;
58
59# $output is the last argument if it looks like a file (it has an extension)
60# $flavour is the first argument if it doesn't look like a file
# Command-line handling: the last argument is taken as the output file when
# it looks like a file name (has an extension); the first argument is the
# flavour (e.g. linux64, win64, linux32) when it does not.
$output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV) : undef;
$flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift(@ARGV) : undef;

# Locate the arm-xlate.pl transliterator, either next to this script or in
# the shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
if    (-f ($xlate = "${dir}arm-xlate.pl"))                { }
elsif (-f ($xlate = "${dir}../../perlasm/arm-xlate.pl"))  { }
else  { die "can't locate arm-xlate.pl"; }

# Pipe everything we print through the transliterator, which rewrites the
# mixed 32-/64-bit mnemonics below for the requested flavour.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT = *OUT;

$prefix = "aes_v8";

# Windows (armasm) spells the byte directive DCB; everything else uses .byte.
$_byte = $flavour =~ /win/ ? "DCB" : ".byte";
# Common file preamble: the whole module is guarded by __ARM_MAX_ARCH__>=7,
# so it compiles away on targets without ARMv8 crypto support.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
# 64-bit flavours only need the crypto extension enabled on top of armv8-a.
$code.=".arch	armv8-a+crypto\n.text\n"		if ($flavour =~ /64/);
# 32-bit flavours: claim armv7-a (keeps older binutils happy) and define an
# INST() macro that emits crypto instructions as raw bytes via $_byte, with
# the operand bytes rearranged for the Thumb-2 encoding when __thumb2__.
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
102#
{{{
# Register map for the set_{en,de}crypt_key routines.  Integer arguments
# follow the AAPCS64 names; $rounds doubles as scratch.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# Vector registers: the 32-bit flavour picks q0-q3,q8-q10 so that q4-q7
# (aliasing callee-saved d8-d15 — see the vstmdb note further down) are
# left untouched.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# .Lrcon holds the round-constant splats (0x01, 0x1b) and, in the middle,
# the byte-permutation mask used by vtbl for the rotate-n-splat step.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit-only prologue; x30 is intentionally saved but never restored (see
# the PAuth comment) — the matching epilogue reloads only x29.
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Body of ${prefix}_set_encrypt_key: validate the arguments ($ptr carries
# the return code: -1 for a NULL pointer, -2 for a bad bit length, 0 on
# success via .Ldone), then branch to the 128-/192-/256-bit expansion loop.
# Each loop stores the expanding schedule through [$out] and records the
# round count (10/12/14) which .Ldone writes to key[240].
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
# set_decrypt_key prologue: unlike set_encrypt_key it makes a nested call
# (bl .Lenc_key), so on 64-bit the link register is signed/saved/validated.
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# Build the encryption schedule in place, then reverse it: walk from both
# ends swapping round keys, applying aesimc (InvMixColumns) to the inner
# keys; the outermost pair is swapped without aesimc.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir) — emit ${prefix}_encrypt or ${prefix}_decrypt: a plain
# single-block routine that walks the key schedule two round keys at a time.
#
# $dir is "en" or "de"; it selects the aese/aesmc vs aesd/aesimc mnemonic
# pair and the symbol suffix.  The generated function reads the round count
# from key[240] and takes (in, out, key) in x0-x2.
#
# NOTE: the original declaration carried an empty prototype, `sub
# gen_block ()`, even though the sub is always called with one argument;
# the calls used `&gen_block(...)` purely to bypass that wrong prototype.
# Dropping the prototype lets the calls use plain syntax with identical
# behavior.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
399
400# Performance in cycles per byte.
401# Processed with AES-ECB different key size.
402# It shows the value before and after optimization as below:
403# (before/after):
404#
405#		AES-128-ECB		AES-192-ECB		AES-256-ECB
406# Cortex-A57	1.85/0.82		2.16/0.96		2.47/1.10
407# Cortex-A72	1.64/0.85		1.82/0.99		2.13/1.14
408
409# Optimization is implemented by loop unrolling and interleaving.
410# Commonly, we choose the unrolling factor as 5, if the input
411# data size smaller than 5 blocks, but not smaller than 3 blocks,
412# choose 3 as the unrolling factor.
413# If the input data size dsize >= 5*16 bytes, then take 5 blocks
414# as one iteration, every loop the left size lsize -= 5*16.
415# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
416# every loop lsize -=3*16.
417# If lsize < 3*16 bytes, treat them as the tail, interleave the
418# two blocks AES instructions.
419# There is one special case, if the original input data size dsize
420# = 16 bytes, we will treat it separately to improve the
421# performance: one independent code block without LR, FP load and
422# store, just looks like what the original ECB implementation does.
423
{{{
# Register map for ${prefix}_ecb_encrypt.
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15	q7 Last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# The 4th/5th interleave lanes only exist on 64-bit, where q16-q23 are
# available; on 32-bit these stay undefined and the 5x path is not emitted.
my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
# AArch64-only fast path for exactly one 16-byte block: fully unrolled,
# no stack frame, with a dedicated .Lecb_128_enc/.Lecb_128_dec shortcut
# when rounds==10.  Anything other than 16 bytes falls through to
# .Lecb_big_size below.
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne    .Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0					// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32			// load key schedule...

	b.eq .Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32			// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// if rounds==10, jump to aes-128-ecb processing
	b.eq    .Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16				// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16				// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt    .Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32			// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// bias
	b.eq    .Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16				// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16				// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt    .Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
# Big-size prologues: 64-bit just saves the frame pair; 32-bit additionally
# saves d8-d15 (callee-saved per the ABI) and picks up the stack-passed args.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}			@ ABI specification says so
	ldmia	ip,{r4-r5}			@ load remaining args
	subs	$len,$len,#16
___
# Common setup for both flavours: preload the first block plus the last
# seven round keys through $key_, then branch to .Lecb_dec for decryption
# or start priming blocks for the interleaved encrypt loops.
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0					// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]				// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4				// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32				// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# AArch64-only 5x-interleaved encryption loop (interleave factor chosen for
# ThunderX2 — see the performance notes at the top of the file), with a
# 4-block tail at .Lecb_enc_tail4x; drops to the 3x loop for what remains.
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32				// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40					// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt			// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6				// $inp is adjusted in such way that
							// at exit from the loop $dat1-$dat4
							// are loaded with last "words"
	add	x6,$len,#0x60		    // because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# 3x-interleaved encryption loop (emitted for both flavours) plus the 1-2
# block tail at .Lecb_enc_tail / .Lecb_enc_one.
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len				// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6			// $inp is adjusted in such way that
						// at exit from the loop $dat1-$dat2
						// are loaded with last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32 {q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

# Decryption entry: prime two/three blocks before the interleaved loops,
# mirroring the encrypt-side setup.
$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
# AArch64-only 5x-interleaved decryption loop: same structure as the
# encrypt side but with aesd/aesimc; 4-block tail at .Lecb_tail4x.
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32				// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40				// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6				// $inp is adjusted in such way that
							// at exit from the loop $dat1-$dat4
							// are loaded with last "words"
	add	x6,$len,#0x60			// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
# 3x-interleaved decryption loop (both flavours) plus the 1-2 block tail;
# falls through into .Lecb_done.
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len				// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6 			// $inp is adjusted in such way that
						// at exit from the loop $dat1-$dat2
						// are loaded with last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
# Big-size epilogues: 32-bit restores d8-d15 and returns via pc; 64-bit
# reloads the frame pointer.  .Lecb_Final_abort is the return point used
# by the frameless single-block fast path.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___	if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
1230{{{
1231my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1232my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1233my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1234
1235my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1236my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1237
1238### q8-q15	preloaded key schedule
1239
1240$code.=<<___;
1241.globl	${prefix}_cbc_encrypt
1242.type	${prefix}_cbc_encrypt,%function
1243.align	5
1244${prefix}_cbc_encrypt:
1245___
1246$code.=<<___	if ($flavour =~ /64/);
1247	AARCH64_VALID_CALL_TARGET
1248	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249	stp	x29,x30,[sp,#-16]!
1250	add	x29,sp,#0
1251___
1252$code.=<<___	if ($flavour !~ /64/);
1253	mov	ip,sp
1254	stmdb	sp!,{r4-r8,lr}
1255	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1256	ldmia	ip,{r4-r5}		@ load remaining args
1257___
$code.=<<___;
	// Common CBC setup: bail out on short input, load IV, first block
	// and the whole key schedule, then dispatch to encrypt or decrypt.
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq		// exactly one block left: stop advancing input

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	// Encryption is inherently serial (each block chains on the
	// previous ciphertext); rounds==2 here means AES-128, which takes
	// the dedicated .Lcbc_enc128 path.
	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	// Dedicated AES-128 encrypt loop: all 10 round keys stay resident.
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
1388{
1389my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1390
1391my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
1392my ($dat4,$in4,$tmp4);
1393if ($flavour =~ /64/) {
1394    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1395}
1396
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

	// 5x interleaved decrypt rounds; cnt counts round-key pairs.
.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	 cmp	$len,#0x40		// because .Lcbc_tail4x
	 sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	 csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	 mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	 add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	// Last round interleaved with preparing the CBC chaining masks
	// (previous ciphertext xor last round key) and next-batch loads.
	 veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	 veor	$tmp1,$in0,$rndlast
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	 veor	$tmp2,$in1,$rndlast
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	 veor	$tmp3,$in2,$rndlast
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	 veor	$tmp4,$in3,$rndlast
	 vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	 vorr	$ivec,$in4,$in4
	 vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	 vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	 vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	 vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	 mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	// Only dat1-dat4 carry real blocks here; tmp0 (not tmp1) holds the
	// chaining mask for the first of them — see the inp adjustment above.
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
	// 3x interleaved decrypt rounds (both 32- and 64-bit flavours).
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

	// One or two blocks left; dat1/dat2 hold the last ciphertext blocks.
.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	// Write the updated IV (last ciphertext block) back to the caller.
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
1741}
1742$code.=<<___	if ($flavour !~ /64/);
1743	vldmia	sp!,{d8-d15}
1744	ldmia	sp!,{r4-r8,pc}
1745___
1746$code.=<<___	if ($flavour =~ /64/);
1747	ldr	x29,[sp],#16
1748	ret
1749___
1750$code.=<<___;
1751.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1752___
1753}}}
1754
1755{{{
1756my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1757my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
1758my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1759my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
1760my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));
1761
1762# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
1763my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
1764my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
1765my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
1766my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
1767my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
1768my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));
1769
1770#q_X => qX, for ldp & stp
1771my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
1772my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));
1773
1774my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));
1775
$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_ctr32_encrypt_blocks_unroll12_eor3
.type	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
.align	5
${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	// Save callee-saved FP regs d8-d15 (AAPCS64) below the frame record.
	stp		x29,x30,[sp,#-80]!
	stp		d8,d9,[sp, #16]
	stp		d10,d11,[sp, #32]
	stp		d12,d13,[sp, #48]
	stp		d14,d15,[sp, #64]
	add		x29,sp,#0

	 ldr		$rounds,[$key,#240]

	 ldr		$ctr, [$ivp, #12]
#ifdef __AARCH64EB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{$rndping-$rndpang},[$key]		// load key schedule...
	 sub		$rounds,$rounds,#4
	 cmp		$len,#2
	 add		$key_,$key,$roundsx,lsl#4	// pointer to last round key
	 sub		$rounds,$rounds,#2
	 add		$key_, $key_, #64
	vld1.32		{$rndlast},[$key_]
	 add		$key_,$key,#32
	 mov		$cnt,$rounds
#ifndef __AARCH64EB__
	rev		$ctr, $ctr
#endif

	// Materialize per-lane counter blocks: copy the IV and patch the
	// big-endian 32-bit counter into lane 3 of each block.
	vorr		$dat1,$dat0,$dat0
	 add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	 add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	 rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail_unroll
	 cmp		$len,#6
	 rev		$tctr2, $ctr
	 sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b.lo		.Loop3x_ctr32_unroll
	 cmp		$len,#9
	vorr		$dat3,$dat0,$dat0
	 add		$tctr3, $ctr, #1
	vorr		$dat4,$dat0,$dat0
	 add		$tctr4, $ctr, #2
	 rev		$tctr3, $tctr3
	vorr		$dat5,$dat0,$dat0
	 add		$ctr, $ctr, #3
	 rev		$tctr4, $tctr4
	vmov.32		${dat3}[3],$tctr3
	 rev		$tctr5, $ctr
	vmov.32		${dat4}[3],$tctr4
	vmov.32		${dat5}[3],$tctr5
	 sub		$len,$len,#3
	b.lo		.Loop6x_ctr32_unroll

	// push regs to stack when 12 data chunks are interleaved
	 stp		x19,x20,[sp,#-16]!
	 stp		x21,x22,[sp,#-16]!
	 stp		x23,x24,[sp,#-16]!
	 stp		$dat8d,$dat9d,[sp,#-32]!
	 stp		$dat10d,$dat11d,[sp,#-32]!

	 add		$tctr6,$ctr,#1
	 add		$tctr7,$ctr,#2
	 add		$tctr8,$ctr,#3
	 add		$tctr9,$ctr,#4
	 add		$tctr10,$ctr,#5
	 add		$ctr,$ctr,#6
	vorr		$dat6,$dat0,$dat0
	 rev		$tctr6,$tctr6
	vorr		$dat7,$dat0,$dat0
	 rev		$tctr7,$tctr7
	vorr		$dat8,$dat0,$dat0
	 rev		$tctr8,$tctr8
	vorr		$dat9,$dat0,$dat0
	 rev		$tctr9,$tctr9
	vorr		$dat10,$dat0,$dat0
	 rev		$tctr10,$tctr10
	vorr		$dat11,$dat0,$dat0
	 rev		$tctr11,$ctr

	 sub		$len,$len,#6		// bias
	vmov.32		${dat6}[3],$tctr6
	vmov.32		${dat7}[3],$tctr7
	vmov.32		${dat8}[3],$tctr8
	vmov.32		${dat9}[3],$tctr9
	vmov.32		${dat10}[3],$tctr10
	vmov.32		${dat11}[3],$tctr11
	b		.Loop12x_ctr32_unroll
1874
.align	4
	// Main loop: 12 counter blocks in flight, two round keys ("ping"
	// and "pang") alternated per iteration; cnt counts key pairs.
.Loop12x_ctr32_unroll:
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	aese		$dat6,$rndping
	aesmc		$dat6,$dat6
	aese		$dat7,$rndping
	aesmc		$dat7,$dat7
	aese		$dat8,$rndping
	aesmc		$dat8,$dat8
	aese		$dat9,$rndping
	aesmc		$dat9,$dat9
	aese		$dat10,$rndping
	aesmc		$dat10,$dat10
	aese		$dat11,$rndping
	aesmc		$dat11,$dat11
	vld1.32		{$rndping},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	aese		$dat3,$rndpang
	aesmc		$dat3,$dat3
	aese		$dat4,$rndpang
	aesmc		$dat4,$dat4
	aese		$dat5,$rndpang
	aesmc		$dat5,$dat5
	aese		$dat6,$rndpang
	aesmc		$dat6,$dat6
	aese		$dat7,$rndpang
	aesmc		$dat7,$dat7
	aese		$dat8,$rndpang
	aesmc		$dat8,$dat8
	aese		$dat9,$rndpang
	aesmc		$dat9,$dat9
	aese		$dat10,$rndpang
	aesmc		$dat10,$dat10
	aese		$dat11,$rndpang
	aesmc		$dat11,$dat11
	vld1.32		{$rndpang},[$key_],#16
	b.gt		.Loop12x_ctr32_unroll

	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	aese		$dat6,$rndping
	aesmc		$dat6,$dat6
	aese		$dat7,$rndping
	aesmc		$dat7,$dat7
	aese		$dat8,$rndping
	aesmc		$dat8,$dat8
	aese		$dat9,$rndping
	aesmc		$dat9,$dat9
	aese		$dat10,$rndping
	aesmc		$dat10,$dat10
	aese		$dat11,$rndping
	aesmc		$dat11,$dat11
	vld1.32	 	{$rndping},[$key_],#16

	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	aese		$dat3,$rndpang
	aesmc		$dat3,$dat3
	aese		$dat4,$rndpang
	aesmc		$dat4,$dat4
	aese		$dat5,$rndpang
	aesmc		$dat5,$dat5
	aese		$dat6,$rndpang
	aesmc		$dat6,$dat6
	aese		$dat7,$rndpang
	aesmc		$dat7,$dat7
	aese		$dat8,$rndpang
	aesmc		$dat8,$dat8
	aese		$dat9,$rndpang
	aesmc		$dat9,$dat9
	aese		$dat10,$rndpang
	aesmc		$dat10,$dat10
	aese		$dat11,$rndpang
	aesmc		$dat11,$dat11
	vld1.32	 	{$rndpang},[$key_],#16

	// Final rounds interleaved with computing the next 12 counters.
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		$tctr3,$ctr,#4
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	 add		$tctr4,$ctr,#5
	 add		$tctr5,$ctr,#6
	 rev		$tctr0,$tctr0
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	 add		$tctr6,$ctr,#7
	 add		$tctr7,$ctr,#8
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	 add		$tctr8,$ctr,#9
	 add		$tctr9,$ctr,#10
	 rev		$tctr3,$tctr3
	 rev		$tctr4,$tctr4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	 add		$tctr10,$ctr,#11
	 add		$tctr11,$ctr,#12
	 rev		$tctr5,$tctr5
	 rev		$tctr6,$tctr6
	aese		$dat6,$rndping
	aesmc		$dat6,$dat6
	 rev		$tctr7,$tctr7
	 rev		$tctr8,$tctr8
	aese		$dat7,$rndping
	aesmc		$dat7,$dat7
	 rev		$tctr9,$tctr9
	 rev		$tctr10,$tctr10
	aese		$dat8,$rndping
	aesmc		$dat8,$dat8
	 rev		$tctr11,$tctr11
	aese		$dat9,$rndping
	aesmc		$dat9,$dat9
	aese		$dat10,$rndping
	aesmc		$dat10,$dat10
	aese		$dat11,$rndping
	aesmc		$dat11,$dat11
	vld1.32	 	{$rndping},[$key_],#16

	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	aese		$dat3,$rndpang
	aesmc		$dat3,$dat3
	vld1.8		{$in0,$in1,$in2,$in3},[$inp],#64
	aese		$dat4,$rndpang
	aesmc		$dat4,$dat4
	aese		$dat5,$rndpang
	aesmc		$dat5,$dat5
	aese		$dat6,$rndpang
	aesmc		$dat6,$dat6
	aese		$dat7,$rndpang
	aesmc		$dat7,$dat7
	vld1.8		{$in4,$in5,$in6,$in7},[$inp],#64
	aese		$dat8,$rndpang
	aesmc		$dat8,$dat8
	aese		$dat9,$rndpang
	aesmc		$dat9,$dat9
	aese		$dat10,$rndpang
	aesmc		$dat10,$dat10
	aese		$dat11,$rndpang
	aesmc		$dat11,$dat11
	vld1.8		{$in8,$in9,$in10,$in11},[$inp],#64
	vld1.32	 	{$rndpang},[$key_],#16

	 mov		$key_, $key
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	aese		$dat6,$rndping
	aesmc		$dat6,$dat6
	aese		$dat7,$rndping
	aesmc		$dat7,$dat7
	aese		$dat8,$rndping
	aesmc		$dat8,$dat8
	aese		$dat9,$rndping
	aesmc		$dat9,$dat9
	aese		$dat10,$rndping
	aesmc		$dat10,$dat10
	aese		$dat11,$rndping
	aesmc		$dat11,$dat11
	vld1.32	 	{$rndping},[$key_],#16	// re-pre-load rndkey[0]

	// Last AESE plus three-way XOR (keystream ^ last-round-key ^ input)
	// done in one EOR3 per lane; lanes are immediately reseeded from ivec.
	aese		$dat0,$rndpang
	 eor3		$in0,$in0,$rndlast,$dat0
	vorr		$dat0,$ivec,$ivec
	aese		$dat1,$rndpang
	 eor3		$in1,$in1,$rndlast,$dat1
	vorr		$dat1,$ivec,$ivec
	aese		$dat2,$rndpang
	 eor3		$in2,$in2,$rndlast,$dat2
	vorr		$dat2,$ivec,$ivec
	aese		$dat3,$rndpang
	 eor3		$in3,$in3,$rndlast,$dat3
	vorr		$dat3,$ivec,$ivec
	aese		$dat4,$rndpang
	 eor3		$in4,$in4,$rndlast,$dat4
	vorr		$dat4,$ivec,$ivec
	aese		$dat5,$rndpang
	 eor3		$in5,$in5,$rndlast,$dat5
	vorr		$dat5,$ivec,$ivec
	aese		$dat6,$rndpang
	 eor3		$in6,$in6,$rndlast,$dat6
	vorr		$dat6,$ivec,$ivec
	aese		$dat7,$rndpang
	 eor3		$in7,$in7,$rndlast,$dat7
	vorr		$dat7,$ivec,$ivec
	aese		$dat8,$rndpang
	 eor3		$in8,$in8,$rndlast,$dat8
	vorr		$dat8,$ivec,$ivec
	aese		$dat9,$rndpang
	 eor3		$in9,$in9,$rndlast,$dat9
	vorr		$dat9,$ivec,$ivec
	aese		$dat10,$rndpang
	 eor3		$in10,$in10,$rndlast,$dat10
	vorr		$dat10,$ivec,$ivec
	aese		$dat11,$rndpang
	 eor3		$in11,$in11,$rndlast,$dat11
	vorr		$dat11,$ivec,$ivec
	vld1.32	 	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]

	vmov.32		${dat0}[3],$tctr0
	vmov.32		${dat1}[3],$tctr1
	vmov.32		${dat2}[3],$tctr2
	vmov.32		${dat3}[3],$tctr3
	vst1.8		{$in0,$in1,$in2,$in3},[$out],#64
	vmov.32		${dat4}[3],$tctr4
	vmov.32		${dat5}[3],$tctr5
	vmov.32		${dat6}[3],$tctr6
	vmov.32		${dat7}[3],$tctr7
	vst1.8		{$in4,$in5,$in6,$in7},[$out],#64
	vmov.32		${dat8}[3],$tctr8
	vmov.32		${dat9}[3],$tctr9
	vmov.32		${dat10}[3],$tctr10
	vmov.32		${dat11}[3],$tctr11
	vst1.8		{$in8,$in9,$in10,$in11},[$out],#64

	 mov		$cnt,$rounds

	 add		$ctr,$ctr,#12
	subs		$len,$len,#12
	b.hs		.Loop12x_ctr32_unroll
2147
	// pop regs from stack when 12 data chunks are interleaved
	 ldp		$dat10d,$dat11d,[sp],#32
	 ldp		$dat8d,$dat9d,[sp],#32
	 ldp		x23,x24,[sp],#16
	 ldp		x21,x22,[sp],#16
	 ldp		x19,x20,[sp],#16

	 add		$len,$len,#12
	 cbz		$len,.Lctr32_done_unroll
	 sub		$ctr,$ctr,#12

	 cmp		$len,#2
	b.ls		.Lctr32_tail_unroll

	// Dispatch the 3..11 remaining blocks; the b.lo branches below test
	// flags still set by "cmp len,#6" (the sub/add do not set flags),
	// mirroring the biased setup path before the 12x loop. NOTE(review):
	// the final b.lo is therefore never taken — control falls through
	// into .Loop6x_ctr32_unroll either way; confirm against upstream.
	 cmp		$len,#6
	 sub		$len,$len,#3		// bias
	 add		$ctr,$ctr,#3
	b.lo		.Loop3x_ctr32_unroll

	 sub		$len,$len,#3
	 add		$ctr,$ctr,#3
	b.lo		.Loop6x_ctr32_unroll
.align	4
	// 6-way interleaved CTR loop (same ping/pang key alternation as 12x).
.Loop6x_ctr32_unroll:
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	vld1.32		{$rndping},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	aese		$dat3,$rndpang
	aesmc		$dat3,$dat3
	aese		$dat4,$rndpang
	aesmc		$dat4,$dat4
	aese		$dat5,$rndpang
	aesmc		$dat5,$dat5
	vld1.32		{$rndpang},[$key_],#16
	b.gt		.Loop6x_ctr32_unroll

	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	vld1.32	 	{$rndping},[$key_],#16

	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	aese		$dat3,$rndpang
	aesmc		$dat3,$dat3
	aese		$dat4,$rndpang
	aesmc		$dat4,$dat4
	aese		$dat5,$rndpang
	aesmc		$dat5,$dat5
	vld1.32	 	{$rndpang},[$key_],#16

	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		$tctr3,$ctr,#4
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	 add		$tctr4,$ctr,#5
	 add		$tctr5,$ctr,#6
	 rev		$tctr0,$tctr0
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	 rev		$tctr3,$tctr3
	 rev		$tctr4,$tctr4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	 rev		$tctr5,$tctr5
	vld1.32	 	{$rndping},[$key_],#16

	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	vld1.8		{$in0,$in1,$in2,$in3},[$inp],#64
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	aese		$dat3,$rndpang
	aesmc		$dat3,$dat3
	vld1.8		{$in4,$in5},[$inp],#32
	aese		$dat4,$rndpang
	aesmc		$dat4,$dat4
	aese		$dat5,$rndpang
	aesmc		$dat5,$dat5
	vld1.32	 	{$rndpang},[$key_],#16

	 mov		$key_, $key
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	aese		$dat3,$rndping
	aesmc		$dat3,$dat3
	aese		$dat4,$rndping
	aesmc		$dat4,$dat4
	aese		$dat5,$rndping
	aesmc		$dat5,$dat5
	vld1.32	 	{$rndping},[$key_],#16	// re-pre-load rndkey[0]

	aese		$dat0,$rndpang
	 eor3		$in0,$in0,$rndlast,$dat0
	aese		$dat1,$rndpang
	 eor3		$in1,$in1,$rndlast,$dat1
	aese		$dat2,$rndpang
	 eor3		$in2,$in2,$rndlast,$dat2
	aese		$dat3,$rndpang
	 eor3		$in3,$in3,$rndlast,$dat3
	aese		$dat4,$rndpang
	 eor3		$in4,$in4,$rndlast,$dat4
	aese		$dat5,$rndpang
	 eor3		$in5,$in5,$rndlast,$dat5
	vld1.32	 	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]

	vorr		$dat0,$ivec,$ivec
	vorr		$dat1,$ivec,$ivec
	vorr		$dat2,$ivec,$ivec
	vorr		$dat3,$ivec,$ivec
	vorr		$dat4,$ivec,$ivec
	vorr		$dat5,$ivec,$ivec

	vmov.32		${dat0}[3],$tctr0
	vmov.32		${dat1}[3],$tctr1
	vst1.8		{$in0,$in1,$in2,$in3},[$out],#64
	vmov.32		${dat2}[3],$tctr2
	vmov.32		${dat3}[3],$tctr3
	vst1.8		{$in4,$in5},[$out],#32
	vmov.32		${dat4}[3],$tctr4
	vmov.32		${dat5}[3],$tctr5

	 cbz		$len,.Lctr32_done_unroll
	 mov		$cnt,$rounds

	 cmp		$len,#2
	b.ls		.Lctr32_tail_unroll

	 sub		$len,$len,#3		// bias
	 add		$ctr,$ctr,#3
	 b		.Loop3x_ctr32_unroll
2326
.align	4
	// 3-way interleaved CTR loop; counters for the next trip are patched
	// into dat0-dat2 while tmp0-tmp2 finish the current trip's rounds.
.Loop3x_ctr32_unroll:
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	vld1.32		{$rndping},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	aese		$dat2,$rndpang
	aesmc		$dat2,$dat2
	vld1.32		{$rndpang},[$key_],#16
	b.gt		.Loop3x_ctr32_unroll

	aese		$dat0,$rndping
	aesmc		$tmp0,$dat0
	aese		$dat1,$rndping
	aesmc		$tmp1,$dat1
	vld1.8		{$in0,$in1,$in2},[$inp],#48
	vorr		$dat0,$ivec,$ivec
	aese		$dat2,$rndping
	aesmc		$dat2,$dat2
	vld1.32		{$rndping},[$key_],#16
	vorr		$dat1,$ivec,$ivec
	aese		$tmp0,$rndpang
	aesmc		$tmp0,$tmp0
	aese		$tmp1,$rndpang
	aesmc		$tmp1,$tmp1
	aese		$dat2,$rndpang
	aesmc		$tmp2,$dat2
	vld1.32		{$rndpang},[$key_],#16
	vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,$rndping
	aesmc		$tmp0,$tmp0
	aese		$tmp1,$rndping
	aesmc		$tmp1,$tmp1
	 add		$tctr1,$ctr,#2
	aese		$tmp2,$rndping
	aesmc		$tmp2,$tmp2
	vld1.32		{$rndping},[$key_],#16
	 add		$ctr,$ctr,#3
	aese		$tmp0,$rndpang
	aesmc		$tmp0,$tmp0
	aese		$tmp1,$rndpang
	aesmc		$tmp1,$tmp1

	 rev		$tctr0,$tctr0
	aese		$tmp2,$rndpang
	aesmc		$tmp2,$tmp2
	vld1.32		{$rndpang},[$key_],#16
	vmov.32		${dat0}[3], $tctr0
	 mov		$key_,$key
	 rev		$tctr1,$tctr1
	aese		$tmp0,$rndping
	aesmc		$tmp0,$tmp0

	aese		$tmp1,$rndping
	aesmc		$tmp1,$tmp1
	vmov.32		${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,$rndping
	aesmc		$tmp2,$tmp2
	vmov.32		${dat2}[3], $tctr2

	aese		$tmp0,$rndpang
	aese		$tmp1,$rndpang
	aese		$tmp2,$rndpang

	 eor3		$in0,$in0,$rndlast,$tmp0
	vld1.32		{$rndping},[$key_],#16	// re-pre-load rndkey[0]
	 eor3		$in1,$in1,$rndlast,$tmp1
	 mov		$cnt,$rounds
	 eor3		$in2,$in2,$rndlast,$tmp2
	vld1.32		{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in0,$in1,$in2},[$out],#48

	 cbz		$len,.Lctr32_done_unroll
2410
	// Tail: one or two blocks remain (len is 1 or 2 here).
.Lctr32_tail_unroll:
	 cmp		$len,#1
	b.eq		.Lctr32_tail_1_unroll

.Lctr32_tail_2_unroll:
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	vld1.32		{$rndping},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	vld1.32		{$rndpang},[$key_],#16
	b.gt		.Lctr32_tail_2_unroll

	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	vld1.32		{$rndping},[$key_],#16
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	vld1.32		{$rndpang},[$key_],#16
	vld1.8		{$in0,$in1},[$inp],#32
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	vld1.32		{$rndping},[$key_],#16
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	aese		$dat1,$rndpang
	aesmc		$dat1,$dat1
	vld1.32		{$rndpang},[$key_],#16
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat1,$rndping
	aesmc		$dat1,$dat1
	aese		$dat0,$rndpang
	aese		$dat1,$rndpang

	 eor3		$in0,$in0,$rndlast,$dat0
	 eor3		$in1,$in1,$rndlast,$dat1
	vst1.8		{$in0,$in1},[$out],#32
	 b		.Lctr32_done_unroll
2461
	// Tail: exactly one block remains.
.Lctr32_tail_1_unroll:
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	vld1.32		{$rndping},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	vld1.32		{$rndpang},[$key_],#16
	b.gt		.Lctr32_tail_1_unroll

	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	vld1.32		{$rndping},[$key_],#16
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	vld1.32		{$rndpang},[$key_],#16
	vld1.8		{$in0},[$inp]
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	vld1.32		{$rndping},[$key_],#16
	aese		$dat0,$rndpang
	aesmc		$dat0,$dat0
	vld1.32		{$rndpang},[$key_],#16
	aese		$dat0,$rndping
	aesmc		$dat0,$dat0
	aese		$dat0,$rndpang

	 eor3		$in0,$in0,$rndlast,$dat0
	vst1.8		{$in0},[$out],#16

.Lctr32_done_unroll:
	// Restore the callee-saved FP registers saved by the prologue
	// (stp d8,d9 / d10,d11 / d12,d13 / d14,d15 at sp+16..sp+64).
	ldp		d8,d9,[sp, #16]
	ldp		d10,d11,[sp, #32]
	ldp		d12,d13,[sp, #48]
	ldp		d14,d15,[sp, #64]	// was "ldp d15,d16": that restored
						// the wrong pair — d15 got the saved
						// d14 value, d14 stayed clobbered and
						// non-callee-saved d16 was loaded,
						// violating AAPCS64 (d8-d15 must be
						// preserved across the call)
	ldr		x29,[sp],#80
	ret
.size	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
___
# end of the unroll12/EOR3 CTR32 section (AArch64-only)
}}}
2502
2503{{{
2504my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
2505my ($rounds,$cnt,$key_)=("w5","w6","x7");
2506my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
2507my $step="x12";		# aliases with $tctr2
2508
2509my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
2510my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2511
2512# used only in 64-bit mode...
2513my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
2514
2515my ($dat,$tmp)=($dat0,$tmp0);
2516
2517### q8-q15	preloaded key schedule
2518
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
___
# Prepare up to three counter blocks ($dat0..$dat2) and keep a pristine
# copy of the IV in $ivec.  The 32-bit path stages the rev'd counters
# through $ivec because it has fewer free vector registers.
$code.=<<___	if ($flavour =~ /64/);
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
___
# 64-bit only: 5x-unrolled main loop.  Needs q16-q23 ($dat3/$dat4/$in3/$in4),
# which do not exist in the 32-bit register file.
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#32
	b.lo		.Loop3x_ctr32

	add		w13,$ctr,#1
	add		w14,$ctr,#2
	vorr		$dat3,$dat0,$dat0
	rev		w13,w13
	vorr		$dat4,$dat0,$dat0
	rev		w14,w14
	vmov.32		${dat3}[3],w13
	sub		$len,$len,#2		// bias
	vmov.32		${dat4}[3],w14
	add		$ctr,$ctr,#2
	b		.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop5x_ctr32

	// Two more rounds with the just-loaded keys, then finish with the
	// preloaded last five round keys q12-q15,$rndlast.
	mov		$key_,$key
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32	 	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32	 	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese		$dat0,q12
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		w13,$ctr,#4
	aese		$dat2,q12
	aesmc		$dat2,$dat2
	 add		w14,$ctr,#5
	 rev		$tctr0,$tctr0
	aese		$dat3,q12
	aesmc		$dat3,$dat3
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,q12
	aesmc		$dat4,$dat4
	 rev		w13,w13
	 rev		w14,w14

	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	aese		$dat2,q13
	aesmc		$dat2,$dat2
	aese		$dat3,q13
	aesmc		$dat3,$dat3
	aese		$dat4,q13
	aesmc		$dat4,$dat4

	aese		$dat0,q14
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat2,q14
	aesmc		$dat2,$dat2
	 vld1.8		{$in2},[$inp],#16
	aese		$dat3,q14
	aesmc		$dat3,$dat3
	 vld1.8		{$in3},[$inp],#16
	aese		$dat4,q14
	aesmc		$dat4,$dat4
	 vld1.8		{$in4},[$inp],#16

	aese		$dat0,q15
	 veor		$in0,$in0,$rndlast
	aese		$dat1,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat2,q15
	 veor		$in2,$in2,$rndlast
	aese		$dat3,q15
	 veor		$in3,$in3,$rndlast
	aese		$dat4,q15
	 veor		$in4,$in4,$rndlast

	// in ^= rndlast ^ aese-chain == full AES(counter); xor with plaintext.
	veor		$in0,$in0,$dat0
	 vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	 vorr		$dat1,$ivec,$ivec
	veor		$in2,$in2,$dat2
	 vorr		$dat2,$ivec,$ivec
	veor		$in3,$in3,$dat3
	 vorr		$dat3,$ivec,$ivec
	veor		$in4,$in4,$dat4
	 vorr		$dat4,$ivec,$ivec

	vst1.8		{$in0},[$out],#16
	 vmov.32	${dat0}[3],$tctr0
	vst1.8		{$in1},[$out],#16
	 vmov.32	${dat1}[3],$tctr1
	vst1.8		{$in2},[$out],#16
	 vmov.32	${dat2}[3],$tctr2
	vst1.8		{$in3},[$out],#16
	 vmov.32	${dat3}[3],w13
	vst1.8		{$in4},[$out],#16
	 vmov.32	${dat4}[3],w14

	mov		$cnt,$rounds
	cbz		$len,.Lctr32_done

	add		$ctr,$ctr,#5
	subs		$len,$len,#5
	b.hs		.Loop5x_ctr32

	add		$len,$len,#5
	sub		$ctr,$ctr,#5

	cmp		$len,#2
	mov		$step,#16
	cclr		$step,lo
	b.ls		.Lctr32_tail

	sub		$len,$len,#3		// bias
	add		$ctr,$ctr,#3
___
# 3x-interleaved loop shared by both flavours.  The 32-bit path threads new
# counter values through $ivec (fewer spare vector registers), the 64-bit
# path writes them into $dat0-$dat2 directly.
$code.=<<___;
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 rev		$tctr0,$tctr0
___
$code.=<<___;
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___;
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
___
$code.=<<___;
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
___
$code.=<<___;
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq
2892
.Lctr32_tail:
	// Tail: one or two remaining blocks.  $step is 0 when only one block
	// is left, so $in1 re-reads the same input and its store is skipped.
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
2955# Performance in cycles per byte.
2956# Processed with AES-XTS different key size.
2957# It shows the value before and after optimization as below:
2958# (before/after):
2959#
2960#		AES-128-XTS		AES-256-XTS
2961# Cortex-A57	3.36/1.09		4.02/1.37
2962# Cortex-A72	3.03/1.02		3.28/1.33
2963
# Optimization is implemented by loop unrolling and interleaving.
# Commonly we choose an unrolling factor of 5; if the input
# data size is smaller than 5 blocks, but not smaller than 3 blocks,
# we choose 3 as the unrolling factor.
2968# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2969# as one iteration, every loop the left size lsize -= 5*16.
2970# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2971# will be processed specially, which be integrated into the 5*16 bytes
2972# loop to improve the efficiency.
2973# There is one special case, if the original input data size dsize
2974# = 16 bytes, we will treat it separately to improve the
2975# performance: one independent code block without LR, FP load and
2976# store.
# Encryption will process the (length - tailcnt) bytes as mentioned
# previously, then encrypt the composite block as the second-to-last
# cipher block.
# Decryption will process the (length - tailcnt - 1) bytes as mentioned
# previously, then decrypt the second-to-last cipher block to get the
# last plain block (the tail), and decrypt the composite block as the
# second-to-last plain-text block.
2984
2985{{{
2986my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2987my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2988my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2989my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2990my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2991my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2992my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2993my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2994my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2995
2996my ($tmpin)=("v26.16b");
2997my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2998
2999# q7	last round key
3000# q10-q15, q7	Last 7 round keys
3001# q8-q9	preloaded round keys except last 7 keys for big size
3002# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
3003
3004
3005my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3006
3007my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
3008my ($dat4,$in4,$tmp4);
3009if ($flavour =~ /64/) {
3010    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3011}
3012
$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_encrypt
.type	${prefix}_xts_encrypt,%function
.align	5
${prefix}_xts_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_enc_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_enc_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// Single-block fast path: no stack frame, no cipher stealing.
	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aese	$dat0,q20
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aese	$dat0,q21
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$dat0,$iv0
	vst1.8	{$dat0},[$out]
	b	.Lxts_enc_final_abort

.align	4
.Lxts_enc_big_size:
___
# Big-size path: save callee-saved registers, encrypt the IV with key2 to
# form the first tweak, derive the second tweak, and preload the key1
# schedule (q8-q9 plus the last seven round keys q10-q15,$rndlast).
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	// tailcnt store the tail value of length%16.
	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_abort
	csel	$step,xzr,$step,eq

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl- iv(low), $ivh - iv(high)
	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// next starting point
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key1,#32
	mov	$rounds,$rounds0

	// Encryption
.Lxts_enc:
	// Load up to five blocks, xor each with its tweak, and compute the
	// next tweaks via the GF(2^128) x*alpha doubling in $ivl:$ivh.
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_enc_tail
	veor	$dat,$dat,$iv0			// before encryption, xor with iv
	veor	$dat2,$dat2,$iv1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh


	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2 		// the third block
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	vld1.8	{$dat3},[$inp],#16
	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_enc

3217
.align	4
.Loop5x_xts_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_xts_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	// Last round interleaved with tweak doubling for the next iteration.
	veor	$tmp0,$rndlast,$iv0
	aese	$dat0,q15
	// The iv for first block of one iteration
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_enc_tail4x
	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_enc


	// If left 4 blocks, borrow the five block's processing.
	cmn	$len,#0x10
	b.ne	.Loop5x_enc_after
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_enc_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_enc_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_enc_tail

3431
.align	4
.Lxts_enc_tail4x:
	// Exactly-four-blocks exit borrowed from the 5x loop: block 0 was a
	// dummy re-read, so only blocks 1-4 are finalized and stored.
	add	$inp,$inp,#16
	veor	$tmp1,$dat1,$tmp1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b	.Lxts_enc_done
.align	4
.Lxts_outer_enc_tail:
	// Three-block tail: the last round keys q12-q15,$rndlast are still
	// preloaded, so only q8/q9 cycle through memory here.
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_enc_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	//mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo       // x6, w6, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset
	mov	$key_,$key1

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	add	$rounds,$rounds0,#2
	vld1.32	{q8},[$key_],#16                // re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16                // re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	cmn	$len,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop
3532
.Lxts_inner_enc_tail:
	// One- or two-block tail.  cmn $len,#0x10 distinguishes the cases;
	// for a single block $dat2 carries it with tweak $iv0.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_enc_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_enc_tail_loop:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_enc_tail_loop

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lxts_enc_one
	veor	$tmp1,$tmp1,$dat1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv1,$iv1
	vst1.8	{$tmp2},[$out],#16
	fmov	$ivl,$ivd10
	fmov	$ivh,$ivd11
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done

.Lxts_enc_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv0,$iv0
	vst1.8	{$tmp1},[$out],#16
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	$tailcnt,#0xf
	b.eq	.Lxts_abort

	// Swap the tail plaintext bytes with the head of the last full
	// ciphertext block (classic XTS ciphertext stealing), byte by byte.
	mov	$tmpinp,$inp
	mov	$tmpoutp,$out
	sub	$out,$out,#16
.composite_enc_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Encrypt the composite block to get the last second encrypted text block
	ldr	$rounds,[$key1,#240]		// load key schedule...
	vld1.32	{$dat},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
.Loop_final_enc:
	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16
	subs	$rounds,$rounds,#2
	aese	$tmpin,$dat1
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16
	b.gt	.Loop_final_enc

	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aese	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
___

}}}
3663{{{
3664my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
3665my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
3666my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
3667my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
3668my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
3669my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
3670my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
3671my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
3672my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
3673
3674my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
3675
3676# q7	last round key
3677# q10-q15, q7	Last 7 round keys
3678# q8-q9	preloaded round keys except last 7 keys for big size
3679# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
3680
{
# The 32-bit flavour has only q0-q15, so the third data lane reuses q9-q11;
# the 64-bit flavour remaps lane 3 and adds lanes 4/5 (q16-q23) below.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

# Entry point: ${prefix}_xts_decrypt(inp, out, len, key1, key2, ivp).
# Emitted for the 64-bit flavour only (both heredocs below are guarded).
$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
___
# Single-block fast path (len == 16): derive the tweak by encrypting the IV
# with key2, XOR it in, run the full key1 decryption schedule on the one
# block, XOR the tweak again and store.  Anything else branches to the
# bulk path at .Lxts_dec_big_size (label emitted at the end of this heredoc).
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32			// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32			// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16			// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16			// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32			// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32			// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32			// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
# Bulk path (len > 16): save callee-saved state, split the length into whole
# 16-byte blocks plus a tail ($tailcnt) for ciphertext stealing, encrypt the
# IV with key2 to get tweak #1, then derive successive tweaks with the XTS
# update (64-bit shift-left-by-one across the halves, conditional XOR of the
# low byte with 0x87).  The main loop decrypts five blocks per iteration with
# the key1 schedule interleaved; 3-block and 2/1-block tails follow, and
# .Lxts_done handles the final partial block via ciphertext stealing.
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl- iv(low), $ivh - iv(high)
	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decryt, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// third block xox with third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub $len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec

.align	4
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

	aesd	$dat0,q9
	aesimc	$dat0,$dat
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration.
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec

	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If x2($len) equal to -0x10, the left blocks is 4.
	// After specially processing, utilize the five blocks processing again.
	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	tst	$tailcnt,#0xf
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b.eq	.Lxts_dec_abort
	vld1.8	{$dat0},[$inp],#16
	b	.Lxts_done
.align	4
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data

	mov	$key_,$key1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32

.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Processing the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	vld1.8	{$dat0},[$inp],#16

	// Decrypt the last second block to get the last plain text block
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16

	// Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
	// to get the last encrypted block.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the last second plain text block
	ldr	$rounds,[$key_,#240]
	vld1.32	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
# end of the 64-bit-only XTS-decrypt generator scope
}
}}}
# Emit the closing "#endif" for the preprocessor guard opened earlier in the
# file (the opening #if is outside this chunk).
$code.=<<___;
#endif
___
########################################
# Second pass: translate the unified perlasm in $code into real assembly,
# AArch64 or ARMv7 depending on $flavour, and print it to STDOUT.
if ($flavour =~ /64/) {			######## 64-bit code
    # AArch64 base encodings for the Crypto-extension AES instructions and
    # the SHA3 "eor3"; used by the helpers below to emit raw .inst words.
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800,
	"eor3"	=>	0xce000000,	);
4394
    # Encode a two-operand AES instruction as a raw ".inst" word for
    # assemblers without the crypto mnemonics: opcode | Rd | (Rn << 5),
    # register numbers scraped from the operand string.
    # NOTE(review): the substitution that invoked this is commented out in
    # the 64-bit translation loop below, so it is currently unused there.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
4403
    # Encode an SHA3 three-operand instruction ("eor3") as a raw ".inst"
    # word for toolchains lacking SHA3 support:
    # opcode | Rd | (Rn << 5) | (Rm << 16) | (Ra << 10).
    sub unsha3 {
		 my ($mnemonic,$arg)=@_;

		 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
		 &&
		 sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
			$mnemonic,$arg;
    }
4413
    # Per-line translation to AArch64.  The substitutions are chained with
    # "or" so at most one of each group fires; their ORDER is significant.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand `...` compile-time expressions

	s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo;	# old->new registers
	s/\bq_([0-9]+)\b/"q".$1/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	# conditional ops and legacy mnemonics; first match wins in the chain
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;
	s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;	# encode eor3 as .inst

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    # ARMv7/NEON instruction words for the AES extensions, consumed by the
    # unaes() helper below to emit INST() byte sequences.
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
4449
    # Encode an AES instruction for ARMv7 assemblers without the crypto
    # extensions.  The q-register numbers are split across the encoding's
    # separate field/high-bit positions for Vd and Vm.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # Emit as four explicit bytes via the INST() macro,
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
4465
4466    sub unvtbl {
4467	my $arg=shift;
4468
4469	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
4470	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
4471		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
4472    }
4473
4474    sub unvdup32 {
4475	my $arg=shift;
4476
4477	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
4478	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
4479    }
4480
4481    sub unvmov32 {
4482	my $arg=shift;
4483
4484	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
4485	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
4486    }
4487
    # Per-line translation to ARMv7/NEON syntax; substitution ORDER matters
    # and the "or" chains ensure at most one rule of each group fires.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand `...` compile-time expressions

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	# expand helper-encoded instructions and pseudo-ops
	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# conditional mov: emit an "it" predicate first (Thumb-2 requirement)
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}
4514
# Fail loudly if the generated assembly could not be fully flushed; a
# silently truncated output file would otherwise go unnoticed by the build.
close STDOUT or die "error closing STDOUT: $!";
4516