xref: /openssl/crypto/aes/asm/aesv8-armx.pl (revision 5261f3ca)
1#! /usr/bin/env perl
2# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for ARMv8 AES instructions. The
18# module is endian-agnostic in sense that it supports both big- and
19# little-endian cases. It also supports both 32- and 64-bit modes
20# of operation. The latter is achieved by limiting the amount of utilized
21# registers to 16, which implies additional NEON load and integer
22# instructions. This has no effect on mighty Apple A7, where results
23# are literally equal to the theoretical estimates based on AES
24# instruction latencies and issue rates. On Cortex-A53, an in-order
25# execution core, this costs up to 10-15%, which is partially
26# compensated by implementing dedicated code path for 128-bit
27# CBC encrypt case. On Cortex-A57 parallelizable mode performance
28# seems to be limited by sheer amount of NEON instructions...
29#
30# April 2019
31#
32# Key to performance of parallelize-able modes is round instruction
33# interleaving. But which factor to use? There is optimal one for
34# each combination of instruction latency and issue rate, beyond
35# which increasing interleave factor doesn't pay off. While on cons
36# side we have code size increase and resource waste on platforms for
37# which interleave factor is too high. In other words you want it to
38# be just right. So far interleave factor of 3x was serving well all
39# platforms. But for ThunderX2 optimal interleave factor was measured
40# to be 5x...
41#
42# Performance in cycles per byte processed with 128-bit key:
43#
44#		CBC enc		CBC dec		CTR
45# Apple A7	2.39		1.20		1.20
46# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
47# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
48# Cortex-A72	1.33		0.85/0.88	0.92/0.96
49# Denver	1.96		0.65/0.86	0.76/0.80
50# Mongoose	1.33		1.23/1.20	1.30/1.20
51# Kryo		1.26		0.87/0.94	1.00/1.00
52# ThunderX2	5.95		1.25		1.30
53#
54# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
55#	and are still same even for updated module;
56# (**)	numbers after slash are for 32-bit code, which is 3x-
57#	interleaved;
58
59# $output is the last argument if it looks like a file (it has an extension)
60# $flavour is the first argument if it doesn't look like a file
61$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
62$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
63
64$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
66( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
67die "can't locate arm-xlate.pl";
68
69open OUT,"| \"$^X\" $xlate $flavour \"$output\""
70    or die "can't call $xlate: $!";
71*STDOUT=*OUT;
72
73$prefix="aes_v8";
74
75$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
76
77$code=<<___;
78#include "arm_arch.h"
79
80#if __ARM_MAX_ARCH__>=7
81___
82$code.=".arch	armv8-a+crypto\n.text\n"		if ($flavour =~ /64/);
83$code.=<<___						if ($flavour !~ /64/);
84.arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
85.fpu	neon
86#ifdef	__thumb2__
87.syntax	unified
88.thumb
89# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
90#else
91.code	32
92# define INST(a,b,c,d)	$_byte	a,b,c,d
93#endif
94
95.text
96___
97
98# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100# maintain both 32- and 64-bit codes within single module and
101# transliterate common code to either flavour with regex voodoo.
102#
103{{{
104my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
105my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
107
108
109#
110# This file generates .s file for 64-bit and 32-bit CPUs.
111# We don't implement .rodata on 32-bit CPUs yet.
112#
113$code.=".rodata\n"	if ($flavour =~ /64/);
114$code.=<<___;
115.align	5
116.Lrcon:
117.long	0x01,0x01,0x01,0x01
118.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
119.long	0x1b,0x1b,0x1b,0x1b
120___
121$code.=".previous\n"	if ($flavour =~ /64/);
122
123$code.=<<___;
124.globl	${prefix}_set_encrypt_key
125.type	${prefix}_set_encrypt_key,%function
126.align	5
127${prefix}_set_encrypt_key:
128.Lenc_key:
129___
130$code.=<<___	if ($flavour =~ /64/);
131	AARCH64_VALID_CALL_TARGET
132	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
133	stp	x29,x30,[sp,#-16]!
134	add	x29,sp,#0
135___
136$code.=<<___;
137	mov	$ptr,#-1
138	cmp	$inp,#0
139	b.eq	.Lenc_key_abort
140	cmp	$out,#0
141	b.eq	.Lenc_key_abort
142	mov	$ptr,#-2
143	cmp	$bits,#128
144	b.lt	.Lenc_key_abort
145	cmp	$bits,#256
146	b.gt	.Lenc_key_abort
147	tst	$bits,#0x3f
148	b.ne	.Lenc_key_abort
149
150___
151$code.=<<___	if ($flavour =~ /64/);
152	adrp	$ptr,.Lrcon
153	add	$ptr,$ptr,:lo12:.Lrcon
154___
155$code.=<<___	if ($flavour =~ /32/);
156	adr	$ptr,.Lrcon
157___
158$code.=<<___;
159	cmp	$bits,#192
160
161	veor	$zero,$zero,$zero
162	vld1.8	{$in0},[$inp],#16
163	mov	$bits,#8		// reuse $bits
164	vld1.32	{$rcon,$mask},[$ptr],#32
165
166	b.lt	.Loop128
167	b.eq	.L192
168	b	.L256
169
170.align	4
171.Loop128:
172	vtbl.8	$key,{$in0},$mask
173	vext.8	$tmp,$zero,$in0,#12
174	vst1.32	{$in0},[$out],#16
175	aese	$key,$zero
176	subs	$bits,$bits,#1
177
178	veor	$in0,$in0,$tmp
179	vext.8	$tmp,$zero,$tmp,#12
180	veor	$in0,$in0,$tmp
181	vext.8	$tmp,$zero,$tmp,#12
182	 veor	$key,$key,$rcon
183	veor	$in0,$in0,$tmp
184	vshl.u8	$rcon,$rcon,#1
185	veor	$in0,$in0,$key
186	b.ne	.Loop128
187
188	vld1.32	{$rcon},[$ptr]
189
190	vtbl.8	$key,{$in0},$mask
191	vext.8	$tmp,$zero,$in0,#12
192	vst1.32	{$in0},[$out],#16
193	aese	$key,$zero
194
195	veor	$in0,$in0,$tmp
196	vext.8	$tmp,$zero,$tmp,#12
197	veor	$in0,$in0,$tmp
198	vext.8	$tmp,$zero,$tmp,#12
199	 veor	$key,$key,$rcon
200	veor	$in0,$in0,$tmp
201	vshl.u8	$rcon,$rcon,#1
202	veor	$in0,$in0,$key
203
204	vtbl.8	$key,{$in0},$mask
205	vext.8	$tmp,$zero,$in0,#12
206	vst1.32	{$in0},[$out],#16
207	aese	$key,$zero
208
209	veor	$in0,$in0,$tmp
210	vext.8	$tmp,$zero,$tmp,#12
211	veor	$in0,$in0,$tmp
212	vext.8	$tmp,$zero,$tmp,#12
213	 veor	$key,$key,$rcon
214	veor	$in0,$in0,$tmp
215	veor	$in0,$in0,$key
216	vst1.32	{$in0},[$out]
217	add	$out,$out,#0x50
218
219	mov	$rounds,#10
220	b	.Ldone
221
222.align	4
223.L192:
224	vld1.8	{$in1},[$inp],#8
225	vmov.i8	$key,#8			// borrow $key
226	vst1.32	{$in0},[$out],#16
227	vsub.i8	$mask,$mask,$key	// adjust the mask
228
229.Loop192:
230	vtbl.8	$key,{$in1},$mask
231	vext.8	$tmp,$zero,$in0,#12
232#ifdef __ARMEB__
233	vst1.32	{$in1},[$out],#16
234	sub	$out,$out,#8
235#else
236	vst1.32	{$in1},[$out],#8
237#endif
238	aese	$key,$zero
239	subs	$bits,$bits,#1
240
241	veor	$in0,$in0,$tmp
242	vext.8	$tmp,$zero,$tmp,#12
243	veor	$in0,$in0,$tmp
244	vext.8	$tmp,$zero,$tmp,#12
245	veor	$in0,$in0,$tmp
246
247	vdup.32	$tmp,${in0}[3]
248	veor	$tmp,$tmp,$in1
249	 veor	$key,$key,$rcon
250	vext.8	$in1,$zero,$in1,#12
251	vshl.u8	$rcon,$rcon,#1
252	veor	$in1,$in1,$tmp
253	veor	$in0,$in0,$key
254	veor	$in1,$in1,$key
255	vst1.32	{$in0},[$out],#16
256	b.ne	.Loop192
257
258	mov	$rounds,#12
259	add	$out,$out,#0x20
260	b	.Ldone
261
262.align	4
263.L256:
264	vld1.8	{$in1},[$inp]
265	mov	$bits,#7
266	mov	$rounds,#14
267	vst1.32	{$in0},[$out],#16
268
269.Loop256:
270	vtbl.8	$key,{$in1},$mask
271	vext.8	$tmp,$zero,$in0,#12
272	vst1.32	{$in1},[$out],#16
273	aese	$key,$zero
274	subs	$bits,$bits,#1
275
276	veor	$in0,$in0,$tmp
277	vext.8	$tmp,$zero,$tmp,#12
278	veor	$in0,$in0,$tmp
279	vext.8	$tmp,$zero,$tmp,#12
280	 veor	$key,$key,$rcon
281	veor	$in0,$in0,$tmp
282	vshl.u8	$rcon,$rcon,#1
283	veor	$in0,$in0,$key
284	vst1.32	{$in0},[$out],#16
285	b.eq	.Ldone
286
287	vdup.32	$key,${in0}[3]		// just splat
288	vext.8	$tmp,$zero,$in1,#12
289	aese	$key,$zero
290
291	veor	$in1,$in1,$tmp
292	vext.8	$tmp,$zero,$tmp,#12
293	veor	$in1,$in1,$tmp
294	vext.8	$tmp,$zero,$tmp,#12
295	veor	$in1,$in1,$tmp
296
297	veor	$in1,$in1,$key
298	b	.Loop256
299
300.Ldone:
301	str	$rounds,[$out]
302	mov	$ptr,#0
303
304.Lenc_key_abort:
305	mov	x0,$ptr			// return value
306	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
307	ret
308.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
309
310.globl	${prefix}_set_decrypt_key
311.type	${prefix}_set_decrypt_key,%function
312.align	5
313${prefix}_set_decrypt_key:
314___
315$code.=<<___	if ($flavour =~ /64/);
316	AARCH64_SIGN_LINK_REGISTER
317	stp	x29,x30,[sp,#-16]!
318	add	x29,sp,#0
319___
320$code.=<<___	if ($flavour !~ /64/);
321	stmdb	sp!,{r4,lr}
322___
323$code.=<<___;
324	bl	.Lenc_key
325
326	cmp	x0,#0
327	b.ne	.Ldec_key_abort
328
329	sub	$out,$out,#240		// restore original $out
330	mov	x4,#-16
331	add	$inp,$out,x12,lsl#4	// end of key schedule
332
333	vld1.32	{v0.16b},[$out]
334	vld1.32	{v1.16b},[$inp]
335	vst1.32	{v0.16b},[$inp],x4
336	vst1.32	{v1.16b},[$out],#16
337
338.Loop_imc:
339	vld1.32	{v0.16b},[$out]
340	vld1.32	{v1.16b},[$inp]
341	aesimc	v0.16b,v0.16b
342	aesimc	v1.16b,v1.16b
343	vst1.32	{v0.16b},[$inp],x4
344	vst1.32	{v1.16b},[$out],#16
345	cmp	$inp,$out
346	b.hi	.Loop_imc
347
348	vld1.32	{v0.16b},[$out]
349	aesimc	v0.16b,v0.16b
350	vst1.32	{v0.16b},[$inp]
351
352	eor	x0,x0,x0		// return value
353.Ldec_key_abort:
354___
355$code.=<<___	if ($flavour !~ /64/);
356	ldmia	sp!,{r4,pc}
357___
358$code.=<<___	if ($flavour =~ /64/);
359	ldp	x29,x30,[sp],#16
360	AARCH64_VALIDATE_LINK_REGISTER
361	ret
362___
363$code.=<<___;
364.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
365___
366}}}
367{{{
368sub gen_block () {
369my $dir = shift;
370my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
371my ($inp,$out,$key)=map("x$_",(0..2));
372my $rounds="w3";
373my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
374
375$code.=<<___;
376.globl	${prefix}_${dir}crypt
377.type	${prefix}_${dir}crypt,%function
378.align	5
379${prefix}_${dir}crypt:
380___
381$code.=<<___	if ($flavour =~ /64/);
382	AARCH64_VALID_CALL_TARGET
383___
384$code.=<<___;
385	ldr	$rounds,[$key,#240]
386	vld1.32	{$rndkey0},[$key],#16
387	vld1.8	{$inout},[$inp]
388	sub	$rounds,$rounds,#2
389	vld1.32	{$rndkey1},[$key],#16
390
391.Loop_${dir}c:
392	aes$e	$inout,$rndkey0
393	aes$mc	$inout,$inout
394	vld1.32	{$rndkey0},[$key],#16
395	subs	$rounds,$rounds,#2
396	aes$e	$inout,$rndkey1
397	aes$mc	$inout,$inout
398	vld1.32	{$rndkey1},[$key],#16
399	b.gt	.Loop_${dir}c
400
401	aes$e	$inout,$rndkey0
402	aes$mc	$inout,$inout
403	vld1.32	{$rndkey0},[$key]
404	aes$e	$inout,$rndkey1
405	veor	$inout,$inout,$rndkey0
406
407	vst1.8	{$inout},[$out]
408	ret
409.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
410___
411}
412&gen_block("en");
413&gen_block("de");
414}}}
415
416# Performance in cycles per byte.
417# Processed with AES-ECB with different key sizes.
418# It shows the value before and after optimization as below:
419# (before/after):
420#
421#		AES-128-ECB		AES-192-ECB		AES-256-ECB
422# Cortex-A57	1.85/0.82		2.16/0.96		2.47/1.10
423# Cortex-A72	1.64/0.85		1.82/0.99		2.13/1.14
424
425# Optimization is implemented by loop unrolling and interleaving.
426# Commonly, we choose the unrolling factor as 5; if the input
427# data size is smaller than 5 blocks, but not smaller than 3 blocks,
428# we choose 3 as the unrolling factor.
429# If the input data size dsize >= 5*16 bytes, then take 5 blocks
430# as one iteration, every loop the left size lsize -= 5*16.
431# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
432# every loop lsize -=3*16.
433# If lsize < 3*16 bytes, treat them as the tail, interleave the
434# two blocks AES instructions.
435# There is one special case, if the original input data size dsize
436# = 16 bytes, we will treat it separately to improve the
437# performance: one independent code block without LR, FP load and
438# store, just looks like what the original ECB implementation does.
439
440{{{
441my ($inp,$out,$len,$key)=map("x$_",(0..3));
442my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
443my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
444
445my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
446
447### q7	last round key
448### q10-q15	q7 Last 7 round keys
449### q8-q9	preloaded round keys except last 7 keys for big size
450### q5, q6, q8-q9	preloaded round keys except last 7 keys for only 16 byte
451
452{
453my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
454
455my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
456my ($dat4,$in4,$tmp4);
457if ($flavour =~ /64/) {
458    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
459}
460
461$code.=<<___;
462.globl	${prefix}_ecb_encrypt
463.type	${prefix}_ecb_encrypt,%function
464.align	5
465${prefix}_ecb_encrypt:
466___
467$code.=<<___	if ($flavour =~ /64/);
468	AARCH64_VALID_CALL_TARGET
469	subs	$len,$len,#16
470	// Original input data size bigger than 16, jump to big size processing.
471	b.ne    .Lecb_big_size
472	vld1.8	{$dat0},[$inp]
473	cmp	$enc,#0					// en- or decrypting?
474	ldr	$rounds,[$key,#240]
475	vld1.32	{q5-q6},[$key],#32			// load key schedule...
476
477	b.eq .Lecb_small_dec
478	aese	$dat0,q5
479	aesmc	$dat0,$dat0
480	vld1.32	{q8-q9},[$key],#32			// load key schedule...
481	aese	$dat0,q6
482	aesmc	$dat0,$dat0
483	subs	$rounds,$rounds,#10			// if rounds==10, jump to aes-128-ecb processing
484	b.eq    .Lecb_128_enc
485.Lecb_round_loop:
486	aese	$dat0,q8
487	aesmc	$dat0,$dat0
488	vld1.32	{q8},[$key],#16				// load key schedule...
489	aese	$dat0,q9
490	aesmc	$dat0,$dat0
491	vld1.32	{q9},[$key],#16				// load key schedule...
492	subs	$rounds,$rounds,#2			// bias
493	b.gt    .Lecb_round_loop
494.Lecb_128_enc:
495	vld1.32	{q10-q11},[$key],#32		// load key schedule...
496	aese	$dat0,q8
497	aesmc	$dat0,$dat0
498	aese	$dat0,q9
499	aesmc	$dat0,$dat0
500	vld1.32	{q12-q13},[$key],#32		// load key schedule...
501	aese	$dat0,q10
502	aesmc	$dat0,$dat0
503	aese	$dat0,q11
504	aesmc	$dat0,$dat0
505	vld1.32	{q14-q15},[$key],#32		// load key schedule...
506	aese	$dat0,q12
507	aesmc	$dat0,$dat0
508	aese	$dat0,q13
509	aesmc	$dat0,$dat0
510	vld1.32	{$rndlast},[$key]
511	aese	$dat0,q14
512	aesmc	$dat0,$dat0
513	aese	$dat0,q15
514	veor	$dat0,$dat0,$rndlast
515	vst1.8	{$dat0},[$out]
516	b	.Lecb_Final_abort
517.Lecb_small_dec:
518	aesd	$dat0,q5
519	aesimc	$dat0,$dat0
520	vld1.32	{q8-q9},[$key],#32			// load key schedule...
521	aesd	$dat0,q6
522	aesimc	$dat0,$dat0
523	subs	$rounds,$rounds,#10			// bias
524	b.eq    .Lecb_128_dec
525.Lecb_dec_round_loop:
526	aesd	$dat0,q8
527	aesimc	$dat0,$dat0
528	vld1.32	{q8},[$key],#16				// load key schedule...
529	aesd	$dat0,q9
530	aesimc	$dat0,$dat0
531	vld1.32	{q9},[$key],#16				// load key schedule...
532	subs	$rounds,$rounds,#2			// bias
533	b.gt    .Lecb_dec_round_loop
534.Lecb_128_dec:
535	vld1.32	{q10-q11},[$key],#32		// load key schedule...
536	aesd	$dat0,q8
537	aesimc	$dat0,$dat0
538	aesd	$dat0,q9
539	aesimc	$dat0,$dat0
540	vld1.32	{q12-q13},[$key],#32		// load key schedule...
541	aesd	$dat0,q10
542	aesimc	$dat0,$dat0
543	aesd	$dat0,q11
544	aesimc	$dat0,$dat0
545	vld1.32	{q14-q15},[$key],#32		// load key schedule...
546	aesd	$dat0,q12
547	aesimc	$dat0,$dat0
548	aesd	$dat0,q13
549	aesimc	$dat0,$dat0
550	vld1.32	{$rndlast},[$key]
551	aesd	$dat0,q14
552	aesimc	$dat0,$dat0
553	aesd	$dat0,q15
554	veor	$dat0,$dat0,$rndlast
555	vst1.8	{$dat0},[$out]
556	b	.Lecb_Final_abort
557.Lecb_big_size:
558___
559$code.=<<___	if ($flavour =~ /64/);
560	stp	x29,x30,[sp,#-16]!
561	add	x29,sp,#0
562___
563$code.=<<___	if ($flavour !~ /64/);
564	mov	ip,sp
565	stmdb	sp!,{r4-r8,lr}
566	vstmdb	sp!,{d8-d15}			@ ABI specification says so
567	ldmia	ip,{r4-r5}			@ load remaining args
568	subs	$len,$len,#16
569___
570$code.=<<___;
571	mov	$step,#16
572	b.lo	.Lecb_done
573	cclr	$step,eq
574
575	cmp	$enc,#0					// en- or decrypting?
576	ldr	$rounds,[$key,#240]
577	and	$len,$len,#-16
578	vld1.8	{$dat},[$inp],$step
579
580	vld1.32	{q8-q9},[$key]				// load key schedule...
581	sub	$rounds,$rounds,#6
582	add	$key_,$key,x5,lsl#4				// pointer to last 7 round keys
583	sub	$rounds,$rounds,#2
584	vld1.32	{q10-q11},[$key_],#32
585	vld1.32	{q12-q13},[$key_],#32
586	vld1.32	{q14-q15},[$key_],#32
587	vld1.32	{$rndlast},[$key_]
588
589	add	$key_,$key,#32
590	mov	$cnt,$rounds
591	b.eq	.Lecb_dec
592
593	vld1.8	{$dat1},[$inp],#16
594	subs	$len,$len,#32				// bias
595	add	$cnt,$rounds,#2
596	vorr	$in1,$dat1,$dat1
597	vorr	$dat2,$dat1,$dat1
598	vorr	$dat1,$dat,$dat
599	b.lo	.Lecb_enc_tail
600
601	vorr	$dat1,$in1,$in1
602	vld1.8	{$dat2},[$inp],#16
603___
604$code.=<<___	if ($flavour =~ /64/);
605	cmp	$len,#32
606	b.lo	.Loop3x_ecb_enc
607
608	vld1.8	{$dat3},[$inp],#16
609	vld1.8	{$dat4},[$inp],#16
610	sub	$len,$len,#32				// bias
611	mov	$cnt,$rounds
612
613.Loop5x_ecb_enc:
614	aese	$dat0,q8
615	aesmc	$dat0,$dat0
616	aese	$dat1,q8
617	aesmc	$dat1,$dat1
618	aese	$dat2,q8
619	aesmc	$dat2,$dat2
620	aese	$dat3,q8
621	aesmc	$dat3,$dat3
622	aese	$dat4,q8
623	aesmc	$dat4,$dat4
624	vld1.32	{q8},[$key_],#16
625	subs	$cnt,$cnt,#2
626	aese	$dat0,q9
627	aesmc	$dat0,$dat0
628	aese	$dat1,q9
629	aesmc	$dat1,$dat1
630	aese	$dat2,q9
631	aesmc	$dat2,$dat2
632	aese	$dat3,q9
633	aesmc	$dat3,$dat3
634	aese	$dat4,q9
635	aesmc	$dat4,$dat4
636	vld1.32	{q9},[$key_],#16
637	b.gt	.Loop5x_ecb_enc
638
639	aese	$dat0,q8
640	aesmc	$dat0,$dat0
641	aese	$dat1,q8
642	aesmc	$dat1,$dat1
643	aese	$dat2,q8
644	aesmc	$dat2,$dat2
645	aese	$dat3,q8
646	aesmc	$dat3,$dat3
647	aese	$dat4,q8
648	aesmc	$dat4,$dat4
649	cmp	$len,#0x40					// because .Lecb_enc_tail4x
650	sub	$len,$len,#0x50
651
652	aese	$dat0,q9
653	aesmc	$dat0,$dat0
654	aese	$dat1,q9
655	aesmc	$dat1,$dat1
656	aese	$dat2,q9
657	aesmc	$dat2,$dat2
658	aese	$dat3,q9
659	aesmc	$dat3,$dat3
660	aese	$dat4,q9
661	aesmc	$dat4,$dat4
662	csel	x6,xzr,$len,gt			// borrow x6, $cnt, "gt" is not typo
663	mov	$key_,$key
664
665	aese	$dat0,q10
666	aesmc	$dat0,$dat0
667	aese	$dat1,q10
668	aesmc	$dat1,$dat1
669	aese	$dat2,q10
670	aesmc	$dat2,$dat2
671	aese	$dat3,q10
672	aesmc	$dat3,$dat3
673	aese	$dat4,q10
674	aesmc	$dat4,$dat4
675	add	$inp,$inp,x6				// $inp is adjusted in such way that
676							// at exit from the loop $dat1-$dat4
677							// are loaded with last "words"
678	add	x6,$len,#0x60		    // because .Lecb_enc_tail4x
679
680	aese	$dat0,q11
681	aesmc	$dat0,$dat0
682	aese	$dat1,q11
683	aesmc	$dat1,$dat1
684	aese	$dat2,q11
685	aesmc	$dat2,$dat2
686	aese	$dat3,q11
687	aesmc	$dat3,$dat3
688	aese	$dat4,q11
689	aesmc	$dat4,$dat4
690
691	aese	$dat0,q12
692	aesmc	$dat0,$dat0
693	aese	$dat1,q12
694	aesmc	$dat1,$dat1
695	aese	$dat2,q12
696	aesmc	$dat2,$dat2
697	aese	$dat3,q12
698	aesmc	$dat3,$dat3
699	aese	$dat4,q12
700	aesmc	$dat4,$dat4
701
702	aese	$dat0,q13
703	aesmc	$dat0,$dat0
704	aese	$dat1,q13
705	aesmc	$dat1,$dat1
706	aese	$dat2,q13
707	aesmc	$dat2,$dat2
708	aese	$dat3,q13
709	aesmc	$dat3,$dat3
710	aese	$dat4,q13
711	aesmc	$dat4,$dat4
712
713	aese	$dat0,q14
714	aesmc	$dat0,$dat0
715	aese	$dat1,q14
716	aesmc	$dat1,$dat1
717	aese	$dat2,q14
718	aesmc	$dat2,$dat2
719	aese	$dat3,q14
720	aesmc	$dat3,$dat3
721	aese	$dat4,q14
722	aesmc	$dat4,$dat4
723
724	aese	$dat0,q15
725	vld1.8	{$in0},[$inp],#16
726	aese	$dat1,q15
727	vld1.8	{$in1},[$inp],#16
728	aese	$dat2,q15
729	vld1.8	{$in2},[$inp],#16
730	aese	$dat3,q15
731	vld1.8	{$in3},[$inp],#16
732	aese	$dat4,q15
733	vld1.8	{$in4},[$inp],#16
734	cbz	x6,.Lecb_enc_tail4x
735	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
736	veor	$tmp0,$rndlast,$dat0
737	vorr	$dat0,$in0,$in0
738	veor	$tmp1,$rndlast,$dat1
739	vorr	$dat1,$in1,$in1
740	veor	$tmp2,$rndlast,$dat2
741	vorr	$dat2,$in2,$in2
742	veor	$tmp3,$rndlast,$dat3
743	vorr	$dat3,$in3,$in3
744	veor	$tmp4,$rndlast,$dat4
745	vst1.8	{$tmp0},[$out],#16
746	vorr	$dat4,$in4,$in4
747	vst1.8	{$tmp1},[$out],#16
748	mov	$cnt,$rounds
749	vst1.8	{$tmp2},[$out],#16
750	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
751	vst1.8	{$tmp3},[$out],#16
752	vst1.8	{$tmp4},[$out],#16
753	b.hs	.Loop5x_ecb_enc
754
755	add	$len,$len,#0x50
756	cbz	$len,.Lecb_done
757
758	add	$cnt,$rounds,#2
759	subs	$len,$len,#0x30
760	vorr	$dat0,$in2,$in2
761	vorr	$dat1,$in3,$in3
762	vorr	$dat2,$in4,$in4
763	b.lo	.Lecb_enc_tail
764
765	b	.Loop3x_ecb_enc
766
767.align	4
768.Lecb_enc_tail4x:
769	veor	$tmp1,$rndlast,$dat1
770	veor	$tmp2,$rndlast,$dat2
771	veor	$tmp3,$rndlast,$dat3
772	veor	$tmp4,$rndlast,$dat4
773	vst1.8	{$tmp1},[$out],#16
774	vst1.8	{$tmp2},[$out],#16
775	vst1.8	{$tmp3},[$out],#16
776	vst1.8	{$tmp4},[$out],#16
777
778	b	.Lecb_done
779.align	4
780___
781$code.=<<___;
782.Loop3x_ecb_enc:
783	aese	$dat0,q8
784	aesmc	$dat0,$dat0
785	aese	$dat1,q8
786	aesmc	$dat1,$dat1
787	aese	$dat2,q8
788	aesmc	$dat2,$dat2
789	vld1.32	{q8},[$key_],#16
790	subs	$cnt,$cnt,#2
791	aese	$dat0,q9
792	aesmc	$dat0,$dat0
793	aese	$dat1,q9
794	aesmc	$dat1,$dat1
795	aese	$dat2,q9
796	aesmc	$dat2,$dat2
797	vld1.32	{q9},[$key_],#16
798	b.gt	.Loop3x_ecb_enc
799
800	aese	$dat0,q8
801	aesmc	$dat0,$dat0
802	aese	$dat1,q8
803	aesmc	$dat1,$dat1
804	aese	$dat2,q8
805	aesmc	$dat2,$dat2
806	subs	$len,$len,#0x30
807	mov.lo	x6,$len				// x6, $cnt, is zero at this point
808	aese	$dat0,q9
809	aesmc	$dat0,$dat0
810	aese	$dat1,q9
811	aesmc	$dat1,$dat1
812	aese	$dat2,q9
813	aesmc	$dat2,$dat2
814	add	$inp,$inp,x6			// $inp is adjusted in such way that
815						// at exit from the loop $dat1-$dat2
816						// are loaded with last "words"
817	mov	$key_,$key
818	aese	$dat0,q12
819	aesmc	$dat0,$dat0
820	aese	$dat1,q12
821	aesmc	$dat1,$dat1
822	aese	$dat2,q12
823	aesmc	$dat2,$dat2
824	vld1.8	{$in0},[$inp],#16
825	aese	$dat0,q13
826	aesmc	$dat0,$dat0
827	aese	$dat1,q13
828	aesmc	$dat1,$dat1
829	aese	$dat2,q13
830	aesmc	$dat2,$dat2
831	vld1.8	{$in1},[$inp],#16
832	aese	$dat0,q14
833	aesmc	$dat0,$dat0
834	aese	$dat1,q14
835	aesmc	$dat1,$dat1
836	aese	$dat2,q14
837	aesmc	$dat2,$dat2
838	vld1.8	{$in2},[$inp],#16
839	aese	$dat0,q15
840	aese	$dat1,q15
841	aese	$dat2,q15
842	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
843	add	$cnt,$rounds,#2
844	veor	$tmp0,$rndlast,$dat0
845	veor	$tmp1,$rndlast,$dat1
846	veor	$dat2,$dat2,$rndlast
847	vld1.32 {q9},[$key_],#16		// re-pre-load rndkey[1]
848	vst1.8	{$tmp0},[$out],#16
849	vorr	$dat0,$in0,$in0
850	vst1.8	{$tmp1},[$out],#16
851	vorr	$dat1,$in1,$in1
852	vst1.8	{$dat2},[$out],#16
853	vorr	$dat2,$in2,$in2
854	b.hs	.Loop3x_ecb_enc
855
856	cmn	$len,#0x30
857	b.eq	.Lecb_done
858	nop
859
860.Lecb_enc_tail:
861	aese	$dat1,q8
862	aesmc	$dat1,$dat1
863	aese	$dat2,q8
864	aesmc	$dat2,$dat2
865	vld1.32	{q8},[$key_],#16
866	subs	$cnt,$cnt,#2
867	aese	$dat1,q9
868	aesmc	$dat1,$dat1
869	aese	$dat2,q9
870	aesmc	$dat2,$dat2
871	vld1.32	{q9},[$key_],#16
872	b.gt	.Lecb_enc_tail
873
874	aese	$dat1,q8
875	aesmc	$dat1,$dat1
876	aese	$dat2,q8
877	aesmc	$dat2,$dat2
878	aese	$dat1,q9
879	aesmc	$dat1,$dat1
880	aese	$dat2,q9
881	aesmc	$dat2,$dat2
882	aese	$dat1,q12
883	aesmc	$dat1,$dat1
884	aese	$dat2,q12
885	aesmc	$dat2,$dat2
886	cmn	$len,#0x20
887	aese	$dat1,q13
888	aesmc	$dat1,$dat1
889	aese	$dat2,q13
890	aesmc	$dat2,$dat2
891	aese	$dat1,q14
892	aesmc	$dat1,$dat1
893	aese	$dat2,q14
894	aesmc	$dat2,$dat2
895	aese	$dat1,q15
896	aese	$dat2,q15
897	b.eq	.Lecb_enc_one
898	veor	$tmp1,$rndlast,$dat1
899	veor	$tmp2,$rndlast,$dat2
900	vst1.8	{$tmp1},[$out],#16
901	vst1.8	{$tmp2},[$out],#16
902	b	.Lecb_done
903
904.Lecb_enc_one:
905	veor	$tmp1,$rndlast,$dat2
906	vst1.8	{$tmp1},[$out],#16
907	b	.Lecb_done
908___
909
910$code.=<<___;
911.align	5
912.Lecb_dec:
913	vld1.8	{$dat1},[$inp],#16
914	subs	$len,$len,#32			// bias
915	add	$cnt,$rounds,#2
916	vorr	$in1,$dat1,$dat1
917	vorr	$dat2,$dat1,$dat1
918	vorr	$dat1,$dat,$dat
919	b.lo	.Lecb_dec_tail
920
921	vorr	$dat1,$in1,$in1
922	vld1.8	{$dat2},[$inp],#16
923___
924$code.=<<___	if ($flavour =~ /64/);
925	cmp	$len,#32
926	b.lo	.Loop3x_ecb_dec
927
928	vld1.8	{$dat3},[$inp],#16
929	vld1.8	{$dat4},[$inp],#16
930	sub	$len,$len,#32				// bias
931	mov	$cnt,$rounds
932
933.Loop5x_ecb_dec:
934	aesd	$dat0,q8
935	aesimc	$dat0,$dat0
936	aesd	$dat1,q8
937	aesimc	$dat1,$dat1
938	aesd	$dat2,q8
939	aesimc	$dat2,$dat2
940	aesd	$dat3,q8
941	aesimc	$dat3,$dat3
942	aesd	$dat4,q8
943	aesimc	$dat4,$dat4
944	vld1.32	{q8},[$key_],#16
945	subs	$cnt,$cnt,#2
946	aesd	$dat0,q9
947	aesimc	$dat0,$dat0
948	aesd	$dat1,q9
949	aesimc	$dat1,$dat1
950	aesd	$dat2,q9
951	aesimc	$dat2,$dat2
952	aesd	$dat3,q9
953	aesimc	$dat3,$dat3
954	aesd	$dat4,q9
955	aesimc	$dat4,$dat4
956	vld1.32	{q9},[$key_],#16
957	b.gt	.Loop5x_ecb_dec
958
959	aesd	$dat0,q8
960	aesimc	$dat0,$dat0
961	aesd	$dat1,q8
962	aesimc	$dat1,$dat1
963	aesd	$dat2,q8
964	aesimc	$dat2,$dat2
965	aesd	$dat3,q8
966	aesimc	$dat3,$dat3
967	aesd	$dat4,q8
968	aesimc	$dat4,$dat4
969	cmp	$len,#0x40				// because .Lecb_tail4x
970	sub	$len,$len,#0x50
971
972	aesd	$dat0,q9
973	aesimc	$dat0,$dat0
974	aesd	$dat1,q9
975	aesimc	$dat1,$dat1
976	aesd	$dat2,q9
977	aesimc	$dat2,$dat2
978	aesd	$dat3,q9
979	aesimc	$dat3,$dat3
980	aesd	$dat4,q9
981	aesimc	$dat4,$dat4
982	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
983	mov	$key_,$key
984
985	aesd	$dat0,q10
986	aesimc	$dat0,$dat0
987	aesd	$dat1,q10
988	aesimc	$dat1,$dat1
989	aesd	$dat2,q10
990	aesimc	$dat2,$dat2
991	aesd	$dat3,q10
992	aesimc	$dat3,$dat3
993	aesd	$dat4,q10
994	aesimc	$dat4,$dat4
995	add	$inp,$inp,x6				// $inp is adjusted in such way that
996							// at exit from the loop $dat1-$dat4
997							// are loaded with last "words"
998	add	x6,$len,#0x60			// because .Lecb_tail4x
999
1000	aesd	$dat0,q11
1001	aesimc	$dat0,$dat0
1002	aesd	$dat1,q11
1003	aesimc	$dat1,$dat1
1004	aesd	$dat2,q11
1005	aesimc	$dat2,$dat2
1006	aesd	$dat3,q11
1007	aesimc	$dat3,$dat3
1008	aesd	$dat4,q11
1009	aesimc	$dat4,$dat4
1010
1011	aesd	$dat0,q12
1012	aesimc	$dat0,$dat0
1013	aesd	$dat1,q12
1014	aesimc	$dat1,$dat1
1015	aesd	$dat2,q12
1016	aesimc	$dat2,$dat2
1017	aesd	$dat3,q12
1018	aesimc	$dat3,$dat3
1019	aesd	$dat4,q12
1020	aesimc	$dat4,$dat4
1021
1022	aesd	$dat0,q13
1023	aesimc	$dat0,$dat0
1024	aesd	$dat1,q13
1025	aesimc	$dat1,$dat1
1026	aesd	$dat2,q13
1027	aesimc	$dat2,$dat2
1028	aesd	$dat3,q13
1029	aesimc	$dat3,$dat3
1030	aesd	$dat4,q13
1031	aesimc	$dat4,$dat4
1032
1033	aesd	$dat0,q14
1034	aesimc	$dat0,$dat0
1035	aesd	$dat1,q14
1036	aesimc	$dat1,$dat1
1037	aesd	$dat2,q14
1038	aesimc	$dat2,$dat2
1039	aesd	$dat3,q14
1040	aesimc	$dat3,$dat3
1041	aesd	$dat4,q14
1042	aesimc	$dat4,$dat4
1043
1044	aesd	$dat0,q15
1045	vld1.8	{$in0},[$inp],#16
1046	aesd	$dat1,q15
1047	vld1.8	{$in1},[$inp],#16
1048	aesd	$dat2,q15
1049	vld1.8	{$in2},[$inp],#16
1050	aesd	$dat3,q15
1051	vld1.8	{$in3},[$inp],#16
1052	aesd	$dat4,q15
1053	vld1.8	{$in4},[$inp],#16
1054	cbz	x6,.Lecb_tail4x
1055	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
1056	veor	$tmp0,$rndlast,$dat0
1057	vorr	$dat0,$in0,$in0
1058	veor	$tmp1,$rndlast,$dat1
1059	vorr	$dat1,$in1,$in1
1060	veor	$tmp2,$rndlast,$dat2
1061	vorr	$dat2,$in2,$in2
1062	veor	$tmp3,$rndlast,$dat3
1063	vorr	$dat3,$in3,$in3
1064	veor	$tmp4,$rndlast,$dat4
1065	vst1.8	{$tmp0},[$out],#16
1066	vorr	$dat4,$in4,$in4
1067	vst1.8	{$tmp1},[$out],#16
1068	mov	$cnt,$rounds
1069	vst1.8	{$tmp2},[$out],#16
1070	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
1071	vst1.8	{$tmp3},[$out],#16
1072	vst1.8	{$tmp4},[$out],#16
1073	b.hs	.Loop5x_ecb_dec
1074
1075	add	$len,$len,#0x50
1076	cbz	$len,.Lecb_done
1077
1078	add	$cnt,$rounds,#2
1079	subs	$len,$len,#0x30
1080	vorr	$dat0,$in2,$in2
1081	vorr	$dat1,$in3,$in3
1082	vorr	$dat2,$in4,$in4
1083	b.lo	.Lecb_dec_tail
1084
1085	b	.Loop3x_ecb_dec
1086
1087.align	4
1088.Lecb_tail4x:
1089	veor	$tmp1,$rndlast,$dat1
1090	veor	$tmp2,$rndlast,$dat2
1091	veor	$tmp3,$rndlast,$dat3
1092	veor	$tmp4,$rndlast,$dat4
1093	vst1.8	{$tmp1},[$out],#16
1094	vst1.8	{$tmp2},[$out],#16
1095	vst1.8	{$tmp3},[$out],#16
1096	vst1.8	{$tmp4},[$out],#16
1097
1098	b	.Lecb_done
1099.align	4
1100___
1101$code.=<<___;
1102.Loop3x_ecb_dec:
1103	aesd	$dat0,q8
1104	aesimc	$dat0,$dat0
1105	aesd	$dat1,q8
1106	aesimc	$dat1,$dat1
1107	aesd	$dat2,q8
1108	aesimc	$dat2,$dat2
1109	vld1.32	{q8},[$key_],#16
1110	subs	$cnt,$cnt,#2
1111	aesd	$dat0,q9
1112	aesimc	$dat0,$dat0
1113	aesd	$dat1,q9
1114	aesimc	$dat1,$dat1
1115	aesd	$dat2,q9
1116	aesimc	$dat2,$dat2
1117	vld1.32	{q9},[$key_],#16
1118	b.gt	.Loop3x_ecb_dec
1119
1120	aesd	$dat0,q8
1121	aesimc	$dat0,$dat0
1122	aesd	$dat1,q8
1123	aesimc	$dat1,$dat1
1124	aesd	$dat2,q8
1125	aesimc	$dat2,$dat2
1126	subs	$len,$len,#0x30
1127	mov.lo	x6,$len				// x6, $cnt, is zero at this point
1128	aesd	$dat0,q9
1129	aesimc	$dat0,$dat0
1130	aesd	$dat1,q9
1131	aesimc	$dat1,$dat1
1132	aesd	$dat2,q9
1133	aesimc	$dat2,$dat2
1134	add	$inp,$inp,x6 			// $inp is adjusted in such way that
1135						// at exit from the loop $dat1-$dat2
1136						// are loaded with last "words"
1137	mov	$key_,$key
1138	aesd	$dat0,q12
1139	aesimc	$dat0,$dat0
1140	aesd	$dat1,q12
1141	aesimc	$dat1,$dat1
1142	aesd	$dat2,q12
1143	aesimc	$dat2,$dat2
1144	vld1.8	{$in0},[$inp],#16
1145	aesd	$dat0,q13
1146	aesimc	$dat0,$dat0
1147	aesd	$dat1,q13
1148	aesimc	$dat1,$dat1
1149	aesd	$dat2,q13
1150	aesimc	$dat2,$dat2
1151	vld1.8	{$in1},[$inp],#16
1152	aesd	$dat0,q14
1153	aesimc	$dat0,$dat0
1154	aesd	$dat1,q14
1155	aesimc	$dat1,$dat1
1156	aesd	$dat2,q14
1157	aesimc	$dat2,$dat2
1158	vld1.8	{$in2},[$inp],#16
1159	aesd	$dat0,q15
1160	aesd	$dat1,q15
1161	aesd	$dat2,q15
1162	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
1163	add	$cnt,$rounds,#2
1164	veor	$tmp0,$rndlast,$dat0
1165	veor	$tmp1,$rndlast,$dat1
1166	veor	$dat2,$dat2,$rndlast
1167	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
1168	vst1.8	{$tmp0},[$out],#16
1169	vorr	$dat0,$in0,$in0
1170	vst1.8	{$tmp1},[$out],#16
1171	vorr	$dat1,$in1,$in1
1172	vst1.8	{$dat2},[$out],#16
1173	vorr	$dat2,$in2,$in2
1174	b.hs	.Loop3x_ecb_dec
1175
1176	cmn	$len,#0x30
1177	b.eq	.Lecb_done
1178	nop
1179
1180.Lecb_dec_tail:
1181	aesd	$dat1,q8
1182	aesimc	$dat1,$dat1
1183	aesd	$dat2,q8
1184	aesimc	$dat2,$dat2
1185	vld1.32	{q8},[$key_],#16
1186	subs	$cnt,$cnt,#2
1187	aesd	$dat1,q9
1188	aesimc	$dat1,$dat1
1189	aesd	$dat2,q9
1190	aesimc	$dat2,$dat2
1191	vld1.32	{q9},[$key_],#16
1192	b.gt	.Lecb_dec_tail
1193
1194	aesd	$dat1,q8
1195	aesimc	$dat1,$dat1
1196	aesd	$dat2,q8
1197	aesimc	$dat2,$dat2
1198	aesd	$dat1,q9
1199	aesimc	$dat1,$dat1
1200	aesd	$dat2,q9
1201	aesimc	$dat2,$dat2
1202	aesd	$dat1,q12
1203	aesimc	$dat1,$dat1
1204	aesd	$dat2,q12
1205	aesimc	$dat2,$dat2
1206	cmn	$len,#0x20
1207	aesd	$dat1,q13
1208	aesimc	$dat1,$dat1
1209	aesd	$dat2,q13
1210	aesimc	$dat2,$dat2
1211	aesd	$dat1,q14
1212	aesimc	$dat1,$dat1
1213	aesd	$dat2,q14
1214	aesimc	$dat2,$dat2
1215	aesd	$dat1,q15
1216	aesd	$dat2,q15
1217	b.eq	.Lecb_dec_one
1218	veor	$tmp1,$rndlast,$dat1
1219	veor	$tmp2,$rndlast,$dat2
1220	vst1.8	{$tmp1},[$out],#16
1221	vst1.8	{$tmp2},[$out],#16
1222	b	.Lecb_done
1223
1224.Lecb_dec_one:
1225	veor	$tmp1,$rndlast,$dat2
1226	vst1.8	{$tmp1},[$out],#16
1227
1228.Lecb_done:
1229___
1230}
1231$code.=<<___	if ($flavour !~ /64/);
1232	vldmia	sp!,{d8-d15}
1233	ldmia	sp!,{r4-r8,pc}
1234___
1235$code.=<<___	if ($flavour =~ /64/);
1236	ldr	x29,[sp],#16
1237___
1238$code.=<<___	if ($flavour =~ /64/);
1239.Lecb_Final_abort:
1240	ret
1241___
1242$code.=<<___;
1243.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
1244___
1245}}}
1246{{{
1247my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1248my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1249my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1250
1251my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1252my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1253
1254### q8-q15	preloaded key schedule
1255
1256$code.=<<___;
1257.globl	${prefix}_cbc_encrypt
1258.type	${prefix}_cbc_encrypt,%function
1259.align	5
1260${prefix}_cbc_encrypt:
1261___
1262$code.=<<___	if ($flavour =~ /64/);
1263	AARCH64_VALID_CALL_TARGET
1264	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1265	stp	x29,x30,[sp,#-16]!
1266	add	x29,sp,#0
1267___
1268$code.=<<___	if ($flavour !~ /64/);
1269	mov	ip,sp
1270	stmdb	sp!,{r4-r8,lr}
1271	vstmdb	sp!,{d8-d15}            @ ABI specification says so
1272	ldmia	ip,{r4-r5}		@ load remaining args
1273___
1274$code.=<<___;
1275	subs	$len,$len,#16
1276	mov	$step,#16
1277	b.lo	.Lcbc_abort
1278	cclr	$step,eq
1279
1280	cmp	$enc,#0			// en- or decrypting?
1281	ldr	$rounds,[$key,#240]
1282	and	$len,$len,#-16
1283	vld1.8	{$ivec},[$ivp]
1284	vld1.8	{$dat},[$inp],$step
1285
1286	vld1.32	{q8-q9},[$key]		// load key schedule...
1287	sub	$rounds,$rounds,#6
1288	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
1289	sub	$rounds,$rounds,#2
1290	vld1.32	{q10-q11},[$key_],#32
1291	vld1.32	{q12-q13},[$key_],#32
1292	vld1.32	{q14-q15},[$key_],#32
1293	vld1.32	{$rndlast},[$key_]
1294
1295	add	$key_,$key,#32
1296	mov	$cnt,$rounds
1297	b.eq	.Lcbc_dec
1298
1299	cmp	$rounds,#2
1300	veor	$dat,$dat,$ivec
1301	veor	$rndzero_n_last,q8,$rndlast
1302	b.eq	.Lcbc_enc128
1303
1304	vld1.32	{$in0-$in1},[$key_]
1305	add	$key_,$key,#16
1306	add	$key4,$key,#16*4
1307	add	$key5,$key,#16*5
1308	aese	$dat,q8
1309	aesmc	$dat,$dat
1310	add	$key6,$key,#16*6
1311	add	$key7,$key,#16*7
1312	b	.Lenter_cbc_enc
1313
1314.align	4
1315.Loop_cbc_enc:
1316	aese	$dat,q8
1317	aesmc	$dat,$dat
1318	 vst1.8	{$ivec},[$out],#16
1319.Lenter_cbc_enc:
1320	aese	$dat,q9
1321	aesmc	$dat,$dat
1322	aese	$dat,$in0
1323	aesmc	$dat,$dat
1324	vld1.32	{q8},[$key4]
1325	cmp	$rounds,#4
1326	aese	$dat,$in1
1327	aesmc	$dat,$dat
1328	vld1.32	{q9},[$key5]
1329	b.eq	.Lcbc_enc192
1330
1331	aese	$dat,q8
1332	aesmc	$dat,$dat
1333	vld1.32	{q8},[$key6]
1334	aese	$dat,q9
1335	aesmc	$dat,$dat
1336	vld1.32	{q9},[$key7]
1337	nop
1338
1339.Lcbc_enc192:
1340	aese	$dat,q8
1341	aesmc	$dat,$dat
1342	 subs	$len,$len,#16
1343	aese	$dat,q9
1344	aesmc	$dat,$dat
1345	 cclr	$step,eq
1346	aese	$dat,q10
1347	aesmc	$dat,$dat
1348	aese	$dat,q11
1349	aesmc	$dat,$dat
1350	 vld1.8	{q8},[$inp],$step
1351	aese	$dat,q12
1352	aesmc	$dat,$dat
1353	 veor	q8,q8,$rndzero_n_last
1354	aese	$dat,q13
1355	aesmc	$dat,$dat
1356	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
1357	aese	$dat,q14
1358	aesmc	$dat,$dat
1359	aese	$dat,q15
1360	veor	$ivec,$dat,$rndlast
1361	b.hs	.Loop_cbc_enc
1362
1363	vst1.8	{$ivec},[$out],#16
1364	b	.Lcbc_done
1365
1366.align	5
1367.Lcbc_enc128:
1368	vld1.32	{$in0-$in1},[$key_]
1369	aese	$dat,q8
1370	aesmc	$dat,$dat
1371	b	.Lenter_cbc_enc128
1372.Loop_cbc_enc128:
1373	aese	$dat,q8
1374	aesmc	$dat,$dat
1375	 vst1.8	{$ivec},[$out],#16
1376.Lenter_cbc_enc128:
1377	aese	$dat,q9
1378	aesmc	$dat,$dat
1379	 subs	$len,$len,#16
1380	aese	$dat,$in0
1381	aesmc	$dat,$dat
1382	 cclr	$step,eq
1383	aese	$dat,$in1
1384	aesmc	$dat,$dat
1385	aese	$dat,q10
1386	aesmc	$dat,$dat
1387	aese	$dat,q11
1388	aesmc	$dat,$dat
1389	 vld1.8	{q8},[$inp],$step
1390	aese	$dat,q12
1391	aesmc	$dat,$dat
1392	aese	$dat,q13
1393	aesmc	$dat,$dat
1394	aese	$dat,q14
1395	aesmc	$dat,$dat
1396	 veor	q8,q8,$rndzero_n_last
1397	aese	$dat,q15
1398	veor	$ivec,$dat,$rndlast
1399	b.hs	.Loop_cbc_enc128
1400
1401	vst1.8	{$ivec},[$out],#16
1402	b	.Lcbc_done
1403___
1404{
1405my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1406
1407my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
1408my ($dat4,$in4,$tmp4);
1409if ($flavour =~ /64/) {
1410    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1411}
1412
1413$code.=<<___;
1414.align	5
1415.Lcbc_dec:
1416	vld1.8	{$dat2},[$inp],#16
1417	subs	$len,$len,#32		// bias
1418	add	$cnt,$rounds,#2
1419	vorr	$in1,$dat,$dat
1420	vorr	$dat1,$dat,$dat
1421	vorr	$in2,$dat2,$dat2
1422	b.lo	.Lcbc_dec_tail
1423
1424	vorr	$dat1,$dat2,$dat2
1425	vld1.8	{$dat2},[$inp],#16
1426	vorr	$in0,$dat,$dat
1427	vorr	$in1,$dat1,$dat1
1428	vorr	$in2,$dat2,$dat2
1429___
1430$code.=<<___	if ($flavour =~ /64/);
1431	cmp	$len,#32
1432	b.lo	.Loop3x_cbc_dec
1433
1434	vld1.8	{$dat3},[$inp],#16
1435	vld1.8	{$dat4},[$inp],#16
1436	sub	$len,$len,#32		// bias
1437	mov	$cnt,$rounds
1438	vorr	$in3,$dat3,$dat3
1439	vorr	$in4,$dat4,$dat4
1440
1441.Loop5x_cbc_dec:
1442	aesd	$dat0,q8
1443	aesimc	$dat0,$dat0
1444	aesd	$dat1,q8
1445	aesimc	$dat1,$dat1
1446	aesd	$dat2,q8
1447	aesimc	$dat2,$dat2
1448	aesd	$dat3,q8
1449	aesimc	$dat3,$dat3
1450	aesd	$dat4,q8
1451	aesimc	$dat4,$dat4
1452	vld1.32	{q8},[$key_],#16
1453	subs	$cnt,$cnt,#2
1454	aesd	$dat0,q9
1455	aesimc	$dat0,$dat0
1456	aesd	$dat1,q9
1457	aesimc	$dat1,$dat1
1458	aesd	$dat2,q9
1459	aesimc	$dat2,$dat2
1460	aesd	$dat3,q9
1461	aesimc	$dat3,$dat3
1462	aesd	$dat4,q9
1463	aesimc	$dat4,$dat4
1464	vld1.32	{q9},[$key_],#16
1465	b.gt	.Loop5x_cbc_dec
1466
1467	aesd	$dat0,q8
1468	aesimc	$dat0,$dat0
1469	aesd	$dat1,q8
1470	aesimc	$dat1,$dat1
1471	aesd	$dat2,q8
1472	aesimc	$dat2,$dat2
1473	aesd	$dat3,q8
1474	aesimc	$dat3,$dat3
1475	aesd	$dat4,q8
1476	aesimc	$dat4,$dat4
1477	 cmp	$len,#0x40		// because .Lcbc_tail4x
1478	 sub	$len,$len,#0x50
1479
1480	aesd	$dat0,q9
1481	aesimc	$dat0,$dat0
1482	aesd	$dat1,q9
1483	aesimc	$dat1,$dat1
1484	aesd	$dat2,q9
1485	aesimc	$dat2,$dat2
1486	aesd	$dat3,q9
1487	aesimc	$dat3,$dat3
1488	aesd	$dat4,q9
1489	aesimc	$dat4,$dat4
1490	 csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
1491	 mov	$key_,$key
1492
1493	aesd	$dat0,q10
1494	aesimc	$dat0,$dat0
1495	aesd	$dat1,q10
1496	aesimc	$dat1,$dat1
1497	aesd	$dat2,q10
1498	aesimc	$dat2,$dat2
1499	aesd	$dat3,q10
1500	aesimc	$dat3,$dat3
1501	aesd	$dat4,q10
1502	aesimc	$dat4,$dat4
1503	 add	$inp,$inp,x6		// $inp is adjusted in such way that
1504					// at exit from the loop $dat1-$dat4
1505					// are loaded with last "words"
1506	 add	x6,$len,#0x60		// because .Lcbc_tail4x
1507
1508	aesd	$dat0,q11
1509	aesimc	$dat0,$dat0
1510	aesd	$dat1,q11
1511	aesimc	$dat1,$dat1
1512	aesd	$dat2,q11
1513	aesimc	$dat2,$dat2
1514	aesd	$dat3,q11
1515	aesimc	$dat3,$dat3
1516	aesd	$dat4,q11
1517	aesimc	$dat4,$dat4
1518
1519	aesd	$dat0,q12
1520	aesimc	$dat0,$dat0
1521	aesd	$dat1,q12
1522	aesimc	$dat1,$dat1
1523	aesd	$dat2,q12
1524	aesimc	$dat2,$dat2
1525	aesd	$dat3,q12
1526	aesimc	$dat3,$dat3
1527	aesd	$dat4,q12
1528	aesimc	$dat4,$dat4
1529
1530	aesd	$dat0,q13
1531	aesimc	$dat0,$dat0
1532	aesd	$dat1,q13
1533	aesimc	$dat1,$dat1
1534	aesd	$dat2,q13
1535	aesimc	$dat2,$dat2
1536	aesd	$dat3,q13
1537	aesimc	$dat3,$dat3
1538	aesd	$dat4,q13
1539	aesimc	$dat4,$dat4
1540
1541	aesd	$dat0,q14
1542	aesimc	$dat0,$dat0
1543	aesd	$dat1,q14
1544	aesimc	$dat1,$dat1
1545	aesd	$dat2,q14
1546	aesimc	$dat2,$dat2
1547	aesd	$dat3,q14
1548	aesimc	$dat3,$dat3
1549	aesd	$dat4,q14
1550	aesimc	$dat4,$dat4
1551
1552	 veor	$tmp0,$ivec,$rndlast
1553	aesd	$dat0,q15
1554	 veor	$tmp1,$in0,$rndlast
1555	 vld1.8	{$in0},[$inp],#16
1556	aesd	$dat1,q15
1557	 veor	$tmp2,$in1,$rndlast
1558	 vld1.8	{$in1},[$inp],#16
1559	aesd	$dat2,q15
1560	 veor	$tmp3,$in2,$rndlast
1561	 vld1.8	{$in2},[$inp],#16
1562	aesd	$dat3,q15
1563	 veor	$tmp4,$in3,$rndlast
1564	 vld1.8	{$in3},[$inp],#16
1565	aesd	$dat4,q15
1566	 vorr	$ivec,$in4,$in4
1567	 vld1.8	{$in4},[$inp],#16
1568	cbz	x6,.Lcbc_tail4x
1569	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
1570	veor	$tmp0,$tmp0,$dat0
1571	 vorr	$dat0,$in0,$in0
1572	veor	$tmp1,$tmp1,$dat1
1573	 vorr	$dat1,$in1,$in1
1574	veor	$tmp2,$tmp2,$dat2
1575	 vorr	$dat2,$in2,$in2
1576	veor	$tmp3,$tmp3,$dat3
1577	 vorr	$dat3,$in3,$in3
1578	veor	$tmp4,$tmp4,$dat4
1579	vst1.8	{$tmp0},[$out],#16
1580	 vorr	$dat4,$in4,$in4
1581	vst1.8	{$tmp1},[$out],#16
1582	 mov	$cnt,$rounds
1583	vst1.8	{$tmp2},[$out],#16
1584	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
1585	vst1.8	{$tmp3},[$out],#16
1586	vst1.8	{$tmp4},[$out],#16
1587	b.hs	.Loop5x_cbc_dec
1588
1589	add	$len,$len,#0x50
1590	cbz	$len,.Lcbc_done
1591
1592	add	$cnt,$rounds,#2
1593	subs	$len,$len,#0x30
1594	vorr	$dat0,$in2,$in2
1595	vorr	$in0,$in2,$in2
1596	vorr	$dat1,$in3,$in3
1597	vorr	$in1,$in3,$in3
1598	vorr	$dat2,$in4,$in4
1599	vorr	$in2,$in4,$in4
1600	b.lo	.Lcbc_dec_tail
1601
1602	b	.Loop3x_cbc_dec
1603
1604.align	4
1605.Lcbc_tail4x:
1606	veor	$tmp1,$tmp0,$dat1
1607	veor	$tmp2,$tmp2,$dat2
1608	veor	$tmp3,$tmp3,$dat3
1609	veor	$tmp4,$tmp4,$dat4
1610	vst1.8	{$tmp1},[$out],#16
1611	vst1.8	{$tmp2},[$out],#16
1612	vst1.8	{$tmp3},[$out],#16
1613	vst1.8	{$tmp4},[$out],#16
1614
1615	b	.Lcbc_done
1616.align	4
1617___
1618$code.=<<___;
1619.Loop3x_cbc_dec:
1620	aesd	$dat0,q8
1621	aesimc	$dat0,$dat0
1622	aesd	$dat1,q8
1623	aesimc	$dat1,$dat1
1624	aesd	$dat2,q8
1625	aesimc	$dat2,$dat2
1626	vld1.32	{q8},[$key_],#16
1627	subs	$cnt,$cnt,#2
1628	aesd	$dat0,q9
1629	aesimc	$dat0,$dat0
1630	aesd	$dat1,q9
1631	aesimc	$dat1,$dat1
1632	aesd	$dat2,q9
1633	aesimc	$dat2,$dat2
1634	vld1.32	{q9},[$key_],#16
1635	b.gt	.Loop3x_cbc_dec
1636
1637	aesd	$dat0,q8
1638	aesimc	$dat0,$dat0
1639	aesd	$dat1,q8
1640	aesimc	$dat1,$dat1
1641	aesd	$dat2,q8
1642	aesimc	$dat2,$dat2
1643	 veor	$tmp0,$ivec,$rndlast
1644	 subs	$len,$len,#0x30
1645	 veor	$tmp1,$in0,$rndlast
1646	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
1647	aesd	$dat0,q9
1648	aesimc	$dat0,$dat0
1649	aesd	$dat1,q9
1650	aesimc	$dat1,$dat1
1651	aesd	$dat2,q9
1652	aesimc	$dat2,$dat2
1653	 veor	$tmp2,$in1,$rndlast
1654	 add	$inp,$inp,x6		// $inp is adjusted in such way that
1655					// at exit from the loop $dat1-$dat2
1656					// are loaded with last "words"
1657	 vorr	$ivec,$in2,$in2
1658	 mov	$key_,$key
1659	aesd	$dat0,q12
1660	aesimc	$dat0,$dat0
1661	aesd	$dat1,q12
1662	aesimc	$dat1,$dat1
1663	aesd	$dat2,q12
1664	aesimc	$dat2,$dat2
1665	 vld1.8	{$in0},[$inp],#16
1666	aesd	$dat0,q13
1667	aesimc	$dat0,$dat0
1668	aesd	$dat1,q13
1669	aesimc	$dat1,$dat1
1670	aesd	$dat2,q13
1671	aesimc	$dat2,$dat2
1672	 vld1.8	{$in1},[$inp],#16
1673	aesd	$dat0,q14
1674	aesimc	$dat0,$dat0
1675	aesd	$dat1,q14
1676	aesimc	$dat1,$dat1
1677	aesd	$dat2,q14
1678	aesimc	$dat2,$dat2
1679	 vld1.8	{$in2},[$inp],#16
1680	aesd	$dat0,q15
1681	aesd	$dat1,q15
1682	aesd	$dat2,q15
1683	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
1684	 add	$cnt,$rounds,#2
1685	veor	$tmp0,$tmp0,$dat0
1686	veor	$tmp1,$tmp1,$dat1
1687	veor	$dat2,$dat2,$tmp2
1688	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
1689	vst1.8	{$tmp0},[$out],#16
1690	 vorr	$dat0,$in0,$in0
1691	vst1.8	{$tmp1},[$out],#16
1692	 vorr	$dat1,$in1,$in1
1693	vst1.8	{$dat2},[$out],#16
1694	 vorr	$dat2,$in2,$in2
1695	b.hs	.Loop3x_cbc_dec
1696
1697	cmn	$len,#0x30
1698	b.eq	.Lcbc_done
1699	nop
1700
1701.Lcbc_dec_tail:
1702	aesd	$dat1,q8
1703	aesimc	$dat1,$dat1
1704	aesd	$dat2,q8
1705	aesimc	$dat2,$dat2
1706	vld1.32	{q8},[$key_],#16
1707	subs	$cnt,$cnt,#2
1708	aesd	$dat1,q9
1709	aesimc	$dat1,$dat1
1710	aesd	$dat2,q9
1711	aesimc	$dat2,$dat2
1712	vld1.32	{q9},[$key_],#16
1713	b.gt	.Lcbc_dec_tail
1714
1715	aesd	$dat1,q8
1716	aesimc	$dat1,$dat1
1717	aesd	$dat2,q8
1718	aesimc	$dat2,$dat2
1719	aesd	$dat1,q9
1720	aesimc	$dat1,$dat1
1721	aesd	$dat2,q9
1722	aesimc	$dat2,$dat2
1723	aesd	$dat1,q12
1724	aesimc	$dat1,$dat1
1725	aesd	$dat2,q12
1726	aesimc	$dat2,$dat2
1727	 cmn	$len,#0x20
1728	aesd	$dat1,q13
1729	aesimc	$dat1,$dat1
1730	aesd	$dat2,q13
1731	aesimc	$dat2,$dat2
1732	 veor	$tmp1,$ivec,$rndlast
1733	aesd	$dat1,q14
1734	aesimc	$dat1,$dat1
1735	aesd	$dat2,q14
1736	aesimc	$dat2,$dat2
1737	 veor	$tmp2,$in1,$rndlast
1738	aesd	$dat1,q15
1739	aesd	$dat2,q15
1740	b.eq	.Lcbc_dec_one
1741	veor	$tmp1,$tmp1,$dat1
1742	veor	$tmp2,$tmp2,$dat2
1743	 vorr	$ivec,$in2,$in2
1744	vst1.8	{$tmp1},[$out],#16
1745	vst1.8	{$tmp2},[$out],#16
1746	b	.Lcbc_done
1747
1748.Lcbc_dec_one:
1749	veor	$tmp1,$tmp1,$dat2
1750	 vorr	$ivec,$in2,$in2
1751	vst1.8	{$tmp1},[$out],#16
1752
1753.Lcbc_done:
1754	vst1.8	{$ivec},[$ivp]
1755.Lcbc_abort:
1756___
1757}
1758$code.=<<___	if ($flavour !~ /64/);
1759	vldmia	sp!,{d8-d15}
1760	ldmia	sp!,{r4-r8,pc}
1761___
1762$code.=<<___	if ($flavour =~ /64/);
1763	ldr	x29,[sp],#16
1764	ret
1765___
1766$code.=<<___;
1767.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1768___
1769}}}
1770
1771{{{
1772my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1773my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
1774my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1775my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
1776my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));
1777
1778# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
1779my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
1780my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
1781my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
1782my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
1783my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
1784my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));
1785
1786#q_X => qX, for ldp & stp
1787my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
1788my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));
1789
1790my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));
1791
1792$code.=<<___	if ($flavour =~ /64/);
1793.globl	${prefix}_ctr32_encrypt_blocks_unroll12_eor3
1794.type	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
1795.align	5
1796${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
1797	AARCH64_VALID_CALL_TARGET
1798	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1799	stp		x29,x30,[sp,#-80]!
1800	stp		d8,d9,[sp, #16]
1801	stp		d10,d11,[sp, #32]
1802	stp		d12,d13,[sp, #48]
1803	stp		d14,d15,[sp, #64]
1804	add		x29,sp,#0
1805
1806	 ldr		$rounds,[$key,#240]
1807
1808	 ldr		$ctr, [$ivp, #12]
1809#ifdef __AARCH64EB__
1810	vld1.8		{$dat0},[$ivp]
1811#else
1812	vld1.32		{$dat0},[$ivp]
1813#endif
1814	vld1.32		{$rndping-$rndpang},[$key]		// load key schedule...
1815	 sub		$rounds,$rounds,#4
1816	 cmp		$len,#2
1817	 add		$key_,$key,$roundsx,lsl#4	// pointer to last round key
1818	 sub		$rounds,$rounds,#2
1819	 add		$key_, $key_, #64
1820	vld1.32		{$rndlast},[$key_]
1821	 add		$key_,$key,#32
1822	 mov		$cnt,$rounds
1823#ifndef __AARCH64EB__
1824	rev		$ctr, $ctr
1825#endif
1826
1827	vorr		$dat1,$dat0,$dat0
1828	 add		$tctr1, $ctr, #1
1829	vorr		$dat2,$dat0,$dat0
1830	 add		$ctr, $ctr, #2
1831	vorr		$ivec,$dat0,$dat0
1832	 rev		$tctr1, $tctr1
1833	vmov.32		${dat1}[3],$tctr1
1834	b.ls		.Lctr32_tail_unroll
1835	 cmp		$len,#6
1836	 rev		$tctr2, $ctr
1837	 sub		$len,$len,#3		// bias
1838	vmov.32		${dat2}[3],$tctr2
1839	b.lo		.Loop3x_ctr32_unroll
1840	 cmp		$len,#9
1841	vorr		$dat3,$dat0,$dat0
1842	 add		$tctr3, $ctr, #1
1843	vorr		$dat4,$dat0,$dat0
1844	 add		$tctr4, $ctr, #2
1845	 rev		$tctr3, $tctr3
1846	vorr		$dat5,$dat0,$dat0
1847	 add		$ctr, $ctr, #3
1848	 rev		$tctr4, $tctr4
1849	vmov.32		${dat3}[3],$tctr3
1850	 rev		$tctr5, $ctr
1851	vmov.32		${dat4}[3],$tctr4
1852	vmov.32		${dat5}[3],$tctr5
1853	 sub		$len,$len,#3
1854	b.lo		.Loop6x_ctr32_unroll
1855
1856	// push regs to stack when 12 data chunks are interleaved
1857	 stp		x19,x20,[sp,#-16]!
1858	 stp		x21,x22,[sp,#-16]!
1859	 stp		x23,x24,[sp,#-16]!
1860	 stp		$dat8d,$dat9d,[sp,#-32]!
1861	 stp		$dat10d,$dat11d,[sp,#-32]!
1862
1863	 add		$tctr6,$ctr,#1
1864	 add		$tctr7,$ctr,#2
1865	 add		$tctr8,$ctr,#3
1866	 add		$tctr9,$ctr,#4
1867	 add		$tctr10,$ctr,#5
1868	 add		$ctr,$ctr,#6
1869	vorr		$dat6,$dat0,$dat0
1870	 rev		$tctr6,$tctr6
1871	vorr		$dat7,$dat0,$dat0
1872	 rev		$tctr7,$tctr7
1873	vorr		$dat8,$dat0,$dat0
1874	 rev		$tctr8,$tctr8
1875	vorr		$dat9,$dat0,$dat0
1876	 rev		$tctr9,$tctr9
1877	vorr		$dat10,$dat0,$dat0
1878	 rev		$tctr10,$tctr10
1879	vorr		$dat11,$dat0,$dat0
1880	 rev		$tctr11,$ctr
1881
1882	 sub		$len,$len,#6		// bias
1883	vmov.32		${dat6}[3],$tctr6
1884	vmov.32		${dat7}[3],$tctr7
1885	vmov.32		${dat8}[3],$tctr8
1886	vmov.32		${dat9}[3],$tctr9
1887	vmov.32		${dat10}[3],$tctr10
1888	vmov.32		${dat11}[3],$tctr11
1889	b		.Loop12x_ctr32_unroll
1890
1891.align	4
1892.Loop12x_ctr32_unroll:
1893	aese		$dat0,$rndping
1894	aesmc		$dat0,$dat0
1895	aese		$dat1,$rndping
1896	aesmc		$dat1,$dat1
1897	aese		$dat2,$rndping
1898	aesmc		$dat2,$dat2
1899	aese		$dat3,$rndping
1900	aesmc		$dat3,$dat3
1901	aese		$dat4,$rndping
1902	aesmc		$dat4,$dat4
1903	aese		$dat5,$rndping
1904	aesmc		$dat5,$dat5
1905	aese		$dat6,$rndping
1906	aesmc		$dat6,$dat6
1907	aese		$dat7,$rndping
1908	aesmc		$dat7,$dat7
1909	aese		$dat8,$rndping
1910	aesmc		$dat8,$dat8
1911	aese		$dat9,$rndping
1912	aesmc		$dat9,$dat9
1913	aese		$dat10,$rndping
1914	aesmc		$dat10,$dat10
1915	aese		$dat11,$rndping
1916	aesmc		$dat11,$dat11
1917	vld1.32		{$rndping},[$key_],#16
1918	subs		$cnt,$cnt,#2
1919	aese		$dat0,$rndpang
1920	aesmc		$dat0,$dat0
1921	aese		$dat1,$rndpang
1922	aesmc		$dat1,$dat1
1923	aese		$dat2,$rndpang
1924	aesmc		$dat2,$dat2
1925	aese		$dat3,$rndpang
1926	aesmc		$dat3,$dat3
1927	aese		$dat4,$rndpang
1928	aesmc		$dat4,$dat4
1929	aese		$dat5,$rndpang
1930	aesmc		$dat5,$dat5
1931	aese		$dat6,$rndpang
1932	aesmc		$dat6,$dat6
1933	aese		$dat7,$rndpang
1934	aesmc		$dat7,$dat7
1935	aese		$dat8,$rndpang
1936	aesmc		$dat8,$dat8
1937	aese		$dat9,$rndpang
1938	aesmc		$dat9,$dat9
1939	aese		$dat10,$rndpang
1940	aesmc		$dat10,$dat10
1941	aese		$dat11,$rndpang
1942	aesmc		$dat11,$dat11
1943	vld1.32		{$rndpang},[$key_],#16
1944	b.gt		.Loop12x_ctr32_unroll
1945
1946	aese		$dat0,$rndping
1947	aesmc		$dat0,$dat0
1948	aese		$dat1,$rndping
1949	aesmc		$dat1,$dat1
1950	aese		$dat2,$rndping
1951	aesmc		$dat2,$dat2
1952	aese		$dat3,$rndping
1953	aesmc		$dat3,$dat3
1954	aese		$dat4,$rndping
1955	aesmc		$dat4,$dat4
1956	aese		$dat5,$rndping
1957	aesmc		$dat5,$dat5
1958	aese		$dat6,$rndping
1959	aesmc		$dat6,$dat6
1960	aese		$dat7,$rndping
1961	aesmc		$dat7,$dat7
1962	aese		$dat8,$rndping
1963	aesmc		$dat8,$dat8
1964	aese		$dat9,$rndping
1965	aesmc		$dat9,$dat9
1966	aese		$dat10,$rndping
1967	aesmc		$dat10,$dat10
1968	aese		$dat11,$rndping
1969	aesmc		$dat11,$dat11
1970	vld1.32	 	{$rndping},[$key_],#16
1971
1972	aese		$dat0,$rndpang
1973	aesmc		$dat0,$dat0
1974	aese		$dat1,$rndpang
1975	aesmc		$dat1,$dat1
1976	aese		$dat2,$rndpang
1977	aesmc		$dat2,$dat2
1978	aese		$dat3,$rndpang
1979	aesmc		$dat3,$dat3
1980	aese		$dat4,$rndpang
1981	aesmc		$dat4,$dat4
1982	aese		$dat5,$rndpang
1983	aesmc		$dat5,$dat5
1984	aese		$dat6,$rndpang
1985	aesmc		$dat6,$dat6
1986	aese		$dat7,$rndpang
1987	aesmc		$dat7,$dat7
1988	aese		$dat8,$rndpang
1989	aesmc		$dat8,$dat8
1990	aese		$dat9,$rndpang
1991	aesmc		$dat9,$dat9
1992	aese		$dat10,$rndpang
1993	aesmc		$dat10,$dat10
1994	aese		$dat11,$rndpang
1995	aesmc		$dat11,$dat11
1996	vld1.32	 	{$rndpang},[$key_],#16
1997
1998	aese		$dat0,$rndping
1999	aesmc		$dat0,$dat0
2000	 add		$tctr0,$ctr,#1
2001	 add		$tctr1,$ctr,#2
2002	aese		$dat1,$rndping
2003	aesmc		$dat1,$dat1
2004	 add		$tctr2,$ctr,#3
2005	 add		$tctr3,$ctr,#4
2006	aese		$dat2,$rndping
2007	aesmc		$dat2,$dat2
2008	 add		$tctr4,$ctr,#5
2009	 add		$tctr5,$ctr,#6
2010	 rev		$tctr0,$tctr0
2011	aese		$dat3,$rndping
2012	aesmc		$dat3,$dat3
2013	 add		$tctr6,$ctr,#7
2014	 add		$tctr7,$ctr,#8
2015	 rev		$tctr1,$tctr1
2016	 rev		$tctr2,$tctr2
2017	aese		$dat4,$rndping
2018	aesmc		$dat4,$dat4
2019	 add		$tctr8,$ctr,#9
2020	 add		$tctr9,$ctr,#10
2021	 rev		$tctr3,$tctr3
2022	 rev		$tctr4,$tctr4
2023	aese		$dat5,$rndping
2024	aesmc		$dat5,$dat5
2025	 add		$tctr10,$ctr,#11
2026	 add		$tctr11,$ctr,#12
2027	 rev		$tctr5,$tctr5
2028	 rev		$tctr6,$tctr6
2029	aese		$dat6,$rndping
2030	aesmc		$dat6,$dat6
2031	 rev		$tctr7,$tctr7
2032	 rev		$tctr8,$tctr8
2033	aese		$dat7,$rndping
2034	aesmc		$dat7,$dat7
2035	 rev		$tctr9,$tctr9
2036	 rev		$tctr10,$tctr10
2037	aese		$dat8,$rndping
2038	aesmc		$dat8,$dat8
2039	 rev		$tctr11,$tctr11
2040	aese		$dat9,$rndping
2041	aesmc		$dat9,$dat9
2042	aese		$dat10,$rndping
2043	aesmc		$dat10,$dat10
2044	aese		$dat11,$rndping
2045	aesmc		$dat11,$dat11
2046	vld1.32	 	{$rndping},[$key_],#16
2047
2048	aese		$dat0,$rndpang
2049	aesmc		$dat0,$dat0
2050	aese		$dat1,$rndpang
2051	aesmc		$dat1,$dat1
2052	aese		$dat2,$rndpang
2053	aesmc		$dat2,$dat2
2054	aese		$dat3,$rndpang
2055	aesmc		$dat3,$dat3
2056	vld1.8		{$in0,$in1,$in2,$in3},[$inp],#64
2057	aese		$dat4,$rndpang
2058	aesmc		$dat4,$dat4
2059	aese		$dat5,$rndpang
2060	aesmc		$dat5,$dat5
2061	aese		$dat6,$rndpang
2062	aesmc		$dat6,$dat6
2063	aese		$dat7,$rndpang
2064	aesmc		$dat7,$dat7
2065	vld1.8		{$in4,$in5,$in6,$in7},[$inp],#64
2066	aese		$dat8,$rndpang
2067	aesmc		$dat8,$dat8
2068	aese		$dat9,$rndpang
2069	aesmc		$dat9,$dat9
2070	aese		$dat10,$rndpang
2071	aesmc		$dat10,$dat10
2072	aese		$dat11,$rndpang
2073	aesmc		$dat11,$dat11
2074	vld1.8		{$in8,$in9,$in10,$in11},[$inp],#64
2075	vld1.32	 	{$rndpang},[$key_],#16
2076
2077	 mov		$key_, $key
2078	aese		$dat0,$rndping
2079	aesmc		$dat0,$dat0
2080	aese		$dat1,$rndping
2081	aesmc		$dat1,$dat1
2082	aese		$dat2,$rndping
2083	aesmc		$dat2,$dat2
2084	aese		$dat3,$rndping
2085	aesmc		$dat3,$dat3
2086	aese		$dat4,$rndping
2087	aesmc		$dat4,$dat4
2088	aese		$dat5,$rndping
2089	aesmc		$dat5,$dat5
2090	aese		$dat6,$rndping
2091	aesmc		$dat6,$dat6
2092	aese		$dat7,$rndping
2093	aesmc		$dat7,$dat7
2094	aese		$dat8,$rndping
2095	aesmc		$dat8,$dat8
2096	aese		$dat9,$rndping
2097	aesmc		$dat9,$dat9
2098	aese		$dat10,$rndping
2099	aesmc		$dat10,$dat10
2100	aese		$dat11,$rndping
2101	aesmc		$dat11,$dat11
2102	vld1.32	 	{$rndping},[$key_],#16	// re-pre-load rndkey[0]
2103
2104	aese		$dat0,$rndpang
2105	 eor3		$in0,$in0,$rndlast,$dat0
2106	vorr		$dat0,$ivec,$ivec
2107	aese		$dat1,$rndpang
2108	 eor3		$in1,$in1,$rndlast,$dat1
2109	vorr		$dat1,$ivec,$ivec
2110	aese		$dat2,$rndpang
2111	 eor3		$in2,$in2,$rndlast,$dat2
2112	vorr		$dat2,$ivec,$ivec
2113	aese		$dat3,$rndpang
2114	 eor3		$in3,$in3,$rndlast,$dat3
2115	vorr		$dat3,$ivec,$ivec
2116	aese		$dat4,$rndpang
2117	 eor3		$in4,$in4,$rndlast,$dat4
2118	vorr		$dat4,$ivec,$ivec
2119	aese		$dat5,$rndpang
2120	 eor3		$in5,$in5,$rndlast,$dat5
2121	vorr		$dat5,$ivec,$ivec
2122	aese		$dat6,$rndpang
2123	 eor3		$in6,$in6,$rndlast,$dat6
2124	vorr		$dat6,$ivec,$ivec
2125	aese		$dat7,$rndpang
2126	 eor3		$in7,$in7,$rndlast,$dat7
2127	vorr		$dat7,$ivec,$ivec
2128	aese		$dat8,$rndpang
2129	 eor3		$in8,$in8,$rndlast,$dat8
2130	vorr		$dat8,$ivec,$ivec
2131	aese		$dat9,$rndpang
2132	 eor3		$in9,$in9,$rndlast,$dat9
2133	vorr		$dat9,$ivec,$ivec
2134	aese		$dat10,$rndpang
2135	 eor3		$in10,$in10,$rndlast,$dat10
2136	vorr		$dat10,$ivec,$ivec
2137	aese		$dat11,$rndpang
2138	 eor3		$in11,$in11,$rndlast,$dat11
2139	vorr		$dat11,$ivec,$ivec
2140	vld1.32	 	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
2141
2142	vmov.32		${dat0}[3],$tctr0
2143	vmov.32		${dat1}[3],$tctr1
2144	vmov.32		${dat2}[3],$tctr2
2145	vmov.32		${dat3}[3],$tctr3
2146	vst1.8		{$in0,$in1,$in2,$in3},[$out],#64
2147	vmov.32		${dat4}[3],$tctr4
2148	vmov.32		${dat5}[3],$tctr5
2149	vmov.32		${dat6}[3],$tctr6
2150	vmov.32		${dat7}[3],$tctr7
2151	vst1.8		{$in4,$in5,$in6,$in7},[$out],#64
2152	vmov.32		${dat8}[3],$tctr8
2153	vmov.32		${dat9}[3],$tctr9
2154	vmov.32		${dat10}[3],$tctr10
2155	vmov.32		${dat11}[3],$tctr11
2156	vst1.8		{$in8,$in9,$in10,$in11},[$out],#64
2157
2158	 mov		$cnt,$rounds
2159
2160	 add		$ctr,$ctr,#12
2161	subs		$len,$len,#12
2162	b.hs		.Loop12x_ctr32_unroll
2163
2164	// pop regs from stack when 12 data chunks are interleaved
2165	 ldp		$dat10d,$dat11d,[sp],#32
2166	 ldp		$dat8d,$dat9d,[sp],#32
2167	 ldp		x23,x24,[sp],#16
2168	 ldp		x21,x22,[sp],#16
2169	 ldp		x19,x20,[sp],#16
2170
2171	 add		$len,$len,#12
2172	 cbz		$len,.Lctr32_done_unroll
2173	 sub		$ctr,$ctr,#12
2174
2175	 cmp		$len,#2
2176	b.ls		.Lctr32_tail_unroll
2177
2178	 cmp		$len,#6
2179	 sub		$len,$len,#3		// bias
2180	 add		$ctr,$ctr,#3
2181	b.lo		.Loop3x_ctr32_unroll
2182
2183	 sub		$len,$len,#3
2184	 add		$ctr,$ctr,#3
2185	b.lo		.Loop6x_ctr32_unroll
2186
2187.align	4
2188.Loop6x_ctr32_unroll:
2189	aese		$dat0,$rndping
2190	aesmc		$dat0,$dat0
2191	aese		$dat1,$rndping
2192	aesmc		$dat1,$dat1
2193	aese		$dat2,$rndping
2194	aesmc		$dat2,$dat2
2195	aese		$dat3,$rndping
2196	aesmc		$dat3,$dat3
2197	aese		$dat4,$rndping
2198	aesmc		$dat4,$dat4
2199	aese		$dat5,$rndping
2200	aesmc		$dat5,$dat5
2201	vld1.32		{$rndping},[$key_],#16
2202	subs		$cnt,$cnt,#2
2203	aese		$dat0,$rndpang
2204	aesmc		$dat0,$dat0
2205	aese		$dat1,$rndpang
2206	aesmc		$dat1,$dat1
2207	aese		$dat2,$rndpang
2208	aesmc		$dat2,$dat2
2209	aese		$dat3,$rndpang
2210	aesmc		$dat3,$dat3
2211	aese		$dat4,$rndpang
2212	aesmc		$dat4,$dat4
2213	aese		$dat5,$rndpang
2214	aesmc		$dat5,$dat5
2215	vld1.32		{$rndpang},[$key_],#16
2216	b.gt		.Loop6x_ctr32_unroll
2217
2218	aese		$dat0,$rndping
2219	aesmc		$dat0,$dat0
2220	aese		$dat1,$rndping
2221	aesmc		$dat1,$dat1
2222	aese		$dat2,$rndping
2223	aesmc		$dat2,$dat2
2224	aese		$dat3,$rndping
2225	aesmc		$dat3,$dat3
2226	aese		$dat4,$rndping
2227	aesmc		$dat4,$dat4
2228	aese		$dat5,$rndping
2229	aesmc		$dat5,$dat5
2230	vld1.32	 	{$rndping},[$key_],#16
2231
2232	aese		$dat0,$rndpang
2233	aesmc		$dat0,$dat0
2234	aese		$dat1,$rndpang
2235	aesmc		$dat1,$dat1
2236	aese		$dat2,$rndpang
2237	aesmc		$dat2,$dat2
2238	aese		$dat3,$rndpang
2239	aesmc		$dat3,$dat3
2240	aese		$dat4,$rndpang
2241	aesmc		$dat4,$dat4
2242	aese		$dat5,$rndpang
2243	aesmc		$dat5,$dat5
2244	vld1.32	 	{$rndpang},[$key_],#16
2245
2246	aese		$dat0,$rndping
2247	aesmc		$dat0,$dat0
2248	 add		$tctr0,$ctr,#1
2249	 add		$tctr1,$ctr,#2
2250	aese		$dat1,$rndping
2251	aesmc		$dat1,$dat1
2252	 add		$tctr2,$ctr,#3
2253	 add		$tctr3,$ctr,#4
2254	aese		$dat2,$rndping
2255	aesmc		$dat2,$dat2
2256	 add		$tctr4,$ctr,#5
2257	 add		$tctr5,$ctr,#6
2258	 rev		$tctr0,$tctr0
2259	aese		$dat3,$rndping
2260	aesmc		$dat3,$dat3
2261	 rev		$tctr1,$tctr1
2262	 rev		$tctr2,$tctr2
2263	aese		$dat4,$rndping
2264	aesmc		$dat4,$dat4
2265	 rev		$tctr3,$tctr3
2266	 rev		$tctr4,$tctr4
2267	aese		$dat5,$rndping
2268	aesmc		$dat5,$dat5
2269	 rev		$tctr5,$tctr5
2270	vld1.32	 	{$rndping},[$key_],#16
2271
2272	aese		$dat0,$rndpang
2273	aesmc		$dat0,$dat0
2274	aese		$dat1,$rndpang
2275	aesmc		$dat1,$dat1
2276	vld1.8		{$in0,$in1,$in2,$in3},[$inp],#64
2277	aese		$dat2,$rndpang
2278	aesmc		$dat2,$dat2
2279	aese		$dat3,$rndpang
2280	aesmc		$dat3,$dat3
2281	vld1.8		{$in4,$in5},[$inp],#32
2282	aese		$dat4,$rndpang
2283	aesmc		$dat4,$dat4
2284	aese		$dat5,$rndpang
2285	aesmc		$dat5,$dat5
2286	vld1.32	 	{$rndpang},[$key_],#16
2287
2288	 mov		$key_, $key
2289	aese		$dat0,$rndping
2290	aesmc		$dat0,$dat0
2291	aese		$dat1,$rndping
2292	aesmc		$dat1,$dat1
2293	aese		$dat2,$rndping
2294	aesmc		$dat2,$dat2
2295	aese		$dat3,$rndping
2296	aesmc		$dat3,$dat3
2297	aese		$dat4,$rndping
2298	aesmc		$dat4,$dat4
2299	aese		$dat5,$rndping
2300	aesmc		$dat5,$dat5
2301	vld1.32	 	{$rndping},[$key_],#16	// re-pre-load rndkey[0]
2302
2303	aese		$dat0,$rndpang
2304	 eor3		$in0,$in0,$rndlast,$dat0
2305	aese		$dat1,$rndpang
2306	 eor3		$in1,$in1,$rndlast,$dat1
2307	aese		$dat2,$rndpang
2308	 eor3		$in2,$in2,$rndlast,$dat2
2309	aese		$dat3,$rndpang
2310	 eor3		$in3,$in3,$rndlast,$dat3
2311	aese		$dat4,$rndpang
2312	 eor3		$in4,$in4,$rndlast,$dat4
2313	aese		$dat5,$rndpang
2314	 eor3		$in5,$in5,$rndlast,$dat5
2315	vld1.32	 	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
2316
2317	vorr		$dat0,$ivec,$ivec
2318	vorr		$dat1,$ivec,$ivec
2319	vorr		$dat2,$ivec,$ivec
2320	vorr		$dat3,$ivec,$ivec
2321	vorr		$dat4,$ivec,$ivec
2322	vorr		$dat5,$ivec,$ivec
2323
2324	vmov.32		${dat0}[3],$tctr0
2325	vmov.32		${dat1}[3],$tctr1
2326	vst1.8		{$in0,$in1,$in2,$in3},[$out],#64
2327	vmov.32		${dat2}[3],$tctr2
2328	vmov.32		${dat3}[3],$tctr3
2329	vst1.8		{$in4,$in5},[$out],#32
2330	vmov.32		${dat4}[3],$tctr4
2331	vmov.32		${dat5}[3],$tctr5
2332
2333	 cbz		$len,.Lctr32_done_unroll
2334	 mov		$cnt,$rounds
2335
2336	 cmp		$len,#2
2337	b.ls		.Lctr32_tail_unroll
2338
2339	 sub		$len,$len,#3		// bias
2340	 add		$ctr,$ctr,#3
2341	 b		.Loop3x_ctr32_unroll
2342
2343.align	4
2344.Loop3x_ctr32_unroll:
2345	aese		$dat0,$rndping
2346	aesmc		$dat0,$dat0
2347	aese		$dat1,$rndping
2348	aesmc		$dat1,$dat1
2349	aese		$dat2,$rndping
2350	aesmc		$dat2,$dat2
2351	vld1.32		{$rndping},[$key_],#16
2352	subs		$cnt,$cnt,#2
2353	aese		$dat0,$rndpang
2354	aesmc		$dat0,$dat0
2355	aese		$dat1,$rndpang
2356	aesmc		$dat1,$dat1
2357	aese		$dat2,$rndpang
2358	aesmc		$dat2,$dat2
2359	vld1.32		{$rndpang},[$key_],#16
2360	b.gt		.Loop3x_ctr32_unroll
2361
2362	aese		$dat0,$rndping
2363	aesmc		$tmp0,$dat0
2364	aese		$dat1,$rndping
2365	aesmc		$tmp1,$dat1
2366	vld1.8		{$in0,$in1,$in2},[$inp],#48
2367	vorr		$dat0,$ivec,$ivec
2368	aese		$dat2,$rndping
2369	aesmc		$dat2,$dat2
2370	vld1.32		{$rndping},[$key_],#16
2371	vorr		$dat1,$ivec,$ivec
2372	aese		$tmp0,$rndpang
2373	aesmc		$tmp0,$tmp0
2374	aese		$tmp1,$rndpang
2375	aesmc		$tmp1,$tmp1
2376	aese		$dat2,$rndpang
2377	aesmc		$tmp2,$dat2
2378	vld1.32		{$rndpang},[$key_],#16
2379	vorr		$dat2,$ivec,$ivec
2380	 add		$tctr0,$ctr,#1
2381	aese		$tmp0,$rndping
2382	aesmc		$tmp0,$tmp0
2383	aese		$tmp1,$rndping
2384	aesmc		$tmp1,$tmp1
2385	 add		$tctr1,$ctr,#2
2386	aese		$tmp2,$rndping
2387	aesmc		$tmp2,$tmp2
2388	vld1.32		{$rndping},[$key_],#16
2389	 add		$ctr,$ctr,#3
2390	aese		$tmp0,$rndpang
2391	aesmc		$tmp0,$tmp0
2392	aese		$tmp1,$rndpang
2393	aesmc		$tmp1,$tmp1
2394
2395	 rev		$tctr0,$tctr0
2396	aese		$tmp2,$rndpang
2397	aesmc		$tmp2,$tmp2
2398	vld1.32		{$rndpang},[$key_],#16
2399	vmov.32		${dat0}[3], $tctr0
2400	 mov		$key_,$key
2401	 rev		$tctr1,$tctr1
2402	aese		$tmp0,$rndping
2403	aesmc		$tmp0,$tmp0
2404
2405	aese		$tmp1,$rndping
2406	aesmc		$tmp1,$tmp1
2407	vmov.32		${dat1}[3], $tctr1
2408	 rev		$tctr2,$ctr
2409	aese		$tmp2,$rndping
2410	aesmc		$tmp2,$tmp2
2411	vmov.32		${dat2}[3], $tctr2
2412
2413	aese		$tmp0,$rndpang
2414	aese		$tmp1,$rndpang
2415	aese		$tmp2,$rndpang
2416
2417	 eor3		$in0,$in0,$rndlast,$tmp0
2418	vld1.32		{$rndping},[$key_],#16	// re-pre-load rndkey[0]
2419	 eor3		$in1,$in1,$rndlast,$tmp1
2420	 mov		$cnt,$rounds
2421	 eor3		$in2,$in2,$rndlast,$tmp2
2422	vld1.32		{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
2423	vst1.8		{$in0,$in1,$in2},[$out],#48
2424
2425	 cbz		$len,.Lctr32_done_unroll
2426
2427.Lctr32_tail_unroll:
2428	 cmp		$len,#1
2429	b.eq		.Lctr32_tail_1_unroll
2430
2431.Lctr32_tail_2_unroll:
2432	aese		$dat0,$rndping
2433	aesmc		$dat0,$dat0
2434	aese		$dat1,$rndping
2435	aesmc		$dat1,$dat1
2436	vld1.32		{$rndping},[$key_],#16
2437	subs		$cnt,$cnt,#2
2438	aese		$dat0,$rndpang
2439	aesmc		$dat0,$dat0
2440	aese		$dat1,$rndpang
2441	aesmc		$dat1,$dat1
2442	vld1.32		{$rndpang},[$key_],#16
2443	b.gt		.Lctr32_tail_2_unroll
2444
2445	aese		$dat0,$rndping
2446	aesmc		$dat0,$dat0
2447	aese		$dat1,$rndping
2448	aesmc		$dat1,$dat1
2449	vld1.32		{$rndping},[$key_],#16
2450	aese		$dat0,$rndpang
2451	aesmc		$dat0,$dat0
2452	aese		$dat1,$rndpang
2453	aesmc		$dat1,$dat1
2454	vld1.32		{$rndpang},[$key_],#16
2455	vld1.8		{$in0,$in1},[$inp],#32
2456	aese		$dat0,$rndping
2457	aesmc		$dat0,$dat0
2458	aese		$dat1,$rndping
2459	aesmc		$dat1,$dat1
2460	vld1.32		{$rndping},[$key_],#16
2461	aese		$dat0,$rndpang
2462	aesmc		$dat0,$dat0
2463	aese		$dat1,$rndpang
2464	aesmc		$dat1,$dat1
2465	vld1.32		{$rndpang},[$key_],#16
2466	aese		$dat0,$rndping
2467	aesmc		$dat0,$dat0
2468	aese		$dat1,$rndping
2469	aesmc		$dat1,$dat1
2470	aese		$dat0,$rndpang
2471	aese		$dat1,$rndpang
2472
2473	 eor3		$in0,$in0,$rndlast,$dat0
2474	 eor3		$in1,$in1,$rndlast,$dat1
2475	vst1.8		{$in0,$in1},[$out],#32
2476	 b		.Lctr32_done_unroll
2477
2478.Lctr32_tail_1_unroll:
2479	aese		$dat0,$rndping
2480	aesmc		$dat0,$dat0
2481	vld1.32		{$rndping},[$key_],#16
2482	subs		$cnt,$cnt,#2
2483	aese		$dat0,$rndpang
2484	aesmc		$dat0,$dat0
2485	vld1.32		{$rndpang},[$key_],#16
2486	b.gt		.Lctr32_tail_1_unroll
2487
2488	aese		$dat0,$rndping
2489	aesmc		$dat0,$dat0
2490	vld1.32		{$rndping},[$key_],#16
2491	aese		$dat0,$rndpang
2492	aesmc		$dat0,$dat0
2493	vld1.32		{$rndpang},[$key_],#16
2494	vld1.8		{$in0},[$inp]
2495	aese		$dat0,$rndping
2496	aesmc		$dat0,$dat0
2497	vld1.32		{$rndping},[$key_],#16
2498	aese		$dat0,$rndpang
2499	aesmc		$dat0,$dat0
2500	vld1.32		{$rndpang},[$key_],#16
2501	aese		$dat0,$rndping
2502	aesmc		$dat0,$dat0
2503	aese		$dat0,$rndpang
2504
2505	 eor3		$in0,$in0,$rndlast,$dat0
2506	vst1.8		{$in0},[$out],#16
2507
2508.Lctr32_done_unroll:
2509	ldp		d8,d9,[sp, #16]
2510	ldp		d10,d11,[sp, #32]
2511	ldp		d12,d13,[sp, #48]
2512	ldp		d14,d15,[sp, #64]
2513	ldr		x29,[sp],#80
2514	ret
2515.size	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
2516___
2517}}}
2518
2519{{{
2520my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
2521my ($rounds,$cnt,$key_)=("w5","w6","x7");
2522my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
2523my $step="x12";		# aliases with $tctr2
2524
2525my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
2526my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2527
2528# used only in 64-bit mode...
2529my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
2530
2531my ($dat,$tmp)=($dat0,$tmp0);
2532
2533### q8-q15	preloaded key schedule
2534
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
# 64-bit prologue: only a frame record is needed; no callee-saved NEON
# registers are touched on this path.
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
# 32-bit prologue: save r4-r10,lr and d8-d15, then fetch the fifth
# argument ($ivp) from the caller's stack.
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
# Shared setup: load the IV block and the 32-bit counter word from its
# last lane, preload the first two and the last five round keys, and
# byte-swap the counter on little-endian so it can be incremented as an
# ordinary integer.
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
___
# Materialize the first three counter blocks.  The 64-bit flavour keeps
# a pristine copy of the counter block in $ivec and patches lane 3 of
# $dat1/$dat2 with the incremented, re-byte-swapped counter.
$code.=<<___	if ($flavour =~ /64/);
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
___
# The 32-bit flavour updates the counter lane in $ivec itself and then
# copies the whole vector (fewer spare vector registers).
$code.=<<___	if ($flavour !~ /64/);
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
___
# AArch64 only: a 5x-interleaved path used while at least five (biased)
# blocks remain; q16-q23 supply the extra block registers.
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#32
	b.lo		.Loop3x_ctr32

	add		w13,$ctr,#1
	add		w14,$ctr,#2
	vorr		$dat3,$dat0,$dat0
	rev		w13,w13
	vorr		$dat4,$dat0,$dat0
	rev		w14,w14
	vmov.32		${dat3}[3],w13
	sub		$len,$len,#2		// bias
	vmov.32		${dat4}[3],w14
	add		$ctr,$ctr,#2
	b		.Loop5x_ctr32

// Two AES rounds per pass over all five blocks; all but the last four
// rounds are consumed here.
.align	4
.Loop5x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop5x_ctr32

// Last four rounds (q8,q9,q12..q15), interleaved with computing the
// next five counter values, loading input and applying the last round
// key to it.
	mov		$key_,$key
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32	 	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32	 	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese		$dat0,q12
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		w13,$ctr,#4
	aese		$dat2,q12
	aesmc		$dat2,$dat2
	 add		w14,$ctr,#5
	 rev		$tctr0,$tctr0
	aese		$dat3,q12
	aesmc		$dat3,$dat3
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,q12
	aesmc		$dat4,$dat4
	 rev		w13,w13
	 rev		w14,w14

	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	aese		$dat2,q13
	aesmc		$dat2,$dat2
	aese		$dat3,q13
	aesmc		$dat3,$dat3
	aese		$dat4,q13
	aesmc		$dat4,$dat4

	aese		$dat0,q14
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat2,q14
	aesmc		$dat2,$dat2
	 vld1.8		{$in2},[$inp],#16
	aese		$dat3,q14
	aesmc		$dat3,$dat3
	 vld1.8		{$in3},[$inp],#16
	aese		$dat4,q14
	aesmc		$dat4,$dat4
	 vld1.8		{$in4},[$inp],#16

	aese		$dat0,q15
	 veor		$in0,$in0,$rndlast
	aese		$dat1,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat2,q15
	 veor		$in2,$in2,$rndlast
	aese		$dat3,q15
	 veor		$in3,$in3,$rndlast
	aese		$dat4,q15
	 veor		$in4,$in4,$rndlast

// XOR keystream into input, store, and refill $dat0-$dat4 with the
// next counter blocks from the pristine copy in $ivec.
	veor		$in0,$in0,$dat0
	 vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	 vorr		$dat1,$ivec,$ivec
	veor		$in2,$in2,$dat2
	 vorr		$dat2,$ivec,$ivec
	veor		$in3,$in3,$dat3
	 vorr		$dat3,$ivec,$ivec
	veor		$in4,$in4,$dat4
	 vorr		$dat4,$ivec,$ivec

	vst1.8		{$in0},[$out],#16
	 vmov.32	${dat0}[3],$tctr0
	vst1.8		{$in1},[$out],#16
	 vmov.32	${dat1}[3],$tctr1
	vst1.8		{$in2},[$out],#16
	 vmov.32	${dat2}[3],$tctr2
	vst1.8		{$in3},[$out],#16
	 vmov.32	${dat3}[3],w13
	vst1.8		{$in4},[$out],#16
	 vmov.32	${dat4}[3],w14

	mov		$cnt,$rounds
	cbz		$len,.Lctr32_done

	add		$ctr,$ctr,#5
	subs		$len,$len,#5
	b.hs		.Loop5x_ctr32

	add		$len,$len,#5
	sub		$ctr,$ctr,#5

	cmp		$len,#2
	mov		$step,#16
	cclr		$step,lo
	b.ls		.Lctr32_tail

	sub		$len,$len,#3		// bias
	add		$ctr,$ctr,#3
___
# Three-block path, used by both flavours; same two-rounds-per-pass
# structure as the 5x loop above.
$code.=<<___;
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
___
# Final rounds of the 3x path.  The 64-bit flavour refills $dat0-$dat2
# from the pristine $ivec and patches their counter lanes; the 32-bit
# flavour updates lane 3 of $ivec itself and copies it.  Both are
# interleaved with the remaining AES rounds and input loads.
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 rev		$tctr0,$tctr0
___
$code.=<<___;
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___;
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
___
$code.=<<___;
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
___
# Store the three finished blocks, then handle the 1-/2-block tail and
# the flavour-specific epilogues.
$code.=<<___;
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

// Tail: encrypt the remaining one or two counter blocks.  When only one
// block remains $step is zero, so the second load re-reads the same
// input; its result is never stored.
.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Performance in cycles per byte.
# Measured with AES-XTS at different key sizes.
# The values before and after the optimization are shown below
# (before/after):
2975#
2976#		AES-128-XTS		AES-256-XTS
2977# Cortex-A57	3.36/1.09		4.02/1.37
2978# Cortex-A72	3.03/1.02		3.28/1.33
2979
2980# Optimization is implemented by loop unrolling and interleaving.
# Commonly, we choose 5 as the unrolling factor; if the input
# data size is smaller than 5 blocks, but not smaller than 3 blocks,
# we choose 3 as the unrolling factor.
2984# If the input data size dsize >= 5*16 bytes, then take 5 blocks
2985# as one iteration, every loop the left size lsize -= 5*16.
# If lsize < 5*16 bytes, treat them as the tail. Note: a remaining 4*16
# bytes is processed specially; it is integrated into the 5*16-byte
# loop to improve efficiency.
2989# There is one special case, if the original input data size dsize
2990# = 16 bytes, we will treat it separately to improve the
2991# performance: one independent code block without LR, FP load and
2992# store.
# Encryption will process the (length - tailcnt) bytes as mentioned
# previously, then encrypt the composite block as the second-to-last
# cipher block.
# Decryption will process the (length - tailcnt - 1) bytes as mentioned
# previously, then decrypt the second-to-last cipher block to get the
# last plain block (the tail), and decrypt the composite block as the
# second-to-last plain-text block.
3000
3001{{{
3002my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
3003my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
3004my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
3005my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
3006my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
3007my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
3008my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
3009my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
3010my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
3011
3012my ($tmpin)=("v26.16b");
3013my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
3014
3015# q7	last round key
3016# q10-q15, q7	Last 7 round keys
3017# q8-q9	preloaded round keys except last 7 keys for big size
3018# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
3019
3020
3021my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3022
3023my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
3024my ($dat4,$in4,$tmp4);
3025if ($flavour =~ /64/) {
3026    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3027}
3028
$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_encrypt
.type	${prefix}_xts_encrypt,%function
.align	5
${prefix}_xts_encrypt:
___
# Fast path for exactly one 16-byte block: no stack frame and no cipher
# stealing; encrypt the IV with key2 to get the XEX tweak, then encrypt
# the single block with key1.
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_enc_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_enc_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aese	$dat0,q20
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aese	$dat0,q21
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$dat0,$iv0
	vst1.8	{$dat0},[$out]
	b	.Lxts_enc_final_abort

.align	4
.Lxts_enc_big_size:
___
# Bulk (>16 byte) path: build a stack frame, save the callee-saved
# registers used by the tweak arithmetic, split off the partial tail,
# derive the first two tweaks and preload the key schedule.
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	// tailcnt store the tail value of length%16.
	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_abort
	csel	$step,xzr,$step,eq

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl- iv(low), $ivh - iv(high)
	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
	// Tweak update: 128-bit shift left by one with conditional xor of
	// 0x87, i.e. multiplication by x in GF(2^128) mod x^128+x^7+x^2+x+1.
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// next starting point
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
3177
	// Encryption
	// Stage up to five blocks: keep pre-whitened plaintext copies in
	// $in0-$in4 (used by the tail paths) and tweak-xored blocks in
	// $dat0-$dat4, generating tweaks 3..5 on the way.
.Lxts_enc:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_enc_tail
	veor	$dat,$dat,$iv0			// before encryption, xor with iv
	veor	$dat2,$dat2,$iv1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh


	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2 		// the third block
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	vld1.8	{$dat3},[$inp],#16
	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_enc
3233
.align	4
// 5x-interleaved main loop: all five blocks consume each round key
// together; the last rounds are interleaved with next-tweak generation
// and input prefetch for the following iteration.
.Loop5x_xts_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_xts_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	// $tmpN = rndlast ^ tweakN precomputes the final whitening value
	// for each block while its next-iteration tweak is derived below.
	veor	$tmp0,$rndlast,$iv0
	aese	$dat0,q15
	// The iv for first block of one iteration
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_enc_tail4x
	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_enc
3417
3418
	// If left 4 blocks, borrow the five block's processing.
	// The tweak chain is rotated up by one so the four remaining
	// blocks (already in $in1-$in4) reuse the 5x loop once more.
	cmn	$len,#0x10
	b.ne	.Loop5x_enc_after
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_enc

// Fewer than four blocks remain: undo the bias and dispatch to the
// 3-block outer tail or the 1-/2-block inner tail.
.Loop5x_enc_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_enc_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_enc_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_enc_tail
3447
.align	4
// Exact multiple of five blocks: $tmp0 was already stored by the loop
// exit path; finish whitening and store the remaining four blocks.
.Lxts_enc_tail4x:
	add	$inp,$inp,#16
	veor	$tmp1,$dat1,$tmp1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b	.Lxts_enc_done
.align	4
// Three-block tail: same round structure as the 5x loop but for
// $dat0-$dat2 only, with the next tweak derived from tweak #3.
.Lxts_outer_enc_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_enc_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	//mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo       // x6, w6, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast

	// Rewind $inp so the tail paths re-read the last partial group.
	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset
	mov	$key_,$key1

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	add	$rounds,$rounds0,#2
	vld1.32	{q8},[$key_],#16                // re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16                // re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	cmn	$len,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop
3548
// One- or two-block tail.  Both $dat1 and $dat2 are run through the
// cipher; for a single block only $dat2's result is stored
// (.Lxts_enc_one).  Each exit leaves the next tweak in $iv0/$ivl/$ivh.
.Lxts_inner_enc_tail:
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_enc_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_enc_tail_loop:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_enc_tail_loop

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lxts_enc_one
	veor	$tmp1,$tmp1,$dat1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv1,$iv1
	vst1.8	{$tmp2},[$out],#16
	fmov	$ivl,$ivd10
	fmov	$ivh,$ivd11
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done

.Lxts_enc_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv0,$iv0
	vst1.8	{$tmp1},[$out],#16
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done
.align	5
// Ciphertext stealing: swap the tail bytes of the input with the tail
// of the last full ciphertext block, re-encrypt the composite block
// under the current tweak, then restore registers and return.
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	$tailcnt,#0xf
	b.eq	.Lxts_abort

	mov	$tmpinp,$inp
	mov	$tmpoutp,$out
	sub	$out,$out,#16
.composite_enc_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Encrypt the composite block to get the last second encrypted text block
	ldr	$rounds,[$key1,#240]		// load key schedule...
	vld1.32	{$dat},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
.Loop_final_enc:
	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16
	subs	$rounds,$rounds,#2
	aese	$tmpin,$dat1
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16
	b.gt	.Loop_final_enc

	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aese	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
___

}}}
3679{{{
3680my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
3681my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
3682my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
3683my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
3684my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
3685my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
3686my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
3687my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
3688my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
3689
3690my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
3691
3692# q7	last round key
3693# q10-q15, q7	Last 7 round keys
3694# q8-q9	preloaded round keys except last 7 keys for big size
3695# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
3696
3697{
3698my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3699
3700my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
3701my ($dat4,$in4,$tmp4);
3702if ($flavour =~ /64/) {
3703    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3704}
3705
# AArch64-only: entry point and prologue for ${prefix}_xts_decrypt.
# NOTE(review): the heredoc text is emitted verbatim into the generated
# assembly file; keep instruction text untouched when editing.
$code.=<<___	if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
___
# AArch64-only fast path for inputs of exactly 16 bytes: compute the
# first XEX tweak with $key2, XOR, then run a single-block decryption
# with $key1 (with a dedicated shortcut at .Lxts_128_dec when the key
# schedule has exactly 10 rounds).
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32			// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32			// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16			// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16			// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32			// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32			// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32			// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
# AArch64-only bulk XTS-decrypt path (input > 16 bytes): computes the
# initial tweak with $key2, then decrypts with a 5x-interleaved main
# loop (.Loop5x_xts_dec) plus 3x/2x/1x tail paths, and finally handles
# a non-16-byte-aligned tail via ciphertext stealing
# (.composite_dec_loop).  Tweaks are updated in GPRs ($ivl/$ivh) with
# the 0x87 GF(2^128) feedback constant and moved back with fmov.
# NOTE(review): the heredoc below is emitted verbatim into the generated
# assembly; "$dat" is an alias of "$dat0", so "aesimc $dat0,$dat" in the
# 5x loop emits the same text as "aesimc $dat0,$dat0".
$code.=<<___	if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for second block
	// $ivl- iv(low), $ivh - iv(high)
	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decryt, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// third block xox with third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub $len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec

.align	4
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

	aesd	$dat0,q9
	aesimc	$dat0,$dat
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration.
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec

	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If x2($len) equal to -0x10, the left blocks is 4.
	// After specially processing, utilize the five blocks processing again.
	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done

	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail

.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	tst	$tailcnt,#0xf
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b.eq	.Lxts_dec_abort
	vld1.8	{$dat0},[$inp],#16
	b	.Lxts_done
.align	4
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh

	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data

	mov	$key_,$key1

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32

.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Processing the last two blocks with cipher stealing.
	mov	x7,x3
	cbnz	x2,.Lxts_dec_1st_done
	vld1.8	{$dat0},[$inp],#16

	// Decrypt the last second block to get the last plain text block
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16

	// Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
	// to get the last encrypted block.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the last second plain text block
	ldr	$rounds,[$key_,#240]
	vld1.32	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64

.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
4399}
4400}}}
# Close the preprocessor guard around the generated code.
# NOTE(review): the matching #if is emitted earlier in the file, outside
# this chunk - confirm before restructuring.
$code.=<<___;
#endif
___
4404########################################
4405if ($flavour =~ /64/) {			######## 64-bit code
    # AArch64 base opcodes for instructions that may need to be emitted
    # as raw .inst words (operand fields are OR-ed in by unaes/unsha3).
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800,
	"eor3"	=>	0xce000000,	);
4410
    # Encode a two-operand AES instruction as a raw .inst word
    # (Rd | Rn<<5 OR-ed into the base opcode).  NOTE(review): currently
    # unused - the substitution that would call it is commented out in
    # the post-processing loop below.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
4419
4420    sub unsha3 {
4421		 my ($mnemonic,$arg)=@_;
4422
4423		 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
4424		 &&
4425		 sprintf ".inst\t0x%08x\t//%s %s",
4426			$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
4427			$mnemonic,$arg;
4428    }
4429
    # Post-process the accumulated $code into AArch64 assembly: expand
    # `...` escapes, translate the legacy (ARMv7-flavoured) register
    # names and mnemonics into 64-bit forms, and print the result.
    # The s///-"or" chain is order dependent: for each line the first
    # substitution that matches short-circuits the rest.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo;	# old->new registers
	s/\bq_([0-9]+)\b/"q".$1/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;
	s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
4461} else {				######## 32-bit code
    # ARMv7 NEON base opcodes for the AES instructions, consumed by
    # unaes() below when emitting raw encodings via INST().
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
4465
    # Encode a two-operand AES instruction by hand for assemblers that
    # lack the AES extension.  The register numbers are split across the
    # ARMv7 D/Q encoding fields, and the 32-bit word is emitted byte-wise
    # through the INST() macro.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
4481
4482    sub unvtbl {
4483	my $arg=shift;
4484
4485	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
4486	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
4487		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
4488    }
4489
4490    sub unvdup32 {
4491	my $arg=shift;
4492
4493	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
4494	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
4495    }
4496
4497    sub unvmov32 {
4498	my $arg=shift;
4499
4500	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
4501	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
4502    }
4503
    # Post-process the accumulated $code into ARMv7 assembly: expand
    # `...` escapes, translate 64-bit register names/mnemonics back to
    # the 32-bit forms, and print the result.  As above, the s///-"or"
    # chain is order dependent: the first match per line wins.
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	# Thumb-2 requires an IT block before a conditional mov; emit one.
	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
4529}
4530
# Close STDOUT explicitly so a buffered write failure is reported
# instead of being lost at interpreter exit.
unless (close(STDOUT)) {
    die "error closing STDOUT: $!";
}
4532