#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for
# the 128-bit CBC encrypt case. On Cortex-A57, parallelizable-mode
# performance seems to be limited by the sheer amount of NEON
# instructions...
#
# April 2019
#
# Key to the performance of parallelizable modes is round-instruction
# interleaving. But which factor to use? There is an optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing the interleave factor doesn't pay off. On the
# cons side we have code size increase and resource waste on
# platforms for which the interleave factor is too high. In other
# words, you want it to be just right. So far an interleave factor
# of 3x has served all platforms well, but for ThunderX2 the optimal
# interleave factor was measured to be 5x...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;
# (**)	numbers after the slash are for 32-bit code, which is
#	3x-interleaved;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";

$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
___
$code.=".arch	armv8-a+crypto\n.text\n"		if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
#ifdef	__thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
#else
.code	32
# define INST(a,b,c,d)	$_byte	a,b,c,d
#endif

.text
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo.
#
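# For example, a line written in the common dialect, such as
#
#	vld1.32	{q8},[x7],#16
#
# is emitted essentially as-is for the 32-bit flavour, while for the
# 64-bit flavour it is rewritten into the equivalent AArch64 form,
# roughly "ld1	{v8.4s},[x7],#16" (an illustrative mapping only; the
# authoritative rules live in the transliteration machinery and in
# arm-xlate.pl).
#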
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

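	// Each .Loop128 iteration derives one round key: vtbl with the
	// rotate-n-splat mask above performs RotWord on the last word of
	// the previous round key (splatted to all lanes), aese with an
	// all-zero round key then amounts to SubBytes (ShiftRows is a
	// no-op on a splatted vector), and the vext/veor cascade xors in
	// the preceding words plus the round constant held in $rcon.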
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

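	// Convert the encryption schedule into the decryption schedule in
	// place: reverse the order of the round keys and apply aesimc
	// (InvMixColumns) to all but the outermost two, as expected by
	// the equivalent-inverse-cipher aesd path.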
.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

# Performance in cycles per byte, processed with AES-ECB and
# different key sizes. Values are shown before and after the
# optimization (before/after):
#
#		AES-128-ECB		AES-192-ECB		AES-256-ECB
# Cortex-A57	1.85/0.82		2.16/0.96		2.47/1.10
# Cortex-A72	1.64/0.85		1.82/0.99		2.13/1.14

# The optimization is implemented by loop unrolling and interleaving,
# dispatched as sketched below.
# Commonly we choose 5 as the unrolling factor; if the input data
# size is smaller than 5 blocks but not smaller than 3 blocks, we
# choose 3 instead.
# If the input data size dsize >= 5*16 bytes, take 5 blocks as one
# iteration; on every iteration the remaining size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration;
# on every iteration lsize -= 3*16.
# If lsize < 3*16 bytes, treat it as the tail and interleave the
# AES instructions for the remaining (up to two) blocks.
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve performance, with an
# independent code block that avoids the LR/FP loads and stores,
# just like the original ECB implementation.

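# A rough sketch of that dispatch in C-like pseudocode (illustrative
# only; process5/process3/process_tail stand for the unrolled paths
# below and are not real symbols):
#
#	while (lsize >= 5*16) { process5();     lsize -= 5*16; }
#	while (lsize >= 3*16) { process3();     lsize -= 3*16; }
#	if (lsize)            { process_tail(); }	// 1 or 2 blocks
#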
{{{
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q7	last round key
### q10-q15, q7	last 7 round keys
### q8-q9	preloaded round keys except last 7 keys for big size
### q5, q6, q8-q9	preloaded round keys except last 7 keys for the 16-byte case only

{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs	$len,$len,#16
	// If the input size is not exactly 16 bytes, jump to the big-size path.
	b.ne    .Lecb_big_size
	vld1.8	{$dat0},[$inp]
	cmp	$enc,#0					// en- or decrypting?
	ldr	$rounds,[$key,#240]
	vld1.32	{q5-q6},[$key],#32			// load key schedule...

	b.eq .Lecb_small_dec
	aese	$dat0,q5
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32			// load key schedule...
	aese	$dat0,q6
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// if rounds==10, jump to aes-128-ecb processing
	b.eq    .Lecb_128_enc
.Lecb_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key],#16				// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key],#16				// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt    .Lecb_round_loop
.Lecb_128_enc:
	vld1.32	{q10-q11},[$key],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_small_dec:
	aesd	$dat0,q5
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key],#32			// load key schedule...
	aesd	$dat0,q6
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10			// bias
	b.eq    .Lecb_128_dec
.Lecb_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key],#16				// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key],#16				// load key schedule...
	subs	$rounds,$rounds,#2			// bias
	b.gt    .Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32	{q10-q11},[$key],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	vst1.8	{$dat0},[$out]
	b	.Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}			@ ABI specification says so
	ldmia	ip,{r4-r5}			@ load remaining args
	subs	$len,$len,#16
___
$code.=<<___;
	mov	$step,#16
	b.lo	.Lecb_done
	cclr	$step,eq

	cmp	$enc,#0					// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]				// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4			// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lecb_dec

	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32				// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_enc_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_enc

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32				// bias
	mov	$cnt,$rounds

.Loop5x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	cmp	$len,#0x40					// because .Lecb_enc_tail4x
	sub	$len,$len,#0x50

	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	x6,xzr,$len,gt			// borrow x6, $cnt; "gt" is not a typo
	mov	$key_,$key

	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,x6				// $inp is adjusted in such a way that
							// at exit from the loop $dat1-$dat4
							// are loaded with the last "words"
	add	x6,$len,#0x60				// because .Lecb_enc_tail4x

	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4

	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4

	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4

	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4

	aese	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_enc_tail4x
	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_enc

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_enc_tail

	b	.Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len				// x6, $cnt, is zero at this point
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	add	$inp,$inp,x6			// $inp is adjusted in such a way that
						// at exit from the loop $dat1-$dat2
						// are loaded with the last "words"
	mov	$key_,$key
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat0,q15
	aese	$dat1,q15
	aese	$dat2,q15
	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32 {q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_enc

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_enc_tail:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_enc_tail

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lecb_enc_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_enc_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	b	.Lecb_done
___

$code.=<<___;
.align	5
.Lecb_dec:
	vld1.8	{$dat1},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat1,$dat1
	vorr	$dat2,$dat1,$dat1
	vorr	$dat1,$dat,$dat
	b.lo	.Lecb_dec_tail

	vorr	$dat1,$in1,$in1
	vld1.8	{$dat2},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ecb_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32				// bias
	mov	$cnt,$rounds

.Loop5x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40				// because .Lecb_tail4x
	sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt; "gt" is not a typo
	mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6				// $inp is adjusted in such a way that
							// at exit from the loop $dat1-$dat4
							// are loaded with the last "words"
	add	x6,$len,#0x60				// because .Lecb_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	aesd	$dat0,q15
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lecb_tail4x
	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
	veor	$tmp0,$rndlast,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$rndlast,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$rndlast,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$rndlast,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_ecb_dec

	add	$len,$len,#0x50
	cbz	$len,.Lecb_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$dat2,$in4,$in4
	b.lo	.Lecb_dec_tail

	b	.Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	veor	$tmp3,$rndlast,$dat3
	veor	$tmp4,$rndlast,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lecb_done
.align	4
___
$code.=<<___;
.Loop3x_ecb_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ecb_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	subs	$len,$len,#0x30
	mov.lo	x6,$len				// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	add	$inp,$inp,x6			// $inp is adjusted in such a way that
						// at exit from the loop $dat1-$dat2
						// are loaded with the last "words"
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32 {q8},[$key_],#16			// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$rndlast,$dat0
	veor	$tmp1,$rndlast,$dat1
	veor	$dat2,$dat2,$rndlast
	vld1.32 {q9},[$key_],#16			// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_ecb_dec

	cmn	$len,#0x30
	b.eq	.Lecb_done
	nop

.Lecb_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lecb_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lecb_dec_one
	veor	$tmp1,$rndlast,$dat1
	veor	$tmp2,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lecb_done

.Lecb_dec_one:
	veor	$tmp1,$rndlast,$dat2
	vst1.8	{$tmp1},[$out],#16

.Lecb_done:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
___
$code.=<<___	if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

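# Unlike CBC encryption above, which is inherently serial, CBC
# decryption parallelizes: each plaintext block depends only on two
# adjacent ciphertext blocks, so the interleaved 3x/5x loops below
# can keep several aesd/aesimc chains in flight and xor the previous
# ciphertext block in afterwards.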
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec

	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	 cmp	$len,#0x40		// because .Lcbc_tail4x
	 sub	$len,$len,#0x50

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	 csel	x6,xzr,$len,gt		// borrow x6, $cnt; "gt" is not a typo
	 mov	$key_,$key

	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	 add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat4
					// are loaded with the last "words"
	 add	x6,$len,#0x60		// because .Lcbc_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	 veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	 veor	$tmp1,$in0,$rndlast
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	 veor	$tmp2,$in1,$rndlast
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	 veor	$tmp3,$in2,$rndlast
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	 veor	$tmp4,$in3,$rndlast
	 vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	 vorr	$ivec,$in4,$in4
	 vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	 vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	 vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	 vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	 mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done

	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

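# As the ctr32 name implies, the counter is treated as a 32-bit
# quantity: only the last word of the counter block is incremented
# (with rev supplying big-endian byte order on little-endian hosts),
# matching the convention of typical consumers such as GCM.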
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8		{$dat0},[$ivp]
#else
	vld1.32		{$dat0},[$ivp]
#endif
	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
___
$code.=<<___	if ($flavour =~ /64/);
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
___
$code.=<<___	if ($flavour =~ /64/);
	cmp		$len,#32
	b.lo		.Loop3x_ctr32

	add		w13,$ctr,#1
	add		w14,$ctr,#2
	vorr		$dat3,$dat0,$dat0
	rev		w13,w13
	vorr		$dat4,$dat0,$dat0
	rev		w14,w14
	vmov.32		${dat3}[3],w13
	sub		$len,$len,#2		// bias
	vmov.32		${dat4}[3],w14
	add		$ctr,$ctr,#2
	b		.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop5x_ctr32

	mov		$key_,$key
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	aese		$dat3,q8
	aesmc		$dat3,$dat3
	aese		$dat4,q8
	aesmc		$dat4,$dat4
	vld1.32	 	{q8},[$key_],#16	// re-pre-load rndkey[0]

	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	aese		$dat3,q9
	aesmc		$dat3,$dat3
	aese		$dat4,q9
	aesmc		$dat4,$dat4
	vld1.32	 	{q9},[$key_],#16	// re-pre-load rndkey[1]

	aese		$dat0,q12
	aesmc		$dat0,$dat0
	 add		$tctr0,$ctr,#1
	 add		$tctr1,$ctr,#2
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 add		$tctr2,$ctr,#3
	 add		w13,$ctr,#4
	aese		$dat2,q12
	aesmc		$dat2,$dat2
	 add		w14,$ctr,#5
	 rev		$tctr0,$tctr0
	aese		$dat3,q12
	aesmc		$dat3,$dat3
	 rev		$tctr1,$tctr1
	 rev		$tctr2,$tctr2
	aese		$dat4,q12
	aesmc		$dat4,$dat4
	 rev		w13,w13
	 rev		w14,w14

	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	aese		$dat2,q13
	aesmc		$dat2,$dat2
	aese		$dat3,q13
	aesmc		$dat3,$dat3
	aese		$dat4,q13
	aesmc		$dat4,$dat4

	aese		$dat0,q14
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat2,q14
	aesmc		$dat2,$dat2
	 vld1.8		{$in2},[$inp],#16
	aese		$dat3,q14
	aesmc		$dat3,$dat3
	 vld1.8		{$in3},[$inp],#16
	aese		$dat4,q14
	aesmc		$dat4,$dat4
	 vld1.8		{$in4},[$inp],#16

	aese		$dat0,q15
	 veor		$in0,$in0,$rndlast
	aese		$dat1,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat2,q15
	 veor		$in2,$in2,$rndlast
	aese		$dat3,q15
	 veor		$in3,$in3,$rndlast
	aese		$dat4,q15
	 veor		$in4,$in4,$rndlast

	veor		$in0,$in0,$dat0
	 vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	 vorr		$dat1,$ivec,$ivec
	veor		$in2,$in2,$dat2
	 vorr		$dat2,$ivec,$ivec
	veor		$in3,$in3,$dat3
	 vorr		$dat3,$ivec,$ivec
	veor		$in4,$in4,$dat4
	 vorr		$dat4,$ivec,$ivec

	vst1.8		{$in0},[$out],#16
	 vmov.32	${dat0}[3],$tctr0
	vst1.8		{$in1},[$out],#16
	 vmov.32	${dat1}[3],$tctr1
	vst1.8		{$in2},[$out],#16
	 vmov.32	${dat2}[3],$tctr2
	vst1.8		{$in3},[$out],#16
	 vmov.32	${dat3}[3],w13
	vst1.8		{$in4},[$out],#16
	 vmov.32	${dat4}[3],w14

	mov		$cnt,$rounds
	cbz		$len,.Lctr32_done

	add		$ctr,$ctr,#5
	subs		$len,$len,#5
	b.hs		.Loop5x_ctr32

	add		$len,$len,#5
	sub		$ctr,$ctr,#5

	cmp		$len,#2
	mov		$step,#16
	cclr		$step,lo
	b.ls		.Lctr32_tail

	sub		$len,$len,#3		// bias
	add		$ctr,$ctr,#3
___
$code.=<<___;
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	 rev		$tctr0,$tctr0
___
$code.=<<___;
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
___
$code.=<<___;
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
___
$code.=<<___;
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
___
$code.=<<___;
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
___
$code.=<<___;
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Performance in cycles per byte, processed with AES-XTS and
# different key sizes. Values are shown before and after the
# optimization (before/after):
#
#		AES-128-XTS		AES-256-XTS
# Cortex-A57	3.36/1.09		4.02/1.37
# Cortex-A72	3.03/1.02		3.28/1.33

# The optimization is implemented by loop unrolling and interleaving.
# Commonly we choose 5 as the unrolling factor; if the input data
# size is smaller than 5 blocks but not smaller than 3 blocks, we
# choose 3 instead.
# If the input data size dsize >= 5*16 bytes, take 5 blocks as one
# iteration; on every iteration the remaining size lsize -= 5*16.
# If lsize < 5*16 bytes, treat it as the tail. Note: a remaining
# 4*16 bytes is handled specially; it is folded into the 5*16-byte
# loop to improve efficiency.
# There is one special case: if the original input data size dsize
# = 16 bytes, we treat it separately to improve performance, with an
# independent code block that avoids the LR/FP loads and stores.
# Encryption processes the (length - tailcnt) bytes as described
# above, then encrypts the composite block as the second-to-last
# cipher block, as sketched below.
# Decryption processes the (length - tailcnt - 16) bytes as described
# above, then decrypts the second-to-last cipher block to get the
# last plain block (the tail), and decrypts the composite block as
# the second-to-last plaintext block.

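# A rough sketch of the ciphertext-stealing layout for encryption
# (illustrative pseudocode only; E() is the per-block XTS operation,
# m is the number of full blocks and P_tail the trailing tailcnt
# bytes):
#
#	for (i = 0; i < m-1; i++) C[i] = E(T[i], P[i]);
#	CC     = E(T[m-1], P[m-1]);		// last full block
#	C[m]   = CC[0..tailcnt-1];		// short final block
#	C[m-1] = E(T[m], P_tail || CC[tailcnt..15]);	// composite
#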
2236{{{
2237my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2238my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2239my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2240my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2241my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2242my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2243my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
2244my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2245my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2246
2247my ($tmpin)=("v26.16b");
2248my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2249
2250# q7	last round key
2251# q10-q15, q7	last 7 round keys
2252# q8-q9	preloaded round keys except the last 7 keys, for the big-size path
2253# q20, q21, q8-q9	preloaded round keys except the last 7 keys, for the 16-byte-only path
2254
2255
2256my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2257
2258my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
2259my ($dat4,$in4,$tmp4);
2260if ($flavour =~ /64/) {
2261    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2262}
2263
2264$code.=<<___	if ($flavour =~ /64/);
2265.globl	${prefix}_xts_encrypt
2266.type	${prefix}_xts_encrypt,%function
2267.align	5
2268${prefix}_xts_encrypt:
2269___
2270$code.=<<___	if ($flavour =~ /64/);
2271	AARCH64_VALID_CALL_TARGET
2272	cmp	$len,#16
2273	// Original input data size other than 16 bytes, jump to big-size processing.
2274	b.ne	.Lxts_enc_big_size
2275	// Encrypt the iv with key2, as the first XEX iv.
2276	ldr	$rounds,[$key2,#240]
2277	vld1.8	{$dat},[$key2],#16
2278	vld1.8	{$iv0},[$ivp]
2279	sub	$rounds,$rounds,#2
2280	vld1.8	{$dat1},[$key2],#16
2281
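	// (added note) each pass of the loop below performs two rounds; the
	// final aese+aesmc and aese+veor with the last-round key are peeled
	// off after the loop, matching the rounds-2 bias above.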
2282.Loop_enc_iv_enc:
2283	aese	$iv0,$dat
2284	aesmc	$iv0,$iv0
2285	vld1.32	{$dat},[$key2],#16
2286	subs	$rounds,$rounds,#2
2287	aese	$iv0,$dat1
2288	aesmc	$iv0,$iv0
2289	vld1.32	{$dat1},[$key2],#16
2290	b.gt	.Loop_enc_iv_enc
2291
2292	aese	$iv0,$dat
2293	aesmc	$iv0,$iv0
2294	vld1.32	{$dat},[$key2]
2295	aese	$iv0,$dat1
2296	veor	$iv0,$iv0,$dat
2297
2298	vld1.8	{$dat0},[$inp]
2299	veor	$dat0,$iv0,$dat0
2300
2301	ldr	$rounds,[$key1,#240]
2302	vld1.32	{q20-q21},[$key1],#32		// load key schedule...
2303
2304	aese	$dat0,q20
2305	aesmc	$dat0,$dat0
2306	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
2307	aese	$dat0,q21
2308	aesmc	$dat0,$dat0
2309	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
2310	b.eq	.Lxts_128_enc
2311.Lxts_enc_round_loop:
2312	aese	$dat0,q8
2313	aesmc	$dat0,$dat0
2314	vld1.32	{q8},[$key1],#16		// load key schedule...
2315	aese	$dat0,q9
2316	aesmc	$dat0,$dat0
2317	vld1.32	{q9},[$key1],#16		// load key schedule...
2318	subs	$rounds,$rounds,#2		// bias
2319	b.gt	.Lxts_enc_round_loop
2320.Lxts_128_enc:
2321	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
2322	aese	$dat0,q8
2323	aesmc	$dat0,$dat0
2324	aese	$dat0,q9
2325	aesmc	$dat0,$dat0
2326	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
2327	aese	$dat0,q10
2328	aesmc	$dat0,$dat0
2329	aese	$dat0,q11
2330	aesmc	$dat0,$dat0
2331	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
2332	aese	$dat0,q12
2333	aesmc	$dat0,$dat0
2334	aese	$dat0,q13
2335	aesmc	$dat0,$dat0
2336	vld1.32	{$rndlast},[$key1]
2337	aese	$dat0,q14
2338	aesmc	$dat0,$dat0
2339	aese	$dat0,q15
2340	veor	$dat0,$dat0,$rndlast
2341	veor	$dat0,$dat0,$iv0
2342	vst1.8	{$dat0},[$out]
2343	b	.Lxts_enc_final_abort
2344
2345.align	4
2346.Lxts_enc_big_size:
2347___
2348$code.=<<___	if ($flavour =~ /64/);
2349	stp	$constnumx,$tmpinp,[sp,#-64]!
2350	stp	$tailcnt,$midnumx,[sp,#48]
2351	stp	$ivd10,$ivd20,[sp,#32]
2352	stp	$ivd30,$ivd40,[sp,#16]
2353
2354	// tailcnt stores the tail length, i.e. length%16.
2355	and	$tailcnt,$len,#0xf
2356	and	$len,$len,#-16
2357	subs	$len,$len,#16
2358	mov	$step,#16
2359	b.lo	.Lxts_abort
2360	csel	$step,xzr,$step,eq
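	// (added note) when exactly one full block remains, zero the step so
	// the first block load below does not advance the input pointer.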
2361
2362	// Firstly, encrypt the iv with key2, as the first iv of XEX.
2363	ldr	$rounds,[$key2,#240]
2364	vld1.32	{$dat},[$key2],#16
2365	vld1.8	{$iv0},[$ivp]
2366	sub	$rounds,$rounds,#2
2367	vld1.32	{$dat1},[$key2],#16
2368
2369.Loop_iv_enc:
2370	aese	$iv0,$dat
2371	aesmc	$iv0,$iv0
2372	vld1.32	{$dat},[$key2],#16
2373	subs	$rounds,$rounds,#2
2374	aese	$iv0,$dat1
2375	aesmc	$iv0,$iv0
2376	vld1.32	{$dat1},[$key2],#16
2377	b.gt	.Loop_iv_enc
2378
2379	aese	$iv0,$dat
2380	aesmc	$iv0,$iv0
2381	vld1.32	{$dat},[$key2]
2382	aese	$iv0,$dat1
2383	veor	$iv0,$iv0,$dat
2384
2385	// The iv for the second block
2386	// $ivl - iv(low), $ivh - iv(high)
2387	// the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4
2388	fmov	$ivl,$ivd00
2389	fmov	$ivh,$ivd01
2390	mov	$constnum,#0x87
2391	extr	$midnumx,$ivh,$ivh,#32
2392	extr	$ivh,$ivh,$ivl,#63
2393	and	$tmpmw,$constnum,$midnum,asr#31
2394	eor	$ivl,$tmpmx,$ivl,lsl#1
2395	fmov	$ivd10,$ivl
2396	fmov	$ivd11,$ivh
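	// (added explanatory note) the sequence above multiplies the tweak
	// by x in GF(2^128) with reduction polynomial x^128+x^7+x^2+x+1:
	// the two extr instructions shift the 128-bit value left by one
	// bit, and 0x87 is xored into the low byte (the and picks 0x87
	// exactly when the shifted-out top bit was set).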
2397
2398	ldr	$rounds0,[$key1,#240]		// next starting point
2399	vld1.8	{$dat},[$inp],$step
2400
2401	vld1.32	{q8-q9},[$key1]			// load key schedule...
2402	sub	$rounds0,$rounds0,#6
2403	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
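	// (added note) the iv pointer register x5 was overwritten by the
	// rounds load above and holds rounds-6 here, so x7 is pointed at
	// the last 7 round keys loaded below.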
2404	sub	$rounds0,$rounds0,#2
2405	vld1.32	{q10-q11},[$key_],#32
2406	vld1.32	{q12-q13},[$key_],#32
2407	vld1.32	{q14-q15},[$key_],#32
2408	vld1.32	{$rndlast},[$key_]
2409
2410	add	$key_,$key1,#32
2411	mov	$rounds,$rounds0
2412
2413	// Encryption
2414.Lxts_enc:
2415	vld1.8	{$dat2},[$inp],#16
2416	subs	$len,$len,#32			// bias
2417	add	$rounds,$rounds0,#2
2418	vorr	$in1,$dat,$dat
2419	vorr	$dat1,$dat,$dat
2420	vorr	$in3,$dat,$dat
2421	vorr	$in2,$dat2,$dat2
2422	vorr	$in4,$dat2,$dat2
2423	b.lo	.Lxts_inner_enc_tail
2424	veor	$dat,$dat,$iv0			// before encryption, xor with iv
2425	veor	$dat2,$dat2,$iv1
2426
2427	// The iv for third block
2428	extr	$midnumx,$ivh,$ivh,#32
2429	extr	$ivh,$ivh,$ivl,#63
2430	and	$tmpmw,$constnum,$midnum,asr#31
2431	eor	$ivl,$tmpmx,$ivl,lsl#1
2432	fmov	$ivd20,$ivl
2433	fmov	$ivd21,$ivh
2434
2435
2436	vorr	$dat1,$dat2,$dat2
2437	vld1.8	{$dat2},[$inp],#16
2438	vorr	$in0,$dat,$dat
2439	vorr	$in1,$dat1,$dat1
2440	veor	$in2,$dat2,$iv2 		// the third block
2441	veor	$dat2,$dat2,$iv2
2442	cmp	$len,#32
2443	b.lo	.Lxts_outer_enc_tail
2444
2445	// The iv for fourth block
2446	extr	$midnumx,$ivh,$ivh,#32
2447	extr	$ivh,$ivh,$ivl,#63
2448	and	$tmpmw,$constnum,$midnum,asr#31
2449	eor	$ivl,$tmpmx,$ivl,lsl#1
2450	fmov	$ivd30,$ivl
2451	fmov	$ivd31,$ivh
2452
2453	vld1.8	{$dat3},[$inp],#16
2454	// The iv for fifth block
2455	extr	$midnumx,$ivh,$ivh,#32
2456	extr	$ivh,$ivh,$ivl,#63
2457	and	$tmpmw,$constnum,$midnum,asr#31
2458	eor	$ivl,$tmpmx,$ivl,lsl#1
2459	fmov	$ivd40,$ivl
2460	fmov	$ivd41,$ivh
2461
2462	vld1.8	{$dat4},[$inp],#16
2463	veor	$dat3,$dat3,$iv3		// the fourth block
2464	veor	$dat4,$dat4,$iv4
2465	sub	$len,$len,#32			// bias
2466	mov	$rounds,$rounds0
2467	b	.Loop5x_xts_enc
2468
2469.align	4
2470.Loop5x_xts_enc:
2471	aese	$dat0,q8
2472	aesmc	$dat0,$dat0
2473	aese	$dat1,q8
2474	aesmc	$dat1,$dat1
2475	aese	$dat2,q8
2476	aesmc	$dat2,$dat2
2477	aese	$dat3,q8
2478	aesmc	$dat3,$dat3
2479	aese	$dat4,q8
2480	aesmc	$dat4,$dat4
2481	vld1.32	{q8},[$key_],#16
2482	subs	$rounds,$rounds,#2
2483	aese	$dat0,q9
2484	aesmc	$dat0,$dat0
2485	aese	$dat1,q9
2486	aesmc	$dat1,$dat1
2487	aese	$dat2,q9
2488	aesmc	$dat2,$dat2
2489	aese	$dat3,q9
2490	aesmc	$dat3,$dat3
2491	aese	$dat4,q9
2492	aesmc	$dat4,$dat4
2493	vld1.32	{q9},[$key_],#16
2494	b.gt	.Loop5x_xts_enc
2495
2496	aese	$dat0,q8
2497	aesmc	$dat0,$dat0
2498	aese	$dat1,q8
2499	aesmc	$dat1,$dat1
2500	aese	$dat2,q8
2501	aesmc	$dat2,$dat2
2502	aese	$dat3,q8
2503	aesmc	$dat3,$dat3
2504	aese	$dat4,q8
2505	aesmc	$dat4,$dat4
2506	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x
2507
2508	aese	$dat0,q9
2509	aesmc	$dat0,$dat0
2510	aese	$dat1,q9
2511	aesmc	$dat1,$dat1
2512	aese	$dat2,q9
2513	aesmc	$dat2,$dat2
2514	aese	$dat3,q9
2515	aesmc	$dat3,$dat3
2516	aese	$dat4,q9
2517	aesmc	$dat4,$dat4
2518	csel	$xoffset,xzr,$len,gt		// borrow x6/w6; "gt" is not a typo
2519	mov	$key_,$key1
2520
2521	aese	$dat0,q10
2522	aesmc	$dat0,$dat0
2523	aese	$dat1,q10
2524	aesmc	$dat1,$dat1
2525	aese	$dat2,q10
2526	aesmc	$dat2,$dat2
2527	aese	$dat3,q10
2528	aesmc	$dat3,$dat3
2529	aese	$dat4,q10
2530	aesmc	$dat4,$dat4
2531	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
2532						// at exit from the loop v1.16b-v26.16b
2533						// are loaded with the last "words"
2534	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x
2535
2536	aese	$dat0,q11
2537	aesmc	$dat0,$dat0
2538	aese	$dat1,q11
2539	aesmc	$dat1,$dat1
2540	aese	$dat2,q11
2541	aesmc	$dat2,$dat2
2542	aese	$dat3,q11
2543	aesmc	$dat3,$dat3
2544	aese	$dat4,q11
2545	aesmc	$dat4,$dat4
2546
2547	aese	$dat0,q12
2548	aesmc	$dat0,$dat0
2549	aese	$dat1,q12
2550	aesmc	$dat1,$dat1
2551	aese	$dat2,q12
2552	aesmc	$dat2,$dat2
2553	aese	$dat3,q12
2554	aesmc	$dat3,$dat3
2555	aese	$dat4,q12
2556	aesmc	$dat4,$dat4
2557
2558	aese	$dat0,q13
2559	aesmc	$dat0,$dat0
2560	aese	$dat1,q13
2561	aesmc	$dat1,$dat1
2562	aese	$dat2,q13
2563	aesmc	$dat2,$dat2
2564	aese	$dat3,q13
2565	aesmc	$dat3,$dat3
2566	aese	$dat4,q13
2567	aesmc	$dat4,$dat4
2568
2569	aese	$dat0,q14
2570	aesmc	$dat0,$dat0
2571	aese	$dat1,q14
2572	aesmc	$dat1,$dat1
2573	aese	$dat2,q14
2574	aesmc	$dat2,$dat2
2575	aese	$dat3,q14
2576	aesmc	$dat3,$dat3
2577	aese	$dat4,q14
2578	aesmc	$dat4,$dat4
2579
2580	veor	$tmp0,$rndlast,$iv0
2581	aese	$dat0,q15
2582	// The iv for the first block of the next iteration
2583	extr	$midnumx,$ivh,$ivh,#32
2584	extr	$ivh,$ivh,$ivl,#63
2585	and	$tmpmw,$constnum,$midnum,asr#31
2586	eor	$ivl,$tmpmx,$ivl,lsl#1
2587	fmov	$ivd00,$ivl
2588	fmov	$ivd01,$ivh
2589	veor	$tmp1,$rndlast,$iv1
2590	vld1.8	{$in0},[$inp],#16
2591	aese	$dat1,q15
2592	// The iv for second block
2593	extr	$midnumx,$ivh,$ivh,#32
2594	extr	$ivh,$ivh,$ivl,#63
2595	and	$tmpmw,$constnum,$midnum,asr#31
2596	eor	$ivl,$tmpmx,$ivl,lsl#1
2597	fmov	$ivd10,$ivl
2598	fmov	$ivd11,$ivh
2599	veor	$tmp2,$rndlast,$iv2
2600	vld1.8	{$in1},[$inp],#16
2601	aese	$dat2,q15
2602	// The iv for third block
2603	extr	$midnumx,$ivh,$ivh,#32
2604	extr	$ivh,$ivh,$ivl,#63
2605	and	$tmpmw,$constnum,$midnum,asr#31
2606	eor	$ivl,$tmpmx,$ivl,lsl#1
2607	fmov	$ivd20,$ivl
2608	fmov	$ivd21,$ivh
2609	veor	$tmp3,$rndlast,$iv3
2610	vld1.8	{$in2},[$inp],#16
2611	aese	$dat3,q15
2612	// The iv for fourth block
2613	extr	$midnumx,$ivh,$ivh,#32
2614	extr	$ivh,$ivh,$ivl,#63
2615	and	$tmpmw,$constnum,$midnum,asr#31
2616	eor	$ivl,$tmpmx,$ivl,lsl#1
2617	fmov	$ivd30,$ivl
2618	fmov	$ivd31,$ivh
2619	veor	$tmp4,$rndlast,$iv4
2620	vld1.8	{$in3},[$inp],#16
2621	aese	$dat4,q15
2622
2623	// The iv for fifth block
2624	extr	$midnumx,$ivh,$ivh,#32
2625	extr	$ivh,$ivh,$ivl,#63
2626	and	$tmpmw,$constnum,$midnum,asr #31
2627	eor	$ivl,$tmpmx,$ivl,lsl #1
2628	fmov	$ivd40,$ivl
2629	fmov	$ivd41,$ivh
2630
2631	vld1.8	{$in4},[$inp],#16
2632	cbz	$xoffset,.Lxts_enc_tail4x
2633	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
2634	veor	$tmp0,$tmp0,$dat0
2635	veor	$dat0,$in0,$iv0
2636	veor	$tmp1,$tmp1,$dat1
2637	veor	$dat1,$in1,$iv1
2638	veor	$tmp2,$tmp2,$dat2
2639	veor	$dat2,$in2,$iv2
2640	veor	$tmp3,$tmp3,$dat3
2641	veor	$dat3,$in3,$iv3
2642	veor	$tmp4,$tmp4,$dat4
2643	vst1.8	{$tmp0},[$out],#16
2644	veor	$dat4,$in4,$iv4
2645	vst1.8	{$tmp1},[$out],#16
2646	mov	$rounds,$rounds0
2647	vst1.8	{$tmp2},[$out],#16
2648	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
2649	vst1.8	{$tmp3},[$out],#16
2650	vst1.8	{$tmp4},[$out],#16
2651	b.hs	.Loop5x_xts_enc
2652
2653
2654	// If 4 blocks are left, borrow the five-block processing.
2655	cmn	$len,#0x10
2656	b.ne	.Loop5x_enc_after
2657	vorr	$iv4,$iv3,$iv3
2658	vorr	$iv3,$iv2,$iv2
2659	vorr	$iv2,$iv1,$iv1
2660	vorr	$iv1,$iv0,$iv0
2661	fmov	$ivl,$ivd40
2662	fmov	$ivh,$ivd41
2663	veor	$dat0,$iv0,$in0
2664	veor	$dat1,$iv1,$in1
2665	veor	$dat2,$in2,$iv2
2666	veor	$dat3,$in3,$iv3
2667	veor	$dat4,$in4,$iv4
2668	b.eq	.Loop5x_xts_enc
2669
2670.Loop5x_enc_after:
2671	add	$len,$len,#0x50
2672	cbz	$len,.Lxts_enc_done
2673
2674	add	$rounds,$rounds0,#2
2675	subs	$len,$len,#0x30
2676	b.lo	.Lxts_inner_enc_tail
2677
2678	veor	$dat0,$iv0,$in2
2679	veor	$dat1,$iv1,$in3
2680	veor	$dat2,$in4,$iv2
2681	b	.Lxts_outer_enc_tail
2682
2683.align	4
2684.Lxts_enc_tail4x:
2685	add	$inp,$inp,#16
2686	veor	$tmp1,$dat1,$tmp1
2687	vst1.8	{$tmp1},[$out],#16
2688	veor	$tmp2,$dat2,$tmp2
2689	vst1.8	{$tmp2},[$out],#16
2690	veor	$tmp3,$dat3,$tmp3
2691	veor	$tmp4,$dat4,$tmp4
2692	vst1.8	{$tmp3-$tmp4},[$out],#32
2693
2694	b	.Lxts_enc_done
2695.align	4
2696.Lxts_outer_enc_tail:
2697	aese	$dat0,q8
2698	aesmc	$dat0,$dat0
2699	aese	$dat1,q8
2700	aesmc	$dat1,$dat1
2701	aese	$dat2,q8
2702	aesmc	$dat2,$dat2
2703	vld1.32	{q8},[$key_],#16
2704	subs	$rounds,$rounds,#2
2705	aese	$dat0,q9
2706	aesmc	$dat0,$dat0
2707	aese	$dat1,q9
2708	aesmc	$dat1,$dat1
2709	aese	$dat2,q9
2710	aesmc	$dat2,$dat2
2711	vld1.32	{q9},[$key_],#16
2712	b.gt	.Lxts_outer_enc_tail
2713
2714	aese	$dat0,q8
2715	aesmc	$dat0,$dat0
2716	aese	$dat1,q8
2717	aesmc	$dat1,$dat1
2718	aese	$dat2,q8
2719	aesmc	$dat2,$dat2
2720	veor	$tmp0,$iv0,$rndlast
2721	subs	$len,$len,#0x30
2722	// The iv for first block
2723	fmov	$ivl,$ivd20
2724	fmov	$ivh,$ivd21
2725	//mov	$constnum,#0x87
2726	extr	$midnumx,$ivh,$ivh,#32
2727	extr	$ivh,$ivh,$ivl,#63
2728	and	$tmpmw,$constnum,$midnum,asr#31
2729	eor	$ivl,$tmpmx,$ivl,lsl#1
2730	fmov	$ivd00,$ivl
2731	fmov	$ivd01,$ivh
2732	veor	$tmp1,$iv1,$rndlast
2733	csel	$xoffset,$len,$xoffset,lo       // x6/w6 is zero at this point
2734	aese	$dat0,q9
2735	aesmc	$dat0,$dat0
2736	aese	$dat1,q9
2737	aesmc	$dat1,$dat1
2738	aese	$dat2,q9
2739	aesmc	$dat2,$dat2
2740	veor	$tmp2,$iv2,$rndlast
2741
2742	add	$xoffset,$xoffset,#0x20
2743	add	$inp,$inp,$xoffset
2744	mov	$key_,$key1
2745
2746	aese	$dat0,q12
2747	aesmc	$dat0,$dat0
2748	aese	$dat1,q12
2749	aesmc	$dat1,$dat1
2750	aese	$dat2,q12
2751	aesmc	$dat2,$dat2
2752	aese	$dat0,q13
2753	aesmc	$dat0,$dat0
2754	aese	$dat1,q13
2755	aesmc	$dat1,$dat1
2756	aese	$dat2,q13
2757	aesmc	$dat2,$dat2
2758	aese	$dat0,q14
2759	aesmc	$dat0,$dat0
2760	aese	$dat1,q14
2761	aesmc	$dat1,$dat1
2762	aese	$dat2,q14
2763	aesmc	$dat2,$dat2
2764	aese	$dat0,q15
2765	aese	$dat1,q15
2766	aese	$dat2,q15
2767	vld1.8	{$in2},[$inp],#16
2768	add	$rounds,$rounds0,#2
2769	vld1.32	{q8},[$key_],#16                // re-pre-load rndkey[0]
2770	veor	$tmp0,$tmp0,$dat0
2771	veor	$tmp1,$tmp1,$dat1
2772	veor	$dat2,$dat2,$tmp2
2773	vld1.32	{q9},[$key_],#16                // re-pre-load rndkey[1]
2774	vst1.8	{$tmp0},[$out],#16
2775	vst1.8	{$tmp1},[$out],#16
2776	vst1.8	{$dat2},[$out],#16
2777	cmn	$len,#0x30
2778	b.eq	.Lxts_enc_done
2779.Lxts_encxor_one:
2780	vorr	$in3,$in1,$in1
2781	vorr	$in4,$in2,$in2
2782	nop
2783
2784.Lxts_inner_enc_tail:
2785	cmn	$len,#0x10
2786	veor	$dat1,$in3,$iv0
2787	veor	$dat2,$in4,$iv1
2788	b.eq	.Lxts_enc_tail_loop
2789	veor	$dat2,$in4,$iv0
2790.Lxts_enc_tail_loop:
2791	aese	$dat1,q8
2792	aesmc	$dat1,$dat1
2793	aese	$dat2,q8
2794	aesmc	$dat2,$dat2
2795	vld1.32	{q8},[$key_],#16
2796	subs	$rounds,$rounds,#2
2797	aese	$dat1,q9
2798	aesmc	$dat1,$dat1
2799	aese	$dat2,q9
2800	aesmc	$dat2,$dat2
2801	vld1.32	{q9},[$key_],#16
2802	b.gt	.Lxts_enc_tail_loop
2803
2804	aese	$dat1,q8
2805	aesmc	$dat1,$dat1
2806	aese	$dat2,q8
2807	aesmc	$dat2,$dat2
2808	aese	$dat1,q9
2809	aesmc	$dat1,$dat1
2810	aese	$dat2,q9
2811	aesmc	$dat2,$dat2
2812	aese	$dat1,q12
2813	aesmc	$dat1,$dat1
2814	aese	$dat2,q12
2815	aesmc	$dat2,$dat2
2816	cmn	$len,#0x20
2817	aese	$dat1,q13
2818	aesmc	$dat1,$dat1
2819	aese	$dat2,q13
2820	aesmc	$dat2,$dat2
2821	veor	$tmp1,$iv0,$rndlast
2822	aese	$dat1,q14
2823	aesmc	$dat1,$dat1
2824	aese	$dat2,q14
2825	aesmc	$dat2,$dat2
2826	veor	$tmp2,$iv1,$rndlast
2827	aese	$dat1,q15
2828	aese	$dat2,q15
2829	b.eq	.Lxts_enc_one
2830	veor	$tmp1,$tmp1,$dat1
2831	vst1.8	{$tmp1},[$out],#16
2832	veor	$tmp2,$tmp2,$dat2
2833	vorr	$iv0,$iv1,$iv1
2834	vst1.8	{$tmp2},[$out],#16
2835	fmov	$ivl,$ivd10
2836	fmov	$ivh,$ivd11
2837	mov	$constnum,#0x87
2838	extr	$midnumx,$ivh,$ivh,#32
2839	extr	$ivh,$ivh,$ivl,#63
2840	and	$tmpmw,$constnum,$midnum,asr #31
2841	eor	$ivl,$tmpmx,$ivl,lsl #1
2842	fmov	$ivd00,$ivl
2843	fmov	$ivd01,$ivh
2844	b	.Lxts_enc_done
2845
2846.Lxts_enc_one:
2847	veor	$tmp1,$tmp1,$dat2
2848	vorr	$iv0,$iv0,$iv0
2849	vst1.8	{$tmp1},[$out],#16
2850	fmov	$ivl,$ivd00
2851	fmov	$ivh,$ivd01
2852	mov	$constnum,#0x87
2853	extr	$midnumx,$ivh,$ivh,#32
2854	extr	$ivh,$ivh,$ivl,#63
2855	and	$tmpmw,$constnum,$midnum,asr #31
2856	eor	$ivl,$tmpmx,$ivl,lsl #1
2857	fmov	$ivd00,$ivl
2858	fmov	$ivd01,$ivh
2859	b	.Lxts_enc_done
2860.align	5
2861.Lxts_enc_done:
2862	// Process the tail block with cipher stealing.
2863	tst	$tailcnt,#0xf
2864	b.eq	.Lxts_abort
2865
2866	mov	$tmpinp,$inp
2867	mov	$tmpoutp,$out
2868	sub	$out,$out,#16
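	// (added note) the byte loop below swaps the leading tailcnt bytes of
	// the just-written ciphertext block out to become the final partial
	// block, and pulls the plaintext tail bytes in, forming the composite
	// block encrypted below.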
2869.composite_enc_loop:
2870	subs	$tailcnt,$tailcnt,#1
2871	ldrb	$l2outp,[$out,$tailcnt]
2872	ldrb	$loutp,[$tmpinp,$tailcnt]
2873	strb	$l2outp,[$tmpoutp,$tailcnt]
2874	strb	$loutp,[$out,$tailcnt]
2875	b.gt	.composite_enc_loop
2876.Lxts_enc_load_done:
2877	vld1.8	{$tmpin},[$out]
2878	veor	$tmpin,$tmpin,$iv0
2879
2880	// Encrypt the composite block to get the second-to-last ciphertext block
2881	ldr	$rounds,[$key1,#240]		// load rounds number
2882	vld1.8	{$dat},[$key1],#16
2883	sub	$rounds,$rounds,#2
2884	vld1.8	{$dat1},[$key1],#16		// load key schedule...
2885.Loop_final_enc:
2886	aese	$tmpin,$dat0
2887	aesmc	$tmpin,$tmpin
2888	vld1.32	{$dat0},[$key1],#16
2889	subs	$rounds,$rounds,#2
2890	aese	$tmpin,$dat1
2891	aesmc	$tmpin,$tmpin
2892	vld1.32	{$dat1},[$key1],#16
2893	b.gt	.Loop_final_enc
2894
2895	aese	$tmpin,$dat0
2896	aesmc	$tmpin,$tmpin
2897	vld1.32	{$dat0},[$key1]
2898	aese	$tmpin,$dat1
2899	veor	$tmpin,$tmpin,$dat0
2900	veor	$tmpin,$tmpin,$iv0
2901	vst1.8	{$tmpin},[$out]
2902
2903.Lxts_abort:
2904	ldp	$tailcnt,$midnumx,[sp,#48]
2905	ldp	$ivd10,$ivd20,[sp,#32]
2906	ldp	$ivd30,$ivd40,[sp,#16]
2907	ldp	$constnumx,$tmpinp,[sp],#64
2908.Lxts_enc_final_abort:
2909	ret
2910.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2911___
2912
2913}}}
2914{{{
2915my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2916my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2917my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2918my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2919my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2920my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2921my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
2922my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2923my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2924
2925my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2926
2927# q7	last round key
2928# q10-q15, q7	last 7 round keys
2929# q8-q9	preloaded round keys except the last 7 keys, for the big-size path
2930# q20, q21, q8-q9	preloaded round keys except the last 7 keys, for the 16-byte-only path
2931
2932{
2933my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2934
2935my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
2936my ($dat4,$in4,$tmp4);
2937if ($flavour =~ /64/) {
2938    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2939}
2940
2941$code.=<<___	if ($flavour =~ /64/);
2942.globl	${prefix}_xts_decrypt
2943.type	${prefix}_xts_decrypt,%function
2944.align	5
2945${prefix}_xts_decrypt:
2946	AARCH64_VALID_CALL_TARGET
2947___
2948$code.=<<___	if ($flavour =~ /64/);
2949	cmp	$len,#16
2950	// Original input data size other than 16 bytes, jump to big-size processing.
2951	b.ne	.Lxts_dec_big_size
2952	// Encrypt the iv with key2, as the first XEX iv.
2953	ldr	$rounds,[$key2,#240]
2954	vld1.8	{$dat},[$key2],#16
2955	vld1.8	{$iv0},[$ivp]
2956	sub	$rounds,$rounds,#2
2957	vld1.8	{$dat1},[$key2],#16
2958
2959.Loop_dec_small_iv_enc:
2960	aese	$iv0,$dat
2961	aesmc	$iv0,$iv0
2962	vld1.32	{$dat},[$key2],#16
2963	subs	$rounds,$rounds,#2
2964	aese	$iv0,$dat1
2965	aesmc	$iv0,$iv0
2966	vld1.32	{$dat1},[$key2],#16
2967	b.gt	.Loop_dec_small_iv_enc
2968
2969	aese	$iv0,$dat
2970	aesmc	$iv0,$iv0
2971	vld1.32	{$dat},[$key2]
2972	aese	$iv0,$dat1
2973	veor	$iv0,$iv0,$dat
2974
2975	vld1.8	{$dat0},[$inp]
2976	veor	$dat0,$iv0,$dat0
2977
2978	ldr	$rounds,[$key1,#240]
2979	vld1.32	{q20-q21},[$key1],#32			// load key schedule...
2980
2981	aesd	$dat0,q20
2982	aesimc	$dat0,$dat0
2983	vld1.32	{q8-q9},[$key1],#32			// load key schedule...
2984	aesd	$dat0,q21
2985	aesimc	$dat0,$dat0
2986	subs	$rounds,$rounds,#10			// bias
2987	b.eq	.Lxts_128_dec
2988.Lxts_dec_round_loop:
2989	aesd	$dat0,q8
2990	aesimc	$dat0,$dat0
2991	vld1.32	{q8},[$key1],#16			// load key schedule...
2992	aesd	$dat0,q9
2993	aesimc	$dat0,$dat0
2994	vld1.32	{q9},[$key1],#16			// load key schedule...
2995	subs	$rounds,$rounds,#2			// bias
2996	b.gt	.Lxts_dec_round_loop
2997.Lxts_128_dec:
2998	vld1.32	{q10-q11},[$key1],#32			// load key schedule...
2999	aesd	$dat0,q8
3000	aesimc	$dat0,$dat0
3001	aesd	$dat0,q9
3002	aesimc	$dat0,$dat0
3003	vld1.32	{q12-q13},[$key1],#32			// load key schedule...
3004	aesd	$dat0,q10
3005	aesimc	$dat0,$dat0
3006	aesd	$dat0,q11
3007	aesimc	$dat0,$dat0
3008	vld1.32	{q14-q15},[$key1],#32			// load key schedule...
3009	aesd	$dat0,q12
3010	aesimc	$dat0,$dat0
3011	aesd	$dat0,q13
3012	aesimc	$dat0,$dat0
3013	vld1.32	{$rndlast},[$key1]
3014	aesd	$dat0,q14
3015	aesimc	$dat0,$dat0
3016	aesd	$dat0,q15
3017	veor	$dat0,$dat0,$rndlast
3018	veor	$dat0,$iv0,$dat0
3019	vst1.8	{$dat0},[$out]
3020	b	.Lxts_dec_final_abort
3021.Lxts_dec_big_size:
3022___
3023$code.=<<___	if ($flavour =~ /64/);
3024	stp	$constnumx,$tmpinp,[sp,#-64]!
3025	stp	$tailcnt,$midnumx,[sp,#48]
3026	stp	$ivd10,$ivd20,[sp,#32]
3027	stp	$ivd30,$ivd40,[sp,#16]
3028
3029	and	$tailcnt,$len,#0xf
3030	and	$len,$len,#-16
3031	subs	$len,$len,#16
3032	mov	$step,#16
3033	b.lo	.Lxts_dec_abort
3034
3035	// Encrypt the iv with key2, as the first XEX iv
3036	ldr	$rounds,[$key2,#240]
3037	vld1.8	{$dat},[$key2],#16
3038	vld1.8	{$iv0},[$ivp]
3039	sub	$rounds,$rounds,#2
3040	vld1.8	{$dat1},[$key2],#16
3041
3042.Loop_dec_iv_enc:
3043	aese	$iv0,$dat
3044	aesmc	$iv0,$iv0
3045	vld1.32	{$dat},[$key2],#16
3046	subs	$rounds,$rounds,#2
3047	aese	$iv0,$dat1
3048	aesmc	$iv0,$iv0
3049	vld1.32	{$dat1},[$key2],#16
3050	b.gt	.Loop_dec_iv_enc
3051
3052	aese	$iv0,$dat
3053	aesmc	$iv0,$iv0
3054	vld1.32	{$dat},[$key2]
3055	aese	$iv0,$dat1
3056	veor	$iv0,$iv0,$dat
3057
3058	// The iv for the second block
3059	// $ivl - iv(low), $ivh - iv(high)
3060	// the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4
3061	fmov	$ivl,$ivd00
3062	fmov	$ivh,$ivd01
3063	mov	$constnum,#0x87
3064	extr	$midnumx,$ivh,$ivh,#32
3065	extr	$ivh,$ivh,$ivl,#63
3066	and	$tmpmw,$constnum,$midnum,asr #31
3067	eor	$ivl,$tmpmx,$ivl,lsl #1
3068	fmov	$ivd10,$ivl
3069	fmov	$ivd11,$ivh
3070
3071	ldr	$rounds0,[$key1,#240]		// load rounds number
3072
3073	// The iv for third block
3074	extr	$midnumx,$ivh,$ivh,#32
3075	extr	$ivh,$ivh,$ivl,#63
3076	and	$tmpmw,$constnum,$midnum,asr #31
3077	eor	$ivl,$tmpmx,$ivl,lsl #1
3078	fmov	$ivd20,$ivl
3079	fmov	$ivd21,$ivh
3080
3081	vld1.32	{q8-q9},[$key1]			// load key schedule...
3082	sub	$rounds0,$rounds0,#6
3083	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
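	// (added note) as in the encrypt path, x5 (the iv pointer) was
	// clobbered by the rounds load and holds rounds-6 here.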
3084	sub	$rounds0,$rounds0,#2
3085	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
3086	vld1.32	{q12-q13},[$key_],#32
3087	vld1.32	{q14-q15},[$key_],#32
3088	vld1.32	{$rndlast},[$key_]
3089
3090	// The iv for fourth block
3091	extr	$midnumx,$ivh,$ivh,#32
3092	extr	$ivh,$ivh,$ivl,#63
3093	and	$tmpmw,$constnum,$midnum,asr #31
3094	eor	$ivl,$tmpmx,$ivl,lsl #1
3095	fmov	$ivd30,$ivl
3096	fmov	$ivd31,$ivh
3097
3098	add	$key_,$key1,#32
3099	mov	$rounds,$rounds0
3100	b	.Lxts_dec
3101
3102	// Decryption
3103.align	5
3104.Lxts_dec:
3105	tst	$tailcnt,#0xf
3106	b.eq	.Lxts_dec_begin
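	// (added note) a partial tail needs ciphertext stealing, so one full
	// block is held back here; if nothing else remains, jump straight to
	// the tail handling with that block already loaded.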
3107	subs	$len,$len,#16
3108	csel	$step,xzr,$step,eq
3109	vld1.8	{$dat},[$inp],#16
3110	b.lo	.Lxts_done
3111	sub	$inp,$inp,#16
3112.Lxts_dec_begin:
3113	vld1.8	{$dat},[$inp],$step
3114	subs	$len,$len,#32			// bias
3115	add	$rounds,$rounds0,#2
3116	vorr	$in1,$dat,$dat
3117	vorr	$dat1,$dat,$dat
3118	vorr	$in3,$dat,$dat
3119	vld1.8	{$dat2},[$inp],#16
3120	vorr	$in2,$dat2,$dat2
3121	vorr	$in4,$dat2,$dat2
3122	b.lo	.Lxts_inner_dec_tail
3123	veor	$dat,$dat,$iv0			// before decrypt, xor with iv
3124	veor	$dat2,$dat2,$iv1
3125
3126	vorr	$dat1,$dat2,$dat2
3127	vld1.8	{$dat2},[$inp],#16
3128	vorr	$in0,$dat,$dat
3129	vorr	$in1,$dat1,$dat1
3130	veor	$in2,$dat2,$iv2			// third block xor with third iv
3131	veor	$dat2,$dat2,$iv2
3132	cmp	$len,#32
3133	b.lo	.Lxts_outer_dec_tail
3134
3135	vld1.8	{$dat3},[$inp],#16
3136
3137	// The iv for fifth block
3138	extr	$midnumx,$ivh,$ivh,#32
3139	extr	$ivh,$ivh,$ivl,#63
3140	and	$tmpmw,$constnum,$midnum,asr #31
3141	eor	$ivl,$tmpmx,$ivl,lsl #1
3142	fmov	$ivd40,$ivl
3143	fmov	$ivd41,$ivh
3144
3145	vld1.8	{$dat4},[$inp],#16
3146	veor	$dat3,$dat3,$iv3		// the fourth block
3147	veor	$dat4,$dat4,$iv4
3148	sub	$len,$len,#32			// bias
3149	mov	$rounds,$rounds0
3150	b	.Loop5x_xts_dec
3151
3152.align	4
3153.Loop5x_xts_dec:
3154	aesd	$dat0,q8
3155	aesimc	$dat0,$dat0
3156	aesd	$dat1,q8
3157	aesimc	$dat1,$dat1
3158	aesd	$dat2,q8
3159	aesimc	$dat2,$dat2
3160	aesd	$dat3,q8
3161	aesimc	$dat3,$dat3
3162	aesd	$dat4,q8
3163	aesimc	$dat4,$dat4
3164	vld1.32	{q8},[$key_],#16		// load key schedule...
3165	subs	$rounds,$rounds,#2
3166	aesd	$dat0,q9
3167	aesimc	$dat0,$dat0
3168	aesd	$dat1,q9
3169	aesimc	$dat1,$dat1
3170	aesd	$dat2,q9
3171	aesimc	$dat2,$dat2
3172	aesd	$dat3,q9
3173	aesimc	$dat3,$dat3
3174	aesd	$dat4,q9
3175	aesimc	$dat4,$dat4
3176	vld1.32	{q9},[$key_],#16		// load key schedule...
3177	b.gt	.Loop5x_xts_dec
3178
3179	aesd	$dat0,q8
3180	aesimc	$dat0,$dat0
3181	aesd	$dat1,q8
3182	aesimc	$dat1,$dat1
3183	aesd	$dat2,q8
3184	aesimc	$dat2,$dat2
3185	aesd	$dat3,q8
3186	aesimc	$dat3,$dat3
3187	aesd	$dat4,q8
3188	aesimc	$dat4,$dat4
3189	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x
3190
3191	aesd	$dat0,q9
3192	aesimc	$dat0,$dat0
3193	aesd	$dat1,q9
3194	aesimc	$dat1,$dat1
3195	aesd	$dat2,q9
3196	aesimc	$dat2,$dat2
3197	aesd	$dat3,q9
3198	aesimc	$dat3,$dat3
3199	aesd	$dat4,q9
3200	aesimc	$dat4,$dat4
3201	csel	$xoffset,xzr,$len,gt		// borrow x6/w6; "gt" is not a typo
3202	mov	$key_,$key1
3203
3204	aesd	$dat0,q10
3205	aesimc	$dat0,$dat0
3206	aesd	$dat1,q10
3207	aesimc	$dat1,$dat1
3208	aesd	$dat2,q10
3209	aesimc	$dat2,$dat2
3210	aesd	$dat3,q10
3211	aesimc	$dat3,$dat3
3212	aesd	$dat4,q10
3213	aesimc	$dat4,$dat4
3214	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
3215						// at exit from the loop v1.16b-v26.16b
3216						// are loaded with the last "words"
3217	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x
3218
3219	aesd	$dat0,q11
3220	aesimc	$dat0,$dat0
3221	aesd	$dat1,q11
3222	aesimc	$dat1,$dat1
3223	aesd	$dat2,q11
3224	aesimc	$dat2,$dat2
3225	aesd	$dat3,q11
3226	aesimc	$dat3,$dat3
3227	aesd	$dat4,q11
3228	aesimc	$dat4,$dat4
3229
3230	aesd	$dat0,q12
3231	aesimc	$dat0,$dat0
3232	aesd	$dat1,q12
3233	aesimc	$dat1,$dat1
3234	aesd	$dat2,q12
3235	aesimc	$dat2,$dat2
3236	aesd	$dat3,q12
3237	aesimc	$dat3,$dat3
3238	aesd	$dat4,q12
3239	aesimc	$dat4,$dat4
3240
3241	aesd	$dat0,q13
3242	aesimc	$dat0,$dat0
3243	aesd	$dat1,q13
3244	aesimc	$dat1,$dat1
3245	aesd	$dat2,q13
3246	aesimc	$dat2,$dat2
3247	aesd	$dat3,q13
3248	aesimc	$dat3,$dat3
3249	aesd	$dat4,q13
3250	aesimc	$dat4,$dat4
3251
3252	aesd	$dat0,q14
3253	aesimc	$dat0,$dat0
3254	aesd	$dat1,q14
3255	aesimc	$dat1,$dat1
3256	aesd	$dat2,q14
3257	aesimc	$dat2,$dat2
3258	aesd	$dat3,q14
3259	aesimc	$dat3,$dat3
3260	aesd	$dat4,q14
3261	aesimc	$dat4,$dat4
3262
3263	veor	$tmp0,$rndlast,$iv0
3264	aesd	$dat0,q15
3265	// The iv for first block of next iteration.
3266	extr	$midnumx,$ivh,$ivh,#32
3267	extr	$ivh,$ivh,$ivl,#63
3268	and	$tmpmw,$constnum,$midnum,asr #31
3269	eor	$ivl,$tmpmx,$ivl,lsl #1
3270	fmov	$ivd00,$ivl
3271	fmov	$ivd01,$ivh
3272	veor	$tmp1,$rndlast,$iv1
3273	vld1.8	{$in0},[$inp],#16
3274	aesd	$dat1,q15
3275	// The iv for second block
3276	extr	$midnumx,$ivh,$ivh,#32
3277	extr	$ivh,$ivh,$ivl,#63
3278	and	$tmpmw,$constnum,$midnum,asr #31
3279	eor	$ivl,$tmpmx,$ivl,lsl #1
3280	fmov	$ivd10,$ivl
3281	fmov	$ivd11,$ivh
3282	veor	$tmp2,$rndlast,$iv2
3283	vld1.8	{$in1},[$inp],#16
3284	aesd	$dat2,q15
3285	// The iv for third block
3286	extr	$midnumx,$ivh,$ivh,#32
3287	extr	$ivh,$ivh,$ivl,#63
3288	and	$tmpmw,$constnum,$midnum,asr #31
3289	eor	$ivl,$tmpmx,$ivl,lsl #1
3290	fmov	$ivd20,$ivl
3291	fmov	$ivd21,$ivh
3292	veor	$tmp3,$rndlast,$iv3
3293	vld1.8	{$in2},[$inp],#16
3294	aesd	$dat3,q15
3295	// The iv for fourth block
3296	extr	$midnumx,$ivh,$ivh,#32
3297	extr	$ivh,$ivh,$ivl,#63
3298	and	$tmpmw,$constnum,$midnum,asr #31
3299	eor	$ivl,$tmpmx,$ivl,lsl #1
3300	fmov	$ivd30,$ivl
3301	fmov	$ivd31,$ivh
3302	veor	$tmp4,$rndlast,$iv4
3303	vld1.8	{$in3},[$inp],#16
3304	aesd	$dat4,q15
3305
3306	// The iv for fifth block
3307	extr	$midnumx,$ivh,$ivh,#32
3308	extr	$ivh,$ivh,$ivl,#63
3309	and	$tmpmw,$constnum,$midnum,asr #31
3310	eor	$ivl,$tmpmx,$ivl,lsl #1
3311	fmov	$ivd40,$ivl
3312	fmov	$ivd41,$ivh
3313
3314	vld1.8	{$in4},[$inp],#16
3315	cbz	$xoffset,.Lxts_dec_tail4x
3316	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
3317	veor	$tmp0,$tmp0,$dat0
3318	veor	$dat0,$in0,$iv0
3319	veor	$tmp1,$tmp1,$dat1
3320	veor	$dat1,$in1,$iv1
3321	veor	$tmp2,$tmp2,$dat2
3322	veor	$dat2,$in2,$iv2
3323	veor	$tmp3,$tmp3,$dat3
3324	veor	$dat3,$in3,$iv3
3325	veor	$tmp4,$tmp4,$dat4
3326	vst1.8	{$tmp0},[$out],#16
3327	veor	$dat4,$in4,$iv4
3328	vst1.8	{$tmp1},[$out],#16
3329	mov	$rounds,$rounds0
3330	vst1.8	{$tmp2},[$out],#16
3331	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
3332	vst1.8	{$tmp3},[$out],#16
3333	vst1.8	{$tmp4},[$out],#16
3334	b.hs	.Loop5x_xts_dec
3335
3336	cmn	$len,#0x10
3337	b.ne	.Loop5x_dec_after
3338	// If x2($len) equals -0x10, there are 4 blocks left.
3339	// After special processing, the five-block processing is reused.
3340	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3341	vorr	$iv4,$iv3,$iv3
3342	vorr	$iv3,$iv2,$iv2
3343	vorr	$iv2,$iv1,$iv1
3344	vorr	$iv1,$iv0,$iv0
3345	fmov	$ivl,$ivd40
3346	fmov	$ivh,$ivd41
3347	veor	$dat0,$iv0,$in0
3348	veor	$dat1,$iv1,$in1
3349	veor	$dat2,$in2,$iv2
3350	veor	$dat3,$in3,$iv3
3351	veor	$dat4,$in4,$iv4
3352	b.eq	.Loop5x_xts_dec
3353
3354.Loop5x_dec_after:
3355	add	$len,$len,#0x50
3356	cbz	$len,.Lxts_done
3357
3358	add	$rounds,$rounds0,#2
3359	subs	$len,$len,#0x30
3360	b.lo	.Lxts_inner_dec_tail
3361
3362	veor	$dat0,$iv0,$in2
3363	veor	$dat1,$iv1,$in3
3364	veor	$dat2,$in4,$iv2
3365	b	.Lxts_outer_dec_tail
3366
3367.align	4
3368.Lxts_dec_tail4x:
3369	add	$inp,$inp,#16
3370	vld1.32	{$dat0},[$inp],#16
3371	veor	$tmp1,$dat1,$tmp0
3372	vst1.8	{$tmp1},[$out],#16
3373	veor	$tmp2,$dat2,$tmp2
3374	vst1.8	{$tmp2},[$out],#16
3375	veor	$tmp3,$dat3,$tmp3
3376	veor	$tmp4,$dat4,$tmp4
3377	vst1.8	{$tmp3-$tmp4},[$out],#32
3378
3379	b	.Lxts_done
3380.align	4
3381.Lxts_outer_dec_tail:
3382	aesd	$dat0,q8
3383	aesimc	$dat0,$dat0
3384	aesd	$dat1,q8
3385	aesimc	$dat1,$dat1
3386	aesd	$dat2,q8
3387	aesimc	$dat2,$dat2
3388	vld1.32	{q8},[$key_],#16
3389	subs	$rounds,$rounds,#2
3390	aesd	$dat0,q9
3391	aesimc	$dat0,$dat0
3392	aesd	$dat1,q9
3393	aesimc	$dat1,$dat1
3394	aesd	$dat2,q9
3395	aesimc	$dat2,$dat2
3396	vld1.32	{q9},[$key_],#16
3397	b.gt	.Lxts_outer_dec_tail
3398
3399	aesd	$dat0,q8
3400	aesimc	$dat0,$dat0
3401	aesd	$dat1,q8
3402	aesimc	$dat1,$dat1
3403	aesd	$dat2,q8
3404	aesimc	$dat2,$dat2
3405	veor	$tmp0,$iv0,$rndlast
3406	subs	$len,$len,#0x30
3407	// The iv for first block
3408	fmov	$ivl,$ivd20
3409	fmov	$ivh,$ivd21
3410	mov	$constnum,#0x87
3411	extr	$midnumx,$ivh,$ivh,#32
3412	extr	$ivh,$ivh,$ivl,#63
3413	and	$tmpmw,$constnum,$midnum,asr #31
3414	eor	$ivl,$tmpmx,$ivl,lsl #1
3415	fmov	$ivd00,$ivl
3416	fmov	$ivd01,$ivh
3417	veor	$tmp1,$iv1,$rndlast
3418	csel	$xoffset,$len,$xoffset,lo	// x6/w6 is zero at this point
3419	aesd	$dat0,q9
3420	aesimc	$dat0,$dat0
3421	aesd	$dat1,q9
3422	aesimc	$dat1,$dat1
3423	aesd	$dat2,q9
3424	aesimc	$dat2,$dat2
3425	veor	$tmp2,$iv2,$rndlast
3426	// The iv for second block
3427	extr	$midnumx,$ivh,$ivh,#32
3428	extr	$ivh,$ivh,$ivl,#63
3429	and	$tmpmw,$constnum,$midnum,asr #31
3430	eor	$ivl,$tmpmx,$ivl,lsl #1
3431	fmov	$ivd10,$ivl
3432	fmov	$ivd11,$ivh
3433
3434	add	$xoffset,$xoffset,#0x20
3435	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data
3436
3437	mov	$key_,$key1
3438
3439	// The iv for third block
3440	extr	$midnumx,$ivh,$ivh,#32
3441	extr	$ivh,$ivh,$ivl,#63
3442	and	$tmpmw,$constnum,$midnum,asr #31
3443	eor	$ivl,$tmpmx,$ivl,lsl #1
3444	fmov	$ivd20,$ivl
3445	fmov	$ivd21,$ivh
3446
3447	aesd	$dat0,q12
3448	aesimc	$dat0,$dat0
3449	aesd	$dat1,q12
3450	aesimc	$dat1,$dat1
3451	aesd	$dat2,q12
3452	aesimc	$dat2,$dat2
3453	aesd	$dat0,q13
3454	aesimc	$dat0,$dat0
3455	aesd	$dat1,q13
3456	aesimc	$dat1,$dat1
3457	aesd	$dat2,q13
3458	aesimc	$dat2,$dat2
3459	aesd	$dat0,q14
3460	aesimc	$dat0,$dat0
3461	aesd	$dat1,q14
3462	aesimc	$dat1,$dat1
3463	aesd	$dat2,q14
3464	aesimc	$dat2,$dat2
3465	vld1.8	{$in2},[$inp],#16
3466	aesd	$dat0,q15
3467	aesd	$dat1,q15
3468	aesd	$dat2,q15
3469	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
3470	add	$rounds,$rounds0,#2
3471	veor	$tmp0,$tmp0,$dat0
3472	veor	$tmp1,$tmp1,$dat1
3473	veor	$dat2,$dat2,$tmp2
3474	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
3475	vst1.8	{$tmp0},[$out],#16
3476	vst1.8	{$tmp1},[$out],#16
3477	vst1.8	{$dat2},[$out],#16
3478
3479	cmn	$len,#0x30
3480	add	$len,$len,#0x30
3481	b.eq	.Lxts_done
3482	sub	$len,$len,#0x30
3483	vorr	$in3,$in1,$in1
3484	vorr	$in4,$in2,$in2
3485	nop
3486
3487.Lxts_inner_dec_tail:
3488	// $len == -0x10 means two blocks left.
3489	cmn	$len,#0x10
3490	veor	$dat1,$in3,$iv0
3491	veor	$dat2,$in4,$iv1
3492	b.eq	.Lxts_dec_tail_loop
3493	veor	$dat2,$in4,$iv0
3494.Lxts_dec_tail_loop:
3495	aesd	$dat1,q8
3496	aesimc	$dat1,$dat1
3497	aesd	$dat2,q8
3498	aesimc	$dat2,$dat2
3499	vld1.32	{q8},[$key_],#16
3500	subs	$rounds,$rounds,#2
3501	aesd	$dat1,q9
3502	aesimc	$dat1,$dat1
3503	aesd	$dat2,q9
3504	aesimc	$dat2,$dat2
3505	vld1.32	{q9},[$key_],#16
3506	b.gt	.Lxts_dec_tail_loop
3507
3508	aesd	$dat1,q8
3509	aesimc	$dat1,$dat1
3510	aesd	$dat2,q8
3511	aesimc	$dat2,$dat2
3512	aesd	$dat1,q9
3513	aesimc	$dat1,$dat1
3514	aesd	$dat2,q9
3515	aesimc	$dat2,$dat2
3516	aesd	$dat1,q12
3517	aesimc	$dat1,$dat1
3518	aesd	$dat2,q12
3519	aesimc	$dat2,$dat2
3520	cmn	$len,#0x20
3521	aesd	$dat1,q13
3522	aesimc	$dat1,$dat1
3523	aesd	$dat2,q13
3524	aesimc	$dat2,$dat2
3525	veor	$tmp1,$iv0,$rndlast
3526	aesd	$dat1,q14
3527	aesimc	$dat1,$dat1
3528	aesd	$dat2,q14
3529	aesimc	$dat2,$dat2
3530	veor	$tmp2,$iv1,$rndlast
3531	aesd	$dat1,q15
3532	aesd	$dat2,q15
3533	b.eq	.Lxts_dec_one
3534	veor	$tmp1,$tmp1,$dat1
3535	veor	$tmp2,$tmp2,$dat2
3536	vorr	$iv0,$iv2,$iv2
3537	vorr	$iv1,$iv3,$iv3
3538	vst1.8	{$tmp1},[$out],#16
3539	vst1.8	{$tmp2},[$out],#16
3540	add	$len,$len,#16
3541	b	.Lxts_done
3542
3543.Lxts_dec_one:
3544	veor	$tmp1,$tmp1,$dat2
3545	vorr	$iv0,$iv1,$iv1
3546	vorr	$iv1,$iv2,$iv2
3547	vst1.8	{$tmp1},[$out],#16
3548	add	$len,$len,#32
3549
3550.Lxts_done:
3551	tst	$tailcnt,#0xf
3552	b.eq	.Lxts_dec_abort
3553	// Process the last two blocks with cipher stealing.
3554	mov	x7,x3				// borrow x7 to reuse key1's schedule below
3555	cbnz	x2,.Lxts_dec_1st_done
3556	vld1.32	{$dat0},[$inp],#16
3557
3558	// Decrypt the second-to-last block to get the last plaintext block
3559.Lxts_dec_1st_done:
3560	eor	$tmpin,$dat0,$iv1
3561	ldr	$rounds,[$key1,#240]
3562	vld1.32	{$dat0},[$key1],#16
3563	sub	$rounds,$rounds,#2
3564	vld1.32	{$dat1},[$key1],#16
3565.Loop_final_2nd_dec:
3566	aesd	$tmpin,$dat0
3567	aesimc	$tmpin,$tmpin
3568	vld1.32	{$dat0},[$key1],#16		// load key schedule...
3569	subs	$rounds,$rounds,#2
3570	aesd	$tmpin,$dat1
3571	aesimc	$tmpin,$tmpin
3572	vld1.32	{$dat1},[$key1],#16		// load key schedule...
3573	b.gt	.Loop_final_2nd_dec
3574
3575	aesd	$tmpin,$dat0
3576	aesimc	$tmpin,$tmpin
3577	vld1.32	{$dat0},[$key1]
3578	aesd	$tmpin,$dat1
3579	veor	$tmpin,$tmpin,$dat0
3580	veor	$tmpin,$tmpin,$iv1
3581	vst1.8	{$tmpin},[$out]
3582
3583	mov	$tmpinp,$inp
3584	add	$tmpoutp,$out,#16
3585
3586	// Splice the tailcnt unaligned ciphertext tail bytes into the block just
3587	// decrypted, building the composite block that is decrypted below.
3588.composite_dec_loop:
3589	subs	$tailcnt,$tailcnt,#1
3590	ldrb	$l2outp,[$out,$tailcnt]
3591	ldrb	$loutp,[$tmpinp,$tailcnt]
3592	strb	$l2outp,[$tmpoutp,$tailcnt]
3593	strb	$loutp,[$out,$tailcnt]
3594	b.gt	.composite_dec_loop
3595.Lxts_dec_load_done:
3596	vld1.8	{$tmpin},[$out]
3597	veor	$tmpin,$tmpin,$iv0
3598
3599	// Decrypt the composite block to get the second-to-last plaintext block
3600	ldr	$rounds,[$key_,#240]
3601	vld1.8	{$dat},[$key_],#16
3602	sub	$rounds,$rounds,#2
3603	vld1.8	{$dat1},[$key_],#16
3604.Loop_final_dec:
3605	aesd	$tmpin,$dat0
3606	aesimc	$tmpin,$tmpin
3607	vld1.32	{$dat0},[$key_],#16		// load key schedule...
3608	subs	$rounds,$rounds,#2
3609	aesd	$tmpin,$dat1
3610	aesimc	$tmpin,$tmpin
3611	vld1.32	{$dat1},[$key_],#16		// load key schedule...
3612	b.gt	.Loop_final_dec
3613
3614	aesd	$tmpin,$dat0
3615	aesimc	$tmpin,$tmpin
3616	vld1.32	{$dat0},[$key_]
3617	aesd	$tmpin,$dat1
3618	veor	$tmpin,$tmpin,$dat0
3619	veor	$tmpin,$tmpin,$iv0
3620	vst1.8	{$tmpin},[$out]
3621
3622.Lxts_dec_abort:
3623	ldp	$tailcnt,$midnumx,[sp,#48]
3624	ldp	$ivd10,$ivd20,[sp,#32]
3625	ldp	$ivd30,$ivd40,[sp,#16]
3626	ldp	$constnumx,$tmpinp,[sp],#64
3627
3628.Lxts_dec_final_abort:
3629	ret
3630.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3631___
3632}
3633}}}
3634$code.=<<___;
3635#endif
3636___
3637########################################
3638if ($flavour =~ /64/) {			######## 64-bit code
3639    my %opcode = (
3640	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
3641	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
3642
3643    local *unaes = sub {
3644	my ($mnemonic,$arg)=@_;
3645
3646	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
3647	sprintf ".inst\t0x%08x\t//%s %s",
3648			$opcode{$mnemonic}|$1|($2<<5),
3649			$mnemonic,$arg;
3650    };
3651
3652    foreach(split("\n",$code)) {
3653	s/\`([^\`]*)\`/eval($1)/geo;
3654
3655	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
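	# (added note) q0-q7 map to v0-v7 while q8 and up map to v16 and up,
	# skipping v8-v15, presumably because they are callee-saved under
	# the AArch64 PCS.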
3656	s/@\s/\/\//o;			# old->new style commentary
3657
3658	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
3659	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
3660	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
3661	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
3662	s/vext\.8/ext/o		or
3663	s/vrev32\.8/rev32/o	or
3664	s/vtst\.8/cmtst/o	or
3665	s/vshr/ushr/o		or
3666	s/^(\s+)v/$1/o		or	# strip off v prefix
3667	s/\bbx\s+lr\b/ret/o;
3668
3669	# fix up remaining legacy suffixes
3670	s/\.[ui]?8//o;
3671	m/\],#8/o and s/\.16b/\.8b/go;
3672	s/\.[ui]?32//o and s/\.16b/\.4s/go;
3673	s/\.[ui]?64//o and s/\.16b/\.2d/go;
3674	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
3675
3676	# Switch preprocessor checks to aarch64 versions.
3677	s/__ARME([BL])__/__AARCH64E$1__/go;
3678
3679	print $_,"\n";
3680    }
3681} else {				######## 32-bit code
3682    my %opcode = (
3683	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
3684	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);
3685
3686    local *unaes = sub {
3687	my ($mnemonic,$arg)=@_;
3688
3689	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3690	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3691					 |(($2&7)<<1) |(($2&8)<<2);
3692	    # ARMv7 instructions are always encoded little-endian, so the
3693	    # bytes are emitted manually; the correct solution is the .inst
3694	    # directive, but older assemblers don't implement it:-(
3695	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3696			$word&0xff,($word>>8)&0xff,
3697			($word>>16)&0xff,($word>>24)&0xff,
3698			$mnemonic,$arg;
3699	}
3700    };
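    # Worked example (added for illustration): "aese q0,q1" has d=0, m=1,
    # so $word = 0xf3b00300|(1<<1) = 0xf3b00302, emitted little-endian as
    # INST(0x02,0x03,0xb0,0xf3)	@ aese q0,q1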
3701
3702    sub unvtbl {
3703	my $arg=shift;
3704
3705	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3706	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
3707		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3708    }
3709
3710    sub unvdup32 {
3711	my $arg=shift;
3712
3713	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3714	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3715    }
3716
3717    sub unvmov32 {
3718	my $arg=shift;
3719
3720	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3721	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3722    }
3723
3724    foreach(split("\n",$code)) {
3725	s/\`([^\`]*)\`/eval($1)/geo;
3726
3727	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
3728	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
3729	s/\/\/\s?/@ /o;				# new->old style commentary
3730
3731	# fix up remaining new-style suffixes
3732	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
3733	s/\],#[0-9]+/]!/o;
3734
3735	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
3736	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
3737	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
3738	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
3739	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
3740	s/^(\s+)b\./$1b/o				or
3741	s/^(\s+)ret/$1bx\tlr/o;
3742
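	# (added note) when a conditional mov survives to here, emit the IT
	# hint required by Thumb-2 assemblers; ARM-mode assemblers accept it
	# as a no-op.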
3743	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3744	    print "	it	$2\n";
3745	}
3746
3747	print $_,"\n";
3748    }
3749}
3750
3751close STDOUT or die "error closing STDOUT: $!";
3752