#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for undertaking this effort is that at least one popular
# SoC based on Cortex-A53 lacks the crypto extensions.
#
#                   CBC enc     ECB enc/dec(*)   [bit-sliced enc/dec]
# Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
# Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
# X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
# Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
# Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
# Mongoose(***)     26.3(**)    21.0/25.0(**)    [13.3/16.8         ]
# ThunderX2(***)    39.4(**)    33.8/48.6(**)
#
# (*)	ECB denotes approximate result for parallelizable modes
#	such as CBC decrypt, CTR, etc.;
# (**)	these results are worse than scalar compiler-generated
#	code, but it's constant-time and therefore preferred;
# (***)	presented for reference/comparison purposes;
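# For reference, the C-side prototypes these entry points are expected to
# match (as declared elsewhere in OpenSSL) are roughly:
#
#	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                       size_t length, const AES_KEY *key,
#	                       unsigned char *ivec, int enc);
#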
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
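# Typical invocation (the flavour is handed to arm-xlate.pl below), e.g.:
#
#	perl vpaes-armv8.pl linux64 vpaes-armv8.S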

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code.=<<___;
#include "arm_arch.h"

.rodata

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
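// Layout note (a summary of the vpaes approach cited above): each pair of
// .quad values below is a 16-byte table indexed by a 4-bit nibble via tbl
// (vpshufb in the x86 original); SubBytes and the (inverse) MixColumns
// steps are decomposed into such low/high-nibble lookups.
//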
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz  "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
.align	6

.text

___

{
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));

$code.=<<___;
//
//  _aes_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below.
//
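//  (AArch64 note: the constant pointer lands in x10, the 0x0F mask and
//  tables in v17-v27; the %r10/%xmm names in comments throughout this file
//  are retained from the original x86_64 vpaes code for cross-reference.)
//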
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
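//  (AArch64 mapping: input in v7, output in v0, x2 holds the key schedule
//   pointer and [x2,#240] the round count; v1-v5, v16 and x8-x11 are
//   clobbered.)
//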
.type	_vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, :lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

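	// Round structure (a summary; see the vpaes paper for details):
	// .Lenc_entry performs the inversion half of SubBytes on the low and
	// high nibbles via the .Lk_inv tables, while .Lenc_loop applies the
	// output affine maps (sb1/sb2) and MixColumns through the
	// mc_forward/mc_backward shuffles, with ShiftRows folded into the
	// rotating table index kept in x11.
	//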
.align 4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b,	{$sb2t}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

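// v14-v15 input, v0-v1 output (an interleaved two-block version of
// _vpaes_encrypt_core)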
.type	_vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, :lo12:.Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0
	 and	v9.16b,  v15.16b,  v17.16b
	 ushr	v8.16b,  v15.16b,  #4
	tbl	v1.16b,  {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	 tbl	v9.16b,  {$iptlo}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b,  {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	 tbl	v10.16b, {$ipthi}, v8.16b
	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	 eor	v8.16b,  v9.16b,   v16.16b
	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	 eor	v8.16b,  v8.16b,   v10.16b
	b	.Lenc_2x_entry

.align 4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b,  {$sb1t}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	 tbl	v12.16b, {$sb1t}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b,  {$sb1u}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	 tbl	v8.16b,  {$sb1u}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	 eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b,	 {$sb2t}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	 tbl	v13.16b, {$sb2t}, v10.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	 eor	v8.16b,  v8.16b,  v12.16b
	tbl	v2.16b,  {$sb2u}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	 tbl	v10.16b, {$sb2u}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	 tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	 eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	 tbl	v8.16b,  {v8.16b}, v4.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	 eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	 tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	 eor	v8.16b,  v8.16b,  v11.16b
	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	 eor	v8.16b,  v8.16b,  v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b,  v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	 and	v9.16b,  v8.16b, v17.16b
	 ushr	v8.16b,  v8.16b, #4
	tbl	v5.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	 tbl	v13.16b, {$invhi},v9.16b
	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	 eor	v9.16b,  v9.16b,  v8.16b
	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	 tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	 tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	 eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	 eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	 tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	 tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	 eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	 eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	 tbl	v12.16b, {$sbou}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	 tbl	v8.16b,  {$sbot}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	 eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	 eor	v8.16b,  v8.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	 tbl	v1.16b,  {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v17.16b, #0x0f
	adrp	x11, .Lk_dipt
	add	x11, x11, :lo12:.Lk_dipt
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
//  Decryption core
//
//  Same API as encryption core.
//
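//  (AArch64 mapping: input in v7, output in v0, key schedule pointer in
//   x2, as for the encrypt core.)
//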
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30			// and		\$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align 4
.Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl 	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub		\$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	\$4, %r11
	eor	x11, x11, #0x30			// xor		\$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30			// and		\$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
	 and	v9.16b,  v15.16b, v17.16b
	 ushr	v8.16b,  v15.16b, #4
	tbl	v2.16b,  {$iptlo},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	 tbl	v10.16b, {$iptlo},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b,  {$ipthi},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	 tbl	v8.16b,  {$ipthi},v8.16b
	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
	 eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	 eor	v8.16b,  v8.16b,  v10.16b
	b	.Ldec_2x_entry

.align 4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b,  {$sb9u}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	 tbl	v12.16b, {$sb9u}, v10.16b
	tbl	v1.16b,  {$sb9t}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	 tbl	v9.16b,  {$sb9t}, v11.16b
	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
	 eor	v8.16b,  v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	 eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b,  {$sbdu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	 tbl	v12.16b, {$sbdu}, v10.16b
	tbl 	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	 tbl 	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {$sbdt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	 tbl	v9.16b,  {$sbdt}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	 eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	 eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b,  {$sbbu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	 tbl	v12.16b, {$sbbu}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	 tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {$sbbt}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	 tbl	v9.16b,  {$sbbt}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	 eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	 eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b,  {$sbeu}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	 tbl	v12.16b, {$sbeu}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	 tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {$sbet}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	 tbl	v9.16b,  {$sbet}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	 eor	v8.16b,  v8.16b,  v12.16b
	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr \$12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	 eor	v8.16b,  v8.16b,  v9.16b
	sub	w8, w8, #1			// sub		\$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	 and	v9.16b,  v8.16b,  v17.16b
	 ushr	v8.16b,  v8.16b,  #4
	tbl	v2.16b,  {$invhi},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	 tbl	v10.16b, {$invhi},v9.16b
	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	 eor	v9.16b,	 v9.16b,  v8.16b
	tbl	v3.16b,  {$invlo},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	 tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b,  {$invlo},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	 tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	 eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	 eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b,  {$invlo},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	 tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b,  {$invlo},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	 tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	 eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	 eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b,  {$sbou}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	 tbl	v12.16b, {$sbou}, v10.16b
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	tbl	v1.16b,  {$sbot}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	 tbl	v9.16b,  {$sbot}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	 eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	 eor	v8.16b,  v9.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	 tbl	v1.16b,  {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
___
}
{
my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

$code.=<<___;
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adrp	x11, .Lk_sb1
	add	x11, x11, :lo12:.Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adrp	x10, .Lk_dksd
	add	x10, x10, :lo12:.Lk_dksd
	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
	adrp	x11, .Lk_mc_forward
	add	x11, x11, :lo12:.Lk_mc_forward
	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [$inp],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7

	adrp	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
	add	x10, x10, :lo12:.Lk_sr
	add	x8, x8, x10
	cbnz	$dir, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30			// xor	\$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192			// cmp	\$192,	%esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
//  .schedule_128
//
//  128-bit specific part of key schedule.
//
//  This schedule is really simple, because all its parts
//  are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10			// mov	\$10, %esi

.Loop_schedule_128:
	sub	$inp, $inp, #1			// dec	%esi
	bl 	_vpaes_schedule_round
	cbz 	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b 	.Loop_schedule_128

//
//  .aes_schedule_192
//
//  192-bit specific part of key schedule.
//
//  The main body of this schedule is the same as the 128-bit
//  schedule, but with more smearing.  The long, high side is
//  stored in %xmm7 as before, and the short, low side is in
//  the high bits of %xmm6.
//
//  This schedule is somewhat nastier, however, because each
//  round produces 192 bits of key material, or 1.5 round keys.
//  Therefore, on each cycle we do 2 rounds and produce 3 round
//  keys.
//
.align	4
.Lschedule_192:
	sub	$inp, $inp, #8
	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
	mov	$inp, #4			// mov	\$4,	%esi

.Loop_schedule_192:
	sub	$inp, $inp, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz 	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
//  .aes_schedule_256
//
//  256-bit specific part of key schedule.
//
//  The structure here is very similar to the 128-bit
//  schedule, but with an additional "low side" in
//  %xmm6.  The low side's rounds are the same as the
//  high side's, except no rcon and no rotation.
//
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	$inp, #7			// mov	\$7, %esi

.Loop_schedule_256:
	sub	$inp, $inp, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz 	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7

	b	.Loop_schedule_256

//
//  .aes_schedule_mangle_last
//
//  Mangler for last round of key schedule
//  Mangles %xmm0
//    when encrypting, outputs out(%xmm0) ^ 63
//    when decrypting, outputs unskew(%xmm0)
//
//  Always called right before return... jumps to cleanup and exits
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adrp	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	add	x11, x11, :lo12:.Lk_deskew
	cbnz	$dir, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adrp	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
	add	x11, x11, :lo12:.Lk_opt
	add	$out, $out, #32			// add	\$32,	%rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
	sub	$out, $out, #16			// add	\$-16,	%rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0,	(%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
//  .aes_schedule_192_smear
//
//  Smear the short, low side in the 192-bit key schedule.
//
//  Inputs:
//    %xmm7: high side, b  a  x  y
//    %xmm6:  low side, d  c  0  0
//    %xmm13: 0
//
//  Outputs:
//    %xmm6: b+c+d  b+c  0  0
//    %xmm0: b+c+d  b+c  b  a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
//  .aes_schedule_round
//
//  Runs one main round of the key schedule on %xmm0, %xmm7
//
//  Specifically, runs subbytes on the high dword of %xmm0
//  then rotates it by one byte and xors into the low dword of
//  %xmm7.
//
//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
//  next rcon.
//
//  Smears the dwords of %xmm7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in %xmm7 = %xmm0.
//  Clobbers %xmm1-%xmm4, %r11.
//
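//  (AArch64: the round state lives in v7/v0, the rcon vector in v8 and
//   the 0x5B constant (.Lk_s63) in v16; v1-v4 are clobbered.)
//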
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
	 eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	 eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

//
//  .aes_schedule_transform
//
//  Linear-transform %xmm0 according to tables at (%r11)
//
//  Requires that %xmm9 = 0x0F0F... as in preheat
//  Output in %xmm0
//  Clobbers %xmm1, %xmm2
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4,	%xmm0,	%xmm0
						// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

//
//  .aes_schedule_mangle
//
//  Mangle xmm0 from (basis-transformed) standard version
//  to our version.
//
//  On encrypt,
//    xor with 0x63
//    multiply by circulant 0,1,1,1
//    apply shiftrows transform
//
//  On decrypt,
//    xor with 0x63
//    multiply by "inverse mixcolumns" circulant E,B,D,9
//    deskew
//    apply shiftrows transform
//
//
//  Writes out to (%rdx), and increments or decrements it
//  Keeps track of round number mod 4 in %r8
//  Preserves xmm0
//  Clobbers xmm1-xmm5
//
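//  (AArch64: writes through x2, keeps the round number mod 4 in x8, and
//   uses v9 = .Lk_mc_forward[0] and v16 = .Lk_s63 loaded by the preheat.)
//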
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	$out, $out, #16			// add	\$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

						// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
						// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3

						// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
						// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	$out, $out, #16			// add	\$-16,	%rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	add	x8, x8, #64-16			// add	\$-16,	%r8
	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr	\$5,%eax
	add	w9, w9, #5		// \$5,%eax
	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
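	// e.g. 128-bit keys store 128/32+5 = 9 and 256-bit keys 256/32+5 = 13,
	// i.e. one less than the textbook round count; the encrypt/decrypt
	// cores run that many "middle" rounds before the final round.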

	mov	$dir, #0		// mov	\$0,%ecx
	mov	x8, #0x30		// mov	\$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr	\$5,%eax
	add	w9, w9, #5		// \$5,%eax
	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	\$4,%eax
	add	$out, $out, #16		// lea	16(%rdx,%rax),%rdx
	add	$out, $out, x9

	mov	$dir, #1		// mov	\$1,%ecx
	lsr	w8, $bits, #1		// shr	\$1,%r8d
	and	x8, x8, #32		// and	\$32,%r8d
	eor	x8, x8, #32		// xor	\$32,%r8d	# nbits==192?0:32
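	// i.e. x8 = 0 for 192-bit keys and 32 otherwise (192>>1 has bit 5 set,
	// 128>>1 and 256>>1 do not, and the xor flips it); this picks the
	// starting .Lk_sr row used when mangling the round keys.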
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
___
}
{
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));

$code.=<<___;
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	$len, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign

	ld1	{v0.16b}, [$ivec]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [$inp],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [$ivec]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	ld1	{v6.16b}, [$ivec]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [$inp], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [$out], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [$ivec]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
___
if (1) {
$code.=<<___;
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
___
}	}
print $code;

close STDOUT or die "error closing STDOUT: $!";
