1#! /usr/bin/env perl
2# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# This module implements SM4 with ASIMD on aarch64
11#
12# Feb 2022
13#
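# The 256-byte SM4 sbox is kept resident in NEON registers v16-v31 and
# byte substitution is done with tbl/tbx lookups; the round function is
# provided in 1-, 4- and 8-block variants.
#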
14
15# $output is the last argument if it looks like a file (it has an extension)
16# $flavour is the first argument if it doesn't look like a file
17$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
19
20$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23die "can't locate arm-xlate.pl";
24
25open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26    or die "can't call $xlate: $!";
27*STDOUT=*OUT;
28
29$prefix="vpsm4";
30my @vtmp=map("v$_",(0..3));
31my @qtmp=map("q$_",(0..3));
32my @data=map("v$_",(4..7));
33my @datax=map("v$_",(8..11));
34my ($rk0,$rk1)=("v12","v13");
35my ($rka,$rkb)=("v14","v15");
36my @vtmpx=map("v$_",(12..15));
37my @sbox=map("v$_",(16..31));
38my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
39my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
40my ($xtmp1,$xtmp2)=("x8","x9");
41my ($ptr,$counter)=("x10","w11");
42my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
43
44sub rev32() {
45	my $dst = shift;
46	my $src = shift;
47
48	if ($src and ("$src" ne "$dst")) {
49$code.=<<___;
50#ifndef __AARCH64EB__
51	rev32	$dst.16b,$src.16b
52#else
53	mov	$dst.16b,$src.16b
54#endif
55___
56	} else {
57$code.=<<___;
58#ifndef __AARCH64EB__
59	rev32	$dst.16b,$dst.16b
60#endif
61___
62	}
63}
64
65sub rev32_armeb() {
66	my $dst = shift;
67	my $src = shift;
68
69	if ($src and ("$src" ne "$dst")) {
70$code.=<<___;
71#ifdef __AARCH64EB__
72	rev32	$dst.16b,$src.16b
73#else
74	mov	$dst.16b,$src.16b
75#endif
76___
77	} else {
78$code.=<<___;
79#ifdef __AARCH64EB__
80	rev32	$dst.16b,$dst.16b
81#endif
82___
83	}
84}
85
86sub rbit() {
87	my $dst = shift;
88	my $src = shift;
89	my $std = shift;
90
91	if ($src and ("$src" ne "$dst")) {
92		if ($std eq "_gb") {
93$code.=<<___;
94			rbit $dst.16b,$src.16b
95___
96		} else {
97$code.=<<___;
98			mov $dst.16b,$src.16b
99___
100		}
101	} else {
102		if ($std eq "_gb") {
103$code.=<<___;
104			rbit $dst.16b,$src.16b
105___
106		}
107	}
108}
109
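# 4x4 transpose of 32-bit words across four vectors, using zip1/zip2
# first at .4s and then at .2d granularity.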
110sub transpose() {
111	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
112
113$code.=<<___;
114	zip1	$vt0.4s,$dat0.4s,$dat1.4s
115	zip2	$vt1.4s,$dat0.4s,$dat1.4s
116	zip1	$vt2.4s,$dat2.4s,$dat3.4s
117	zip2	$vt3.4s,$dat2.4s,$dat3.4s
118	zip1	$dat0.2d,$vt0.2d,$vt2.2d
119	zip2	$dat1.2d,$vt0.2d,$vt2.2d
120	zip1	$dat2.2d,$vt1.2d,$vt3.2d
121	zip2	$dat3.2d,$vt1.2d,$vt3.2d
122___
123}
124
125# sbox operation for 4 lanes of words
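# The byte indices are split into four 64-byte ranges (offsets 0, 64, 128
# and 192); each range is looked up with tbl against four of the sixteen
# sbox registers, out-of-range indices return zero, and the partial
# results are summed.  The result is then fed through the SM4 linear
# transform L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24),
# built from ushr/sli rotate pairs.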
126sub sbox() {
127	my $dat = shift;
128
129$code.=<<___;
130	movi	@vtmp[0].16b,#64
131	movi	@vtmp[1].16b,#128
132	movi	@vtmp[2].16b,#192
133	sub	@vtmp[0].16b,$dat.16b,@vtmp[0].16b
134	sub	@vtmp[1].16b,$dat.16b,@vtmp[1].16b
135	sub	@vtmp[2].16b,$dat.16b,@vtmp[2].16b
136	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
137	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
138	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
139	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
140	add	@vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
141	add	@vtmp[2].2d,@vtmp[2].2d,$dat.2d
142	add	$dat.2d,@vtmp[0].2d,@vtmp[2].2d
143
144	ushr	@vtmp[0].4s,$dat.4s,32-2
145	sli	@vtmp[0].4s,$dat.4s,2
146	ushr	@vtmp[2].4s,$dat.4s,32-10
147	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
148	sli	@vtmp[2].4s,$dat.4s,10
149	eor	@vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
150	ushr	@vtmp[0].4s,$dat.4s,32-18
151	sli	@vtmp[0].4s,$dat.4s,18
152	ushr	@vtmp[2].4s,$dat.4s,32-24
153	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
154	sli	@vtmp[2].4s,$dat.4s,24
155	eor	$dat.16b,@vtmp[2].16b,@vtmp[1].16b
156___
157}
158
159# sbox operation for 8 lanes of words
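# Same lookup and linear transform as sbox(), applied to two input
# vectors with the instructions interleaved.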
160sub sbox_double() {
161	my $dat = shift;
162	my $datx = shift;
163
164$code.=<<___;
165	movi	@vtmp[3].16b,#64
166	sub	@vtmp[0].16b,$dat.16b,@vtmp[3].16b
167	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
168	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
169	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
170	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
171	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
172	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
173	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
174	add	$dat.2d,@vtmp[2].2d,$dat.2d
175	add	$dat.2d,@vtmp[1].2d,$dat.2d
176
177	sub	@vtmp[0].16b,$datx.16b,@vtmp[3].16b
178	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
179	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
180	tbl	$datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
181	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
182	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
183	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
184	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
185	add	$datx.2d,@vtmp[2].2d,$datx.2d
186	add	$datx.2d,@vtmp[1].2d,$datx.2d
187
188	ushr	@vtmp[0].4s,$dat.4s,32-2
189	sli	@vtmp[0].4s,$dat.4s,2
190	ushr	@vtmp[2].4s,$datx.4s,32-2
191	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
192	sli	@vtmp[2].4s,$datx.4s,2
193
194	ushr	@vtmp[0].4s,$dat.4s,32-10
195	eor	@vtmp[3].16b,@vtmp[2].16b,$datx.16b
196	sli	@vtmp[0].4s,$dat.4s,10
197	ushr	@vtmp[2].4s,$datx.4s,32-10
198	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
199	sli	@vtmp[2].4s,$datx.4s,10
200
201	ushr	@vtmp[0].4s,$dat.4s,32-18
202	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
203	sli	@vtmp[0].4s,$dat.4s,18
204	ushr	@vtmp[2].4s,$datx.4s,32-18
205	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
206	sli	@vtmp[2].4s,$datx.4s,18
207
208	ushr	@vtmp[0].4s,$dat.4s,32-24
209	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
210	sli	@vtmp[0].4s,$dat.4s,24
211	ushr	@vtmp[2].4s,$datx.4s,32-24
212	eor	$dat.16b,@vtmp[0].16b,@vtmp[1].16b
213	sli	@vtmp[2].4s,$datx.4s,24
214	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
215___
216}
217
218# sbox operation for a single word
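# The word is placed in lane 0 of a vector for the tbl lookups; the
# linear transform is then applied in scalar registers with eor/ror
# (ror #32-n is a rotate left by n).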
219sub sbox_1word () {
220	my $word = shift;
221
222$code.=<<___;
223	movi	@vtmp[1].16b,#64
224	movi	@vtmp[2].16b,#128
225	movi	@vtmp[3].16b,#192
226	mov	@vtmp[0].s[0],$word
227
228	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
229	sub	@vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
230	sub	@vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
231
232	tbl	@vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
233	tbl	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
234	tbl	@vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
235	tbl	@vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
236
237	mov	$word,@vtmp[0].s[0]
238	mov	$wtmp0,@vtmp[1].s[0]
239	mov	$wtmp2,@vtmp[2].s[0]
240	add	$wtmp0,$word,$wtmp0
241	mov	$word,@vtmp[3].s[0]
242	add	$wtmp0,$wtmp0,$wtmp2
243	add	$wtmp0,$wtmp0,$word
244
245	eor	$word,$wtmp0,$wtmp0,ror #32-2
246	eor	$word,$word,$wtmp0,ror #32-10
247	eor	$word,$word,$wtmp0,ror #32-18
248	eor	$word,$word,$wtmp0,ror #32-24
249___
250}
251
252# sm4 for one block of data, in scalar registers word0/word1/word2/word3
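# One call performs four SM4 rounds; the two ldp loads fetch four
# round-key words, so callers loop eight times for the full 32 rounds.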
253sub sm4_1blk () {
254	my $kptr = shift;
255
256$code.=<<___;
257	ldp	$wtmp0,$wtmp1,[$kptr],8
258	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
259	eor	$tmpw,$word2,$word3
260	eor	$wtmp2,$wtmp0,$word1
261	eor	$tmpw,$tmpw,$wtmp2
262___
263	&sbox_1word($tmpw);
264$code.=<<___;
265	eor	$word0,$word0,$tmpw
266	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
267	eor	$tmpw,$word2,$word3
268	eor	$wtmp2,$word0,$wtmp1
269	eor	$tmpw,$tmpw,$wtmp2
270___
271	&sbox_1word($tmpw);
272$code.=<<___;
273	ldp	$wtmp0,$wtmp1,[$kptr],8
274	eor	$word1,$word1,$tmpw
275	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
276	eor	$tmpw,$word0,$word1
277	eor	$wtmp2,$wtmp0,$word3
278	eor	$tmpw,$tmpw,$wtmp2
279___
280	&sbox_1word($tmpw);
281$code.=<<___;
282	eor	$word2,$word2,$tmpw
283	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
284	eor	$tmpw,$word0,$word1
285	eor	$wtmp2,$word2,$wtmp1
286	eor	$tmpw,$tmpw,$wtmp2
287___
288	&sbox_1word($tmpw);
289$code.=<<___;
290	eor	$word3,$word3,$tmpw
291___
292}
293
294# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
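# As in sm4_1blk, one call performs four rounds; each round-key word is
# broadcast to all four lanes with dup.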
295sub sm4_4blks () {
296	my $kptr = shift;
297
298$code.=<<___;
299	ldp	$wtmp0,$wtmp1,[$kptr],8
300	dup	$rk0.4s,$wtmp0
301	dup	$rk1.4s,$wtmp1
302
303	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
304	eor	$rka.16b,@data[2].16b,@data[3].16b
305	eor	$rk0.16b,@data[1].16b,$rk0.16b
306	eor	$rk0.16b,$rka.16b,$rk0.16b
307___
308	&sbox($rk0);
309$code.=<<___;
310	eor	@data[0].16b,@data[0].16b,$rk0.16b
311
312	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
313	eor	$rka.16b,$rka.16b,@data[0].16b
314	eor	$rk1.16b,$rka.16b,$rk1.16b
315___
316	&sbox($rk1);
317$code.=<<___;
318	ldp	$wtmp0,$wtmp1,[$kptr],8
319	eor	@data[1].16b,@data[1].16b,$rk1.16b
320
321	dup	$rk0.4s,$wtmp0
322	dup	$rk1.4s,$wtmp1
323
324	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
325	eor	$rka.16b,@data[0].16b,@data[1].16b
326	eor	$rk0.16b,@data[3].16b,$rk0.16b
327	eor	$rk0.16b,$rka.16b,$rk0.16b
328___
329	&sbox($rk0);
330$code.=<<___;
331	eor	@data[2].16b,@data[2].16b,$rk0.16b
332
333	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
334	eor	$rka.16b,$rka.16b,@data[2].16b
335	eor	$rk1.16b,$rka.16b,$rk1.16b
336___
337	&sbox($rk1);
338$code.=<<___;
339	eor	@data[3].16b,@data[3].16b,$rk1.16b
340___
341}
342
343# sm4 for 8 lanes of data, in neon registers
344# data0/data1/data2/data3 datax0/datax1/datax2/datax3
345sub sm4_8blks () {
346	my $kptr = shift;
347
348$code.=<<___;
349	ldp	$wtmp0,$wtmp1,[$kptr],8
350	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
351	dup	$rk0.4s,$wtmp0
352	eor	$rka.16b,@data[2].16b,@data[3].16b
353	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
354	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
355	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
356	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
357	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
358___
359	&sbox_double($rk0,$rk1);
360$code.=<<___;
361	eor	@data[0].16b,@data[0].16b,$rk0.16b
362	eor	@datax[0].16b,@datax[0].16b,$rk1.16b
363
364	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
365	dup	$rk1.4s,$wtmp1
366	eor	$rka.16b,$rka.16b,@data[0].16b
367	eor	$rkb.16b,$rkb.16b,@datax[0].16b
368	eor	$rk0.16b,$rka.16b,$rk1.16b
369	eor	$rk1.16b,$rkb.16b,$rk1.16b
370___
371	&sbox_double($rk0,$rk1);
372$code.=<<___;
373	ldp	$wtmp0,$wtmp1,[$kptr],8
374	eor	@data[1].16b,@data[1].16b,$rk0.16b
375	eor	@datax[1].16b,@datax[1].16b,$rk1.16b
376
377	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
378	dup	$rk0.4s,$wtmp0
379	eor	$rka.16b,@data[0].16b,@data[1].16b
380	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
381	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
382	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
383	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
384	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
385___
386	&sbox_double($rk0,$rk1);
387$code.=<<___;
388	eor	@data[2].16b,@data[2].16b,$rk0.16b
389	eor	@datax[2].16b,@datax[2].16b,$rk1.16b
390
391	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
392	dup	$rk1.4s,$wtmp1
393	eor	$rka.16b,$rka.16b,@data[2].16b
394	eor	$rkb.16b,$rkb.16b,@datax[2].16b
395	eor	$rk0.16b,$rka.16b,$rk1.16b
396	eor	$rk1.16b,$rkb.16b,$rk1.16b
397___
398	&sbox_double($rk0,$rk1);
399$code.=<<___;
400	eor	@data[3].16b,@data[3].16b,$rk0.16b
401	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
402___
403}
404
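# Encrypt one block held in a vector register.  The final reverse
# transform R of SM4 is applied by writing the state words back in
# reverse order; byte-order reversal (rev32) is left to the caller.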
405sub encrypt_1blk_norev() {
406	my $dat = shift;
407
408$code.=<<___;
409	mov	$ptr,$rks
410	mov	$counter,#8
411	mov	$word0,$dat.s[0]
412	mov	$word1,$dat.s[1]
413	mov	$word2,$dat.s[2]
414	mov	$word3,$dat.s[3]
41510:
416___
417	&sm4_1blk($ptr);
418$code.=<<___;
419	subs	$counter,$counter,#1
420	b.ne	10b
421	mov	$dat.s[0],$word3
422	mov	$dat.s[1],$word2
423	mov	$dat.s[2],$word1
424	mov	$dat.s[3],$word0
425___
426}
427
428sub encrypt_1blk() {
429	my $dat = shift;
430
431	&encrypt_1blk_norev($dat);
432	&rev32($dat,$dat);
433}
434
435sub encrypt_4blks() {
436$code.=<<___;
437	mov	$ptr,$rks
438	mov	$counter,#8
43910:
440___
441	&sm4_4blks($ptr);
442$code.=<<___;
443	subs	$counter,$counter,#1
444	b.ne	10b
445___
446	&rev32(@vtmp[3],@data[0]);
447	&rev32(@vtmp[2],@data[1]);
448	&rev32(@vtmp[1],@data[2]);
449	&rev32(@vtmp[0],@data[3]);
450}
451
452sub encrypt_8blks() {
453$code.=<<___;
454	mov	$ptr,$rks
455	mov	$counter,#8
45610:
457___
458	&sm4_8blks($ptr);
459$code.=<<___;
460	subs	$counter,$counter,#1
461	b.ne	10b
462___
463	&rev32(@vtmp[3],@data[0]);
464	&rev32(@vtmp[2],@data[1]);
465	&rev32(@vtmp[1],@data[2]);
466	&rev32(@vtmp[0],@data[3]);
467	&rev32(@data[3],@datax[0]);
468	&rev32(@data[2],@datax[1]);
469	&rev32(@data[1],@datax[2]);
470	&rev32(@data[0],@datax[3]);
471}
472
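# Load the 256-byte sbox from .Lsbox into v16-v31, 16 bytes per register.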
473sub load_sbox () {
474	my $data = shift;
475
476$code.=<<___;
477	adrp	$ptr,.Lsbox
478	add	$ptr,$ptr,#:lo12:.Lsbox
479	ld1	{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
480	ld1	{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
481	ld1	{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
482	ld1	{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
483___
484}
485
486
487sub mov_reg_to_vec() {
488	my $src0 = shift;
489	my $src1 = shift;
490	my $desv = shift;
491$code.=<<___;
492	mov $desv.d[0],$src0
493	mov $desv.d[1],$src1
494___
495	&rev32_armeb($desv,$desv);
496}
497
498sub mov_vec_to_reg() {
499	my $srcv = shift;
500	my $des0 = shift;
501	my $des1 = shift;
502$code.=<<___;
503	mov $des0,$srcv.d[0]
504	mov $des1,$srcv.d[1]
505___
506}
507
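# XTS tweak update in scalar registers: multiply the 128-bit tweak
# (low half in src0, high half in src1) by x in GF(2^128), i.e. shift
# left by one bit and xor 0x87 into the low byte when the top bit was set.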
508sub compute_tweak() {
509	my $src0 = shift;
510	my $src1 = shift;
511	my $des0 = shift;
512	my $des1 = shift;
513$code.=<<___;
514	mov $wtmp0,0x87
515	extr	$xtmp2,$src1,$src1,#32
516	extr	$des1,$src1,$src0,#63
517	and	$wtmp1,$wtmp0,$wtmp2,asr#31
518	eor	$des0,$xtmp1,$src0,lsl#1
519___
520}
521
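# Same tweak update on a NEON register: shl shifts each byte left by one,
# then the bits shifted out of each byte are carried into the next byte
# (and the final carry reduced with 0x87) using ext/ushr/mul with the
# .Lxts_magic constant; for the "_gb" variant the bytes are bit-reversed
# with rbit before and after the multiplication.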
522sub compute_tweak_vec() {
523	my $src = shift;
524	my $des = shift;
525	my $std = shift;
526	&rbit(@vtmp[2],$src,$std);
527$code.=<<___;
528	adrp $ptr,.Lxts_magic
529	ldr  @qtmp[0], [$ptr, #:lo12:.Lxts_magic]
530	shl  $des.16b, @vtmp[2].16b, #1
531	ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
532	ushr @vtmp[1].16b, @vtmp[1].16b, #7
533	mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
534	eor  $des.16b, $des.16b, @vtmp[1].16b
535___
536	&rbit($des,$des,$std);
537}
538
539$code=<<___;
540#include "arm_arch.h"
541.arch	armv8-a
542.text
543
544.rodata
545.type	_${prefix}_consts,%object
546.align	7
547_${prefix}_consts:
548.Lsbox:
549	.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
550	.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
551	.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
552	.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
553	.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
554	.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
555	.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
556	.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
557	.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
558	.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
559	.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
560	.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
561	.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
562	.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
563	.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
564	.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
565.Lck:
566	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
567	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
568	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
569	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
570	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
571	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
572	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
573	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
574.Lfk:
575	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
576.Lshuffles:
577	.quad 0x0B0A090807060504,0x030201000F0E0D0C
578.Lxts_magic:
579	.quad 0x0101010101010187,0x0101010101010101
580
581.size	_${prefix}_consts,.-_${prefix}_consts
582
583.previous
584
585___
586
587{{{
588my ($key,$keys,$enc)=("x0","x1","w2");
589my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
590my ($vkey,$vfk,$vmap)=("v5","v6","v7");
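# Key schedule: the user key is xored with FK, then the 32 round keys are
# derived using the CK constants, the sbox (via tbl/tbx) and the
# key-schedule linear transform B ^ (B<<<13) ^ (B<<<23).  Encryption keys
# are stored in order, decryption keys in reverse order.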
591$code.=<<___;
592.type	_vpsm4_set_key,%function
593.align	4
594_vpsm4_set_key:
595	AARCH64_VALID_CALL_TARGET
596	ld1	{$vkey.4s},[$key]
597___
598	&load_sbox();
599	&rev32($vkey,$vkey);
600$code.=<<___;
601	adrp	$pointer,.Lshuffles
602	add	$pointer,$pointer,#:lo12:.Lshuffles
603	ld1	{$vmap.2d},[$pointer]
604	adrp	$pointer,.Lfk
605	add	$pointer,$pointer,#:lo12:.Lfk
606	ld1	{$vfk.2d},[$pointer]
607	eor	$vkey.16b,$vkey.16b,$vfk.16b
608	mov	$schedules,#32
609	adrp	$pointer,.Lck
610	add	$pointer,$pointer,#:lo12:.Lck
611	movi	@vtmp[0].16b,#64
612	cbnz	$enc,1f
613	add	$keys,$keys,124
6141:
615	mov	$wtmp,$vkey.s[1]
616	ldr	$roundkey,[$pointer],#4
617	eor	$roundkey,$roundkey,$wtmp
618	mov	$wtmp,$vkey.s[2]
619	eor	$roundkey,$roundkey,$wtmp
620	mov	$wtmp,$vkey.s[3]
621	eor	$roundkey,$roundkey,$wtmp
622	// sbox lookup
623	mov	@data[0].s[0],$roundkey
624	tbl	@vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
625	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
626	tbx	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
627	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
628	tbx	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
629	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
630	tbx	@vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
631	mov	$wtmp,@vtmp[1].s[0]
632	eor	$roundkey,$wtmp,$wtmp,ror #19
633	eor	$roundkey,$roundkey,$wtmp,ror #9
634	mov	$wtmp,$vkey.s[0]
635	eor	$roundkey,$roundkey,$wtmp
636	mov	$vkey.s[0],$roundkey
637	cbz	$enc,2f
638	str	$roundkey,[$keys],#4
639	b	3f
6402:
641	str	$roundkey,[$keys],#-4
6423:
643	tbl	$vkey.16b,{$vkey.16b},$vmap.16b
644	subs	$schedules,$schedules,#1
645	b.ne	1b
646	ret
647.size	_vpsm4_set_key,.-_vpsm4_set_key
648___
649}}}
650
651
652{{{
653$code.=<<___;
654.type	_vpsm4_enc_4blks,%function
655.align	4
656_vpsm4_enc_4blks:
657	AARCH64_VALID_CALL_TARGET
658___
659	&encrypt_4blks();
660$code.=<<___;
661	ret
662.size	_vpsm4_enc_4blks,.-_vpsm4_enc_4blks
663___
664}}}
665
666{{{
667$code.=<<___;
668.type	_vpsm4_enc_8blks,%function
669.align	4
670_vpsm4_enc_8blks:
671	AARCH64_VALID_CALL_TARGET
672___
673	&encrypt_8blks();
674$code.=<<___;
675	ret
676.size	_vpsm4_enc_8blks,.-_vpsm4_enc_8blks
677___
678}}}
679
680
681{{{
682my ($key,$keys)=("x0","x1");
683$code.=<<___;
684.globl	${prefix}_set_encrypt_key
685.type	${prefix}_set_encrypt_key,%function
686.align	5
687${prefix}_set_encrypt_key:
688	AARCH64_SIGN_LINK_REGISTER
689	stp	x29,x30,[sp,#-16]!
690	mov	w2,1
691	bl	_vpsm4_set_key
692	ldp	x29,x30,[sp],#16
693	AARCH64_VALIDATE_LINK_REGISTER
694	ret
695.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
696___
697}}}
698
699{{{
700my ($key,$keys)=("x0","x1");
701$code.=<<___;
702.globl	${prefix}_set_decrypt_key
703.type	${prefix}_set_decrypt_key,%function
704.align	5
705${prefix}_set_decrypt_key:
706	AARCH64_SIGN_LINK_REGISTER
707	stp	x29,x30,[sp,#-16]!
708	mov	w2,0
709	bl	_vpsm4_set_key
710	ldp	x29,x30,[sp],#16
711	AARCH64_VALIDATE_LINK_REGISTER
712	ret
713.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
714___
715}}}
716
717{{{
718sub gen_block () {
719	my $dir = shift;
720	my ($inp,$outp,$rk)=map("x$_",(0..2));
721
722$code.=<<___;
723.globl	${prefix}_${dir}crypt
724.type	${prefix}_${dir}crypt,%function
725.align	5
726${prefix}_${dir}crypt:
727	AARCH64_VALID_CALL_TARGET
728	ld1	{@data[0].4s},[$inp]
729___
730	&load_sbox();
731	&rev32(@data[0],@data[0]);
732$code.=<<___;
733	mov	$rks,x2
734___
735	&encrypt_1blk(@data[0]);
736$code.=<<___;
737	st1	{@data[0].4s},[$outp]
738	ret
739.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
740___
741}
742&gen_block("en");
743&gen_block("de");
744}}}
745
746{{{
747my ($enc) = ("w4");
748my @dat=map("v$_",(16..23));
749
750$code.=<<___;
751.globl	${prefix}_ecb_encrypt
752.type	${prefix}_ecb_encrypt,%function
753.align	5
754${prefix}_ecb_encrypt:
755	AARCH64_SIGN_LINK_REGISTER
756	// convert length into blocks
757	lsr	x2,x2,4
758	stp	d8,d9,[sp,#-80]!
759	stp	d10,d11,[sp,#16]
760	stp	d12,d13,[sp,#32]
761	stp	d14,d15,[sp,#48]
762	stp	x29,x30,[sp,#64]
763___
764	&load_sbox();
765$code.=<<___;
766.Lecb_8_blocks_process:
767	cmp	$blocks,#8
768	b.lt	.Lecb_4_blocks_process
769	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
770	ld4	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
771___
772	&rev32(@data[0],@data[0]);
773	&rev32(@data[1],@data[1]);
774	&rev32(@data[2],@data[2]);
775	&rev32(@data[3],@data[3]);
776	&rev32(@datax[0],@datax[0]);
777	&rev32(@datax[1],@datax[1]);
778	&rev32(@datax[2],@datax[2]);
779	&rev32(@datax[3],@datax[3]);
780$code.=<<___;
781	bl	_vpsm4_enc_8blks
782	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
783	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
784	subs	$blocks,$blocks,#8
785	b.gt	.Lecb_8_blocks_process
786	b	100f
787.Lecb_4_blocks_process:
788	cmp	$blocks,#4
789	b.lt	1f
790	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
791___
792	&rev32(@data[0],@data[0]);
793	&rev32(@data[1],@data[1]);
794	&rev32(@data[2],@data[2]);
795	&rev32(@data[3],@data[3]);
796$code.=<<___;
797	bl	_vpsm4_enc_4blks
798	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
799	sub	$blocks,$blocks,#4
8001:
801	// process last block
802	cmp	$blocks,#1
803	b.lt	100f
804	b.gt	1f
805	ld1	{@data[0].4s},[$inp]
806___
807	&rev32(@data[0],@data[0]);
808	&encrypt_1blk(@data[0]);
809$code.=<<___;
810	st1	{@data[0].4s},[$outp]
811	b	100f
8121:	// process last 2 blocks
813	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
814	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
815	cmp	$blocks,#2
816	b.gt	1f
817___
818	&rev32(@data[0],@data[0]);
819	&rev32(@data[1],@data[1]);
820	&rev32(@data[2],@data[2]);
821	&rev32(@data[3],@data[3]);
822$code.=<<___;
823	bl	_vpsm4_enc_4blks
824	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
825	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
826	b	100f
8271:	// process last 3 blocks
828	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
829___
830	&rev32(@data[0],@data[0]);
831	&rev32(@data[1],@data[1]);
832	&rev32(@data[2],@data[2]);
833	&rev32(@data[3],@data[3]);
834$code.=<<___;
835	bl	_vpsm4_enc_4blks
836	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
837	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
838	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
839100:
840	ldp	d10,d11,[sp,#16]
841	ldp	d12,d13,[sp,#32]
842	ldp	d14,d15,[sp,#48]
843	ldp	x29,x30,[sp,#64]
844	ldp	d8,d9,[sp],#80
845	AARCH64_VALIDATE_LINK_REGISTER
846	ret
847.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
848___
849}}}
850
851{{{
852my ($len,$ivp,$enc)=("x2","x4","w5");
853my $ivec0=("v3");
854my $ivec1=("v15");
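# CBC encryption is serial: each block is encrypted individually with
# encrypt_1blk_norev and chained into the next.  CBC decryption is
# parallel and uses the 8-block/4-block enc paths plus a transpose.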
855
856$code.=<<___;
857.globl	${prefix}_cbc_encrypt
858.type	${prefix}_cbc_encrypt,%function
859.align	5
860${prefix}_cbc_encrypt:
861	AARCH64_VALID_CALL_TARGET
862	lsr	$len,$len,4
863___
864	&load_sbox();
865$code.=<<___;
866	cbz	$enc,.Ldec
867	ld1	{$ivec0.4s},[$ivp]
868.Lcbc_4_blocks_enc:
869	cmp	$blocks,#4
870	b.lt	1f
871	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
872	eor	@data[0].16b,@data[0].16b,$ivec0.16b
873___
874	&rev32(@data[1],@data[1]);
875	&rev32(@data[0],@data[0]);
876	&rev32(@data[2],@data[2]);
877	&rev32(@data[3],@data[3]);
878	&encrypt_1blk_norev(@data[0]);
879$code.=<<___;
880	eor	@data[1].16b,@data[1].16b,@data[0].16b
881___
882	&encrypt_1blk_norev(@data[1]);
883	&rev32(@data[0],@data[0]);
884
885$code.=<<___;
886	eor	@data[2].16b,@data[2].16b,@data[1].16b
887___
888	&encrypt_1blk_norev(@data[2]);
889	&rev32(@data[1],@data[1]);
890$code.=<<___;
891	eor	@data[3].16b,@data[3].16b,@data[2].16b
892___
893	&encrypt_1blk_norev(@data[3]);
894	&rev32(@data[2],@data[2]);
895	&rev32(@data[3],@data[3]);
896$code.=<<___;
897	orr	$ivec0.16b,@data[3].16b,@data[3].16b
898	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
899	subs	$blocks,$blocks,#4
900	b.ne	.Lcbc_4_blocks_enc
901	b	2f
9021:
903	subs	$blocks,$blocks,#1
904	b.lt	2f
905	ld1	{@data[0].4s},[$inp],#16
906	eor	$ivec0.16b,$ivec0.16b,@data[0].16b
907___
908	&rev32($ivec0,$ivec0);
909	&encrypt_1blk($ivec0);
910$code.=<<___;
911	st1	{$ivec0.4s},[$outp],#16
912	b	1b
9132:
914	// save back IV
915	st1	{$ivec0.4s},[$ivp]
916	ret
917
918.Ldec:
919	// decryption mode starts
920	AARCH64_SIGN_LINK_REGISTER
921	stp	d8,d9,[sp,#-80]!
922	stp	d10,d11,[sp,#16]
923	stp	d12,d13,[sp,#32]
924	stp	d14,d15,[sp,#48]
925	stp	x29,x30,[sp,#64]
926.Lcbc_8_blocks_dec:
927	cmp	$blocks,#8
928	b.lt	1f
929	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
930	add	$ptr,$inp,#64
931	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
932___
933	&rev32(@data[0],@data[0]);
934	&rev32(@data[1],@data[1]);
935	&rev32(@data[2],@data[2]);
936	&rev32(@data[3],$data[3]);
937	&rev32(@datax[0],@datax[0]);
938	&rev32(@datax[1],@datax[1]);
939	&rev32(@datax[2],@datax[2]);
940	&rev32(@datax[3],$datax[3]);
941$code.=<<___;
942	bl	_vpsm4_enc_8blks
943___
944	&transpose(@vtmp,@datax);
945	&transpose(@data,@datax);
946$code.=<<___;
947	ld1	{$ivec1.4s},[$ivp]
948	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
949	// note that ivec1 and vtmpx[3] reuse the same register,
950	// so care must be taken to avoid a conflict
951	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
952	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
953	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
954	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
955	eor	@vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
956	// save back IV
957	st1	{$vtmpx[3].4s}, [$ivp]
958	eor	@data[0].16b,@data[0].16b,$datax[3].16b
959	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b
960	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b
961	eor	@data[3].16b,$data[3].16b,@vtmpx[2].16b
962	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
963	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
964	subs	$blocks,$blocks,#8
965	b.gt	.Lcbc_8_blocks_dec
966	b.eq	100f
9671:
968	ld1	{$ivec1.4s},[$ivp]
969.Lcbc_4_blocks_dec:
970	cmp	$blocks,#4
971	b.lt	1f
972	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
973___
974	&rev32(@data[0],@data[0]);
975	&rev32(@data[1],@data[1]);
976	&rev32(@data[2],@data[2]);
977	&rev32(@data[3],$data[3]);
978$code.=<<___;
979	bl	_vpsm4_enc_4blks
980	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
981___
982	&transpose(@vtmp,@datax);
983$code.=<<___;
984	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
985	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
986	orr	$ivec1.16b,@data[3].16b,@data[3].16b
987	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
988	eor	@vtmp[3].16b,$vtmp[3].16b,@data[2].16b
989	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
990	subs	$blocks,$blocks,#4
991	b.gt	.Lcbc_4_blocks_dec
992	// save back IV
993	st1	{@data[3].4s}, [$ivp]
994	b	100f
9951:	// last block
996	subs	$blocks,$blocks,#1
997	b.lt	100f
998	b.gt	1f
999	ld1	{@data[0].4s},[$inp],#16
1000	// save back IV
1001	st1	{$data[0].4s}, [$ivp]
1002___
1003	&rev32(@datax[0],@data[0]);
1004	&encrypt_1blk(@datax[0]);
1005$code.=<<___;
1006	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b
1007	st1	{@datax[0].4s},[$outp],#16
1008	b	100f
10091:	// last two blocks
1010	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
1011	add	$ptr,$inp,#16
1012	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
1013	subs	$blocks,$blocks,1
1014	b.gt	1f
1015___
1016	&rev32(@data[0],@data[0]);
1017	&rev32(@data[1],@data[1]);
1018	&rev32(@data[2],@data[2]);
1019	&rev32(@data[3],@data[3]);
1020$code.=<<___;
1021	bl	_vpsm4_enc_4blks
1022	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
1023___
1024	&transpose(@vtmp,@datax);
1025$code.=<<___;
1026	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1027	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1028	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1029	// save back IV
1030	st1	{@data[1].4s}, [$ivp]
1031	b	100f
10321:	// last 3 blocks
1033	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1034___
1035	&rev32(@data[0],@data[0]);
1036	&rev32(@data[1],@data[1]);
1037	&rev32(@data[2],@data[2]);
1038	&rev32(@data[3],@data[3]);
1039$code.=<<___;
1040	bl	_vpsm4_enc_4blks
1041	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1042___
1043	&transpose(@vtmp,@datax);
1044$code.=<<___;
1045	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1046	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1047	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1048	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1049	// save back IV
1050	st1	{@data[2].4s}, [$ivp]
1051100:
1052	ldp	d10,d11,[sp,#16]
1053	ldp	d12,d13,[sp,#32]
1054	ldp	d14,d15,[sp,#48]
1055	ldp	x29,x30,[sp,#64]
1056	ldp	d8,d9,[sp],#80
1057	AARCH64_VALIDATE_LINK_REGISTER
1058	ret
1059.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1060___
1061}}}
1062
1063{{{
1064my ($ivp)=("x4");
1065my ($ctr)=("w5");
1066my $ivec=("v3");
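# Only the last 32-bit word of the counter block is incremented (ctr32
# semantics); it is kept in $ctr while the other three words are
# broadcast to all lanes with dup.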
1067
1068$code.=<<___;
1069.globl	${prefix}_ctr32_encrypt_blocks
1070.type	${prefix}_ctr32_encrypt_blocks,%function
1071.align	5
1072${prefix}_ctr32_encrypt_blocks:
1073	AARCH64_VALID_CALL_TARGET
1074	ld1	{$ivec.4s},[$ivp]
1075___
1076	&rev32($ivec,$ivec);
1077	&load_sbox();
1078$code.=<<___;
1079	cmp	$blocks,#1
1080	b.ne	1f
1081	// fast path for a single block, avoiding the
1082	// context-saving overhead
1083___
1084	&encrypt_1blk($ivec);
1085$code.=<<___;
1086	ld1	{@data[0].4s},[$inp]
1087	eor	@data[0].16b,@data[0].16b,$ivec.16b
1088	st1	{@data[0].4s},[$outp]
1089	ret
10901:
1091	AARCH64_SIGN_LINK_REGISTER
1092	stp	d8,d9,[sp,#-80]!
1093	stp	d10,d11,[sp,#16]
1094	stp	d12,d13,[sp,#32]
1095	stp	d14,d15,[sp,#48]
1096	stp	x29,x30,[sp,#64]
1097	mov	$word0,$ivec.s[0]
1098	mov	$word1,$ivec.s[1]
1099	mov	$word2,$ivec.s[2]
1100	mov	$ctr,$ivec.s[3]
1101.Lctr32_4_blocks_process:
1102	cmp	$blocks,#4
1103	b.lt	1f
1104	dup	@data[0].4s,$word0
1105	dup	@data[1].4s,$word1
1106	dup	@data[2].4s,$word2
1107	mov	@data[3].s[0],$ctr
1108	add	$ctr,$ctr,#1
1109	mov	$data[3].s[1],$ctr
1110	add	$ctr,$ctr,#1
1111	mov	@data[3].s[2],$ctr
1112	add	$ctr,$ctr,#1
1113	mov	@data[3].s[3],$ctr
1114	add	$ctr,$ctr,#1
1115	cmp	$blocks,#8
1116	b.ge	.Lctr32_8_blocks_process
1117	bl	_vpsm4_enc_4blks
1118	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1119	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1120	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1121	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1122	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1123	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1124	subs	$blocks,$blocks,#4
1125	b.ne	.Lctr32_4_blocks_process
1126	b	100f
1127.Lctr32_8_blocks_process:
1128	dup	@datax[0].4s,$word0
1129	dup	@datax[1].4s,$word1
1130	dup	@datax[2].4s,$word2
1131	mov	@datax[3].s[0],$ctr
1132	add	$ctr,$ctr,#1
1133	mov	$datax[3].s[1],$ctr
1134	add	$ctr,$ctr,#1
1135	mov	@datax[3].s[2],$ctr
1136	add	$ctr,$ctr,#1
1137	mov	@datax[3].s[3],$ctr
1138	add	$ctr,$ctr,#1
1139	bl	_vpsm4_enc_8blks
1140	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1141	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1142	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1143	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1144	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1145	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1146	eor	@data[0].16b,@data[0].16b,@datax[0].16b
1147	eor	@data[1].16b,@data[1].16b,@datax[1].16b
1148	eor	@data[2].16b,@data[2].16b,@datax[2].16b
1149	eor	@data[3].16b,@data[3].16b,@datax[3].16b
1150	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1151	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1152	subs	$blocks,$blocks,#8
1153	b.ne	.Lctr32_4_blocks_process
1154	b	100f
11551:	// last block processing
1156	subs	$blocks,$blocks,#1
1157	b.lt	100f
1158	b.gt	1f
1159	mov	$ivec.s[0],$word0
1160	mov	$ivec.s[1],$word1
1161	mov	$ivec.s[2],$word2
1162	mov	$ivec.s[3],$ctr
1163___
1164	&encrypt_1blk($ivec);
1165$code.=<<___;
1166	ld1	{@data[0].4s},[$inp]
1167	eor	@data[0].16b,@data[0].16b,$ivec.16b
1168	st1	{@data[0].4s},[$outp]
1169	b	100f
11701:	// last 2 blocks processing
1171	dup	@data[0].4s,$word0
1172	dup	@data[1].4s,$word1
1173	dup	@data[2].4s,$word2
1174	mov	@data[3].s[0],$ctr
1175	add	$ctr,$ctr,#1
1176	mov	@data[3].s[1],$ctr
1177	subs	$blocks,$blocks,#1
1178	b.ne	1f
1179	bl	_vpsm4_enc_4blks
1180	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1181	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1182	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1183	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1184	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1185	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1186	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1187	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1188	b	100f
11891:	// last 3 blocks processing
1190	add	$ctr,$ctr,#1
1191	mov	@data[3].s[2],$ctr
1192	bl	_vpsm4_enc_4blks
1193	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1194	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1195	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1196	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1197	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1198	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1199	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1200	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1201	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1202	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1203100:
1204	ldp	d10,d11,[sp,#16]
1205	ldp	d12,d13,[sp,#32]
1206	ldp	d14,d15,[sp,#48]
1207	ldp	x29,x30,[sp,#64]
1208	ldp	d8,d9,[sp],#80
1209	AARCH64_VALIDATE_LINK_REGISTER
1210	ret
1211.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1212___
1213}}}
1214
1215{{{
1216my ($blocks,$len)=("x2","x2");
1217my $ivp=("x5");
1218my @twx=map("x$_",(12..27));
1219my ($rks1,$rks2)=("x26","x27");
1220my $lastBlk=("x26");
1221my $enc=("w28");
1222my $remain=("x29");
1223
1224my @tweak=@datax;
1225
1226sub gen_xts_cipher() {
1227	my $std = shift;
1228$code.=<<___;
1229.globl	${prefix}_xts_encrypt${std}
1230.type	${prefix}_xts_encrypt${std},%function
1231.align	5
1232${prefix}_xts_encrypt${std}:
1233	AARCH64_SIGN_LINK_REGISTER
1234	stp	x15, x16, [sp, #-0x10]!
1235	stp	x17, x18, [sp, #-0x10]!
1236	stp	x19, x20, [sp, #-0x10]!
1237	stp	x21, x22, [sp, #-0x10]!
1238	stp	x23, x24, [sp, #-0x10]!
1239	stp	x25, x26, [sp, #-0x10]!
1240	stp	x27, x28, [sp, #-0x10]!
1241	stp	x29, x30, [sp, #-0x10]!
1242	stp	d8, d9, [sp, #-0x10]!
1243	stp	d10, d11, [sp, #-0x10]!
1244	stp	d12, d13, [sp, #-0x10]!
1245	stp	d14, d15, [sp, #-0x10]!
1246	mov	$rks1,x3
1247	mov	$rks2,x4
1248	mov	$enc,w6
1249	ld1	{@tweak[0].4s}, [$ivp]
1250	mov	$rks,$rks2
1251___
1252	&load_sbox();
1253	&rev32(@tweak[0],@tweak[0]);
1254	&encrypt_1blk(@tweak[0]);
1255$code.=<<___;
1256	mov	$rks,$rks1
1257	and	$remain,$len,#0x0F
1258	// convert length into blocks
1259	lsr	$blocks,$len,4
1260	cmp	$blocks,#1
1261	b.lt .return${std}
1262
1263	cmp $remain,0
1264	// If the encryption/decryption length is a multiple of 16,
1265	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1266	b.eq .xts_encrypt_blocks${std}
1267
1268	// If the encryption/decryption length is not a multiple of 16,
1269	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
1270	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1271	subs $blocks,$blocks,#1
1272	b.eq .only_2blks_tweak${std}
1273.xts_encrypt_blocks${std}:
1274___
1275	&rbit(@tweak[0],@tweak[0],$std);
1276	&rev32_armeb(@tweak[0],@tweak[0]);
1277	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1278	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1279	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1280	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1281	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1282	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1283	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1284	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1285$code.=<<___;
1286.Lxts_8_blocks_process${std}:
1287	cmp	$blocks,#8
1288	b.lt	.Lxts_4_blocks_process${std}
1289___
1290	&mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
1291	&mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
1292	&mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
1293	&mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
1294	&mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
1295	&mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
1296	&mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
1297	&mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
1298$code.=<<___;
1299	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1300___
1301	&rbit(@vtmp[0],@vtmp[0],$std);
1302	&rbit(@vtmp[1],@vtmp[1],$std);
1303	&rbit(@vtmp[2],@vtmp[2],$std);
1304	&rbit(@vtmp[3],@vtmp[3],$std);
1305$code.=<<___;
1306	eor @data[0].16b, @data[0].16b, @vtmp[0].16b
1307	eor @data[1].16b, @data[1].16b, @vtmp[1].16b
1308	eor @data[2].16b, @data[2].16b, @vtmp[2].16b
1309	eor @data[3].16b, @data[3].16b, @vtmp[3].16b
1310	ld1	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1311___
1312	&rbit(@vtmpx[0],@vtmpx[0],$std);
1313	&rbit(@vtmpx[1],@vtmpx[1],$std);
1314	&rbit(@vtmpx[2],@vtmpx[2],$std);
1315	&rbit(@vtmpx[3],@vtmpx[3],$std);
1316$code.=<<___;
1317	eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
1318	eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
1319	eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
1320	eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
1321___
1322	&rev32(@data[0],@data[0]);
1323	&rev32(@data[1],@data[1]);
1324	&rev32(@data[2],@data[2]);
1325	&rev32(@data[3],@data[3]);
1326	&rev32(@datax[0],@datax[0]);
1327	&rev32(@datax[1],@datax[1]);
1328	&rev32(@datax[2],@datax[2]);
1329	&rev32(@datax[3],@datax[3]);
1330	&transpose(@data,@vtmp);
1331	&transpose(@datax,@vtmp);
1332$code.=<<___;
1333	bl	_${prefix}_enc_8blks
1334___
1335	&transpose(@vtmp,@datax);
1336	&transpose(@data,@datax);
1337
1338	&mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
1339	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1340	&mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
1341	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1342	&mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
1343	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1344	&mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
1345	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1346	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1347	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1348	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1349	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1350	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1351	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1352	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
1353	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1354$code.=<<___;
1355	eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
1356	eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
1357	eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
1358	eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
1359	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1360	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1361	eor @data[2].16b, @data[2].16b, @tweak[2].16b
1362	eor @data[3].16b, @data[3].16b, @tweak[3].16b
1363
1364	// save the last tweak
1365	st1	{@tweak[3].4s},[$ivp]
1366	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1367	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1368	subs	$blocks,$blocks,#8
1369	b.gt	.Lxts_8_blocks_process${std}
1370	b	100f
1371.Lxts_4_blocks_process${std}:
1372___
1373	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1374	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1375	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1376	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1377$code.=<<___;
1378	cmp	$blocks,#4
1379	b.lt	1f
1380	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1381___
1382	&rbit(@tweak[0],@tweak[0],$std);
1383	&rbit(@tweak[1],@tweak[1],$std);
1384	&rbit(@tweak[2],@tweak[2],$std);
1385	&rbit(@tweak[3],@tweak[3],$std);
1386$code.=<<___;
1387	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1388	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1389	eor @data[2].16b, @data[2].16b, @tweak[2].16b
1390	eor @data[3].16b, @data[3].16b, @tweak[3].16b
1391___
1392	&rev32(@data[0],@data[0]);
1393	&rev32(@data[1],@data[1]);
1394	&rev32(@data[2],@data[2]);
1395	&rev32(@data[3],@data[3]);
1396	&transpose(@data,@vtmp);
1397$code.=<<___;
1398	bl	_${prefix}_enc_4blks
1399___
1400	&transpose(@vtmp,@data);
1401$code.=<<___;
1402	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1403	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1404	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1405	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1406	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1407	sub	$blocks,$blocks,#4
1408___
1409	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1410	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1411	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1412$code.=<<___;
1413	// save the last tweak
1414	st1	{@tweak[3].4s},[$ivp]
14151:
1416	// process last block
1417	cmp	$blocks,#1
1418	b.lt	100f
1419	b.gt	1f
1420	ld1	{@data[0].4s},[$inp],#16
1421___
1422	&rbit(@tweak[0],@tweak[0],$std);
1423$code.=<<___;
1424	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1425___
1426	&rev32(@data[0],@data[0]);
1427	&encrypt_1blk(@data[0]);
1428$code.=<<___;
1429	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1430	st1	{@data[0].4s},[$outp],#16
1431	// save the last tweak
1432	st1	{@tweak[0].4s},[$ivp]
1433	b	100f
14341:  // process last 2 blocks
1435	cmp	$blocks,#2
1436	b.gt	1f
1437	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
1438___
1439	&rbit(@tweak[0],@tweak[0],$std);
1440	&rbit(@tweak[1],@tweak[1],$std);
1441$code.=<<___;
1442	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1443	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1444___
1445	&rev32(@data[0],@data[0]);
1446	&rev32(@data[1],@data[1]);
1447	&transpose(@data,@vtmp);
1448$code.=<<___;
1449	bl	_${prefix}_enc_4blks
1450___
1451	&transpose(@vtmp,@data);
1452$code.=<<___;
1453	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1454	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1455	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1456	// save the last tweak
1457	st1	{@tweak[1].4s},[$ivp]
1458	b	100f
14591:  // process last 3 blocks
1460	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1461___
1462	&rbit(@tweak[0],@tweak[0],$std);
1463	&rbit(@tweak[1],@tweak[1],$std);
1464	&rbit(@tweak[2],@tweak[2],$std);
1465$code.=<<___;
1466	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1467	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1468	eor @data[2].16b, @data[2].16b, @tweak[2].16b
1469___
1470	&rev32(@data[0],@data[0]);
1471	&rev32(@data[1],@data[1]);
1472	&rev32(@data[2],@data[2]);
1473	&transpose(@data,@vtmp);
1474$code.=<<___;
1475	bl	_${prefix}_enc_4blks
1476___
1477	&transpose(@vtmp,@data);
1478$code.=<<___;
1479	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1480	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1481	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1482	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1483	// save the last tweak
1484	st1	{@tweak[2].4s},[$ivp]
1485100:
1486	cmp $remain,0
1487	b.eq .return${std}
1488
1489// This branch calculates the last two tweaks
1490// when the encryption/decryption length is larger than 32
1491.last_2blks_tweak${std}:
1492	ld1	{@tweak[0].4s},[$ivp]
1493___
1494	&rev32_armeb(@tweak[0],@tweak[0]);
1495	&compute_tweak_vec(@tweak[0],@tweak[1],$std);
1496	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
1497$code.=<<___;
1498	b .check_dec${std}
1499
1500
1501// This branch calculates the last two tweaks
1502// when the encryption/decryption length is exactly 32, which needs only two tweaks
1503.only_2blks_tweak${std}:
1504	mov @tweak[1].16b,@tweak[0].16b
1505___
1506	&rev32_armeb(@tweak[1],@tweak[1]);
1507	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
1508$code.=<<___;
1509	b .check_dec${std}
1510
1511
1512// Determine whether encryption or decryption is required.
1513// The last two tweaks need to be swapped for decryption.
1514.check_dec${std}:
1515	// encryption:1 decryption:0
1516	cmp $enc,1
1517	b.eq .process_last_2blks${std}
1518	mov @vtmp[0].16B,@tweak[1].16b
1519	mov @tweak[1].16B,@tweak[2].16b
1520	mov @tweak[2].16B,@vtmp[0].16b
1521
1522.process_last_2blks${std}:
1523___
1524	&rev32_armeb(@tweak[1],@tweak[1]);
1525	&rev32_armeb(@tweak[2],@tweak[2]);
1526$code.=<<___;
1527	ld1	{@data[0].4s},[$inp],#16
1528	eor @data[0].16b, @data[0].16b, @tweak[1].16b
1529___
1530	&rev32(@data[0],@data[0]);
1531	&encrypt_1blk(@data[0]);
1532$code.=<<___;
1533	eor @data[0].16b, @data[0].16b, @tweak[1].16b
1534	st1	{@data[0].4s},[$outp],#16
1535
1536	sub $lastBlk,$outp,16
1537	.loop${std}:
1538		subs $remain,$remain,1
1539		ldrb	$wtmp0,[$lastBlk,$remain]
1540		ldrb	$wtmp1,[$inp,$remain]
1541		strb	$wtmp1,[$lastBlk,$remain]
1542		strb	$wtmp0,[$outp,$remain]
1543	b.gt .loop${std}
1544	ld1		{@data[0].4s}, [$lastBlk]
1545	eor @data[0].16b, @data[0].16b, @tweak[2].16b
1546___
1547	&rev32(@data[0],@data[0]);
1548	&encrypt_1blk(@data[0]);
1549$code.=<<___;
1550	eor @data[0].16b, @data[0].16b, @tweak[2].16b
1551	st1		{@data[0].4s}, [$lastBlk]
1552.return${std}:
1553	ldp		d14, d15, [sp], #0x10
1554	ldp		d12, d13, [sp], #0x10
1555	ldp		d10, d11, [sp], #0x10
1556	ldp		d8, d9, [sp], #0x10
1557	ldp		x29, x30, [sp], #0x10
1558	ldp		x27, x28, [sp], #0x10
1559	ldp		x25, x26, [sp], #0x10
1560	ldp		x23, x24, [sp], #0x10
1561	ldp		x21, x22, [sp], #0x10
1562	ldp		x19, x20, [sp], #0x10
1563	ldp		x17, x18, [sp], #0x10
1564	ldp		x15, x16, [sp], #0x10
1565	AARCH64_VALIDATE_LINK_REGISTER
1566	ret
1567.size	${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1568___
1569} # end of gen_xts_cipher
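# Two flavours are generated: the "_gb" suffix uses the rbit-based
# (bit-reversed) tweak ordering, the unsuffixed variant does not.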
1570&gen_xts_cipher("_gb");
1571&gen_xts_cipher("");
1572}}}
1573########################################
1574open SELF,$0;
1575while(<SELF>) {
1576        next if (/^#!/);
1577        last if (!s/^#/\/\// and !/^$/);
1578        print;
1579}
1580close SELF;
1581
1582foreach(split("\n",$code)) {
1583	s/\`([^\`]*)\`/eval($1)/ge;
1584	print $_,"\n";
1585}
1586
1587close STDOUT or die "error closing STDOUT: $!";
1588