1#! /usr/bin/env perl
2# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# This module implements SM4 with ASIMD on aarch64
11#
12# Feb 2022
13#
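#
# The 256-byte SM4 S-box is kept in v16-v31 and applied with tbl/tbx
# lookups.  Bulk modes process 8 or 4 blocks in parallel with NEON;
# single blocks and tails use a scalar path.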
14
15# $output is the last argument if it looks like a file (it has an extension)
16# $flavour is the first argument if it doesn't look like a file
17$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
19
20$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23die "can't locate arm-xlate.pl";
24
25open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26    or die "can't call $xlate: $!";
27*STDOUT=*OUT;
28
29$prefix="vpsm4";
30my @vtmp=map("v$_",(0..3));
31my @qtmp=map("q$_",(0..3));
32my @data=map("v$_",(4..7));
33my @datax=map("v$_",(8..11));
34my ($rk0,$rk1)=("v12","v13");
35my ($rka,$rkb)=("v14","v15");
36my @vtmpx=map("v$_",(12..15));
37my @sbox=map("v$_",(16..31));
38my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
39my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
40my ($xtmp1,$xtmp2)=("x8","x9");
41my ($ptr,$counter)=("x10","w11");
42my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
43
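# SM4 operates on big-endian 32-bit words, so rev32 byte-swaps each word of
# a vector on little-endian builds (and degenerates to a move on big-endian
# builds); rev32_armeb is the mirror image, swapping only for __AARCH64EB__.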
44sub rev32() {
45	my $dst = shift;
46	my $src = shift;
47
48	if ($src and ("$src" ne "$dst")) {
49$code.=<<___;
50#ifndef __AARCH64EB__
51	rev32	$dst.16b,$src.16b
52#else
53	mov	$dst.16b,$src.16b
54#endif
55___
56	} else {
57$code.=<<___;
58#ifndef __AARCH64EB__
59	rev32	$dst.16b,$dst.16b
60#endif
61___
62	}
63}
64
65sub rev32_armeb() {
66	my $dst = shift;
67	my $src = shift;
68
69	if ($src and ("$src" ne "$dst")) {
70$code.=<<___;
71#ifdef __AARCH64EB__
72	rev32	$dst.16b,$src.16b
73#else
74	mov	$dst.16b,$src.16b
75#endif
76___
77	} else {
78$code.=<<___;
79#ifdef __AARCH64EB__
80	rev32	$dst.16b,$dst.16b
81#endif
82___
83	}
84}
85
86sub rbit() {
87	my $dst = shift;
88	my $src = shift;
89	my $std = shift;
90
91	if ($src and ("$src" ne "$dst")) {
92		if ($std eq "_gb") {
93$code.=<<___;
94			rbit $dst.16b,$src.16b
95___
96		} else {
97$code.=<<___;
98			mov $dst.16b,$src.16b
99___
100		}
101	} else {
102		if ($std eq "_gb") {
103$code.=<<___;
104			rbit $dst.16b,$src.16b
105___
106		}
107	}
108}
109
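# 4x4 transpose of 32-bit words across four vectors, built from zip1/zip2
# at .4s and .2d granularity.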
110sub transpose() {
111	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
112
113$code.=<<___;
114	zip1	$vt0.4s,$dat0.4s,$dat1.4s
115	zip2	$vt1.4s,$dat0.4s,$dat1.4s
116	zip1	$vt2.4s,$dat2.4s,$dat3.4s
117	zip2	$vt3.4s,$dat2.4s,$dat3.4s
118	zip1	$dat0.2d,$vt0.2d,$vt2.2d
119	zip2	$dat1.2d,$vt0.2d,$vt2.2d
120	zip1	$dat2.2d,$vt1.2d,$vt3.2d
121	zip2	$dat3.2d,$vt1.2d,$vt3.2d
122___
123}
124
125# sbox operation for 4 lanes of words
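# A single tbl lookup covers at most 64 table bytes (four registers), so the
# byte indices are biased by 0/64/128/192 and looked up in the four register
# groups of the S-box; out-of-range lanes return zero, which lets the four
# partial results simply be added together.  The ushr+sli pairs then build
# the rotations of the linear transform
#   L(B) = B ^ rol32(B,2) ^ rol32(B,10) ^ rol32(B,18) ^ rol32(B,24)
# e.g. "ushr vt,vs,#(32-2); sli vt,vs,#2" leaves rol32(vs,2) in vt.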
126sub sbox() {
127	my $dat = shift;
128
129$code.=<<___;
130	movi	@vtmp[0].16b,#64
131	movi	@vtmp[1].16b,#128
132	movi	@vtmp[2].16b,#192
133	sub	@vtmp[0].16b,$dat.16b,@vtmp[0].16b
134	sub	@vtmp[1].16b,$dat.16b,@vtmp[1].16b
135	sub	@vtmp[2].16b,$dat.16b,@vtmp[2].16b
136	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
137	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
138	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
139	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
140	add	@vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
141	add	@vtmp[2].2d,@vtmp[2].2d,$dat.2d
142	add	$dat.2d,@vtmp[0].2d,@vtmp[2].2d
143
144	ushr	@vtmp[0].4s,$dat.4s,32-2
145	sli	@vtmp[0].4s,$dat.4s,2
146	ushr	@vtmp[2].4s,$dat.4s,32-10
147	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
148	sli	@vtmp[2].4s,$dat.4s,10
149	eor	@vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
150	ushr	@vtmp[0].4s,$dat.4s,32-18
151	sli	@vtmp[0].4s,$dat.4s,18
152	ushr	@vtmp[2].4s,$dat.4s,32-24
153	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
154	sli	@vtmp[2].4s,$dat.4s,24
155	eor	$dat.16b,@vtmp[2].16b,@vtmp[1].16b
156___
157}
158
159# sbox operation for 8 lanes of words
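# Same S-box and linear transform as above, interleaved over two vectors
# (eight blocks) so that independent tbl and shift operations can overlap.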
160sub sbox_double() {
161	my $dat = shift;
162	my $datx = shift;
163
164$code.=<<___;
165	movi	@vtmp[3].16b,#64
166	sub	@vtmp[0].16b,$dat.16b,@vtmp[3].16b
167	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
168	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
169	tbl	$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
170	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
171	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
172	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
173	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
174	add	$dat.2d,@vtmp[2].2d,$dat.2d
175	add	$dat.2d,@vtmp[1].2d,$dat.2d
176
177	sub	@vtmp[0].16b,$datx.16b,@vtmp[3].16b
178	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
179	sub	@vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
180	tbl	$datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
181	tbl	@vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
182	tbl	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
183	tbl	@vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
184	add	@vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
185	add	$datx.2d,@vtmp[2].2d,$datx.2d
186	add	$datx.2d,@vtmp[1].2d,$datx.2d
187
188	ushr	@vtmp[0].4s,$dat.4s,32-2
189	sli	@vtmp[0].4s,$dat.4s,2
190	ushr	@vtmp[2].4s,$datx.4s,32-2
191	eor	@vtmp[1].16b,@vtmp[0].16b,$dat.16b
192	sli	@vtmp[2].4s,$datx.4s,2
193
194	ushr	@vtmp[0].4s,$dat.4s,32-10
195	eor	@vtmp[3].16b,@vtmp[2].16b,$datx.16b
196	sli	@vtmp[0].4s,$dat.4s,10
197	ushr	@vtmp[2].4s,$datx.4s,32-10
198	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
199	sli	@vtmp[2].4s,$datx.4s,10
200
201	ushr	@vtmp[0].4s,$dat.4s,32-18
202	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
203	sli	@vtmp[0].4s,$dat.4s,18
204	ushr	@vtmp[2].4s,$datx.4s,32-18
205	eor	@vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
206	sli	@vtmp[2].4s,$datx.4s,18
207
208	ushr	@vtmp[0].4s,$dat.4s,32-24
209	eor	@vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
210	sli	@vtmp[0].4s,$dat.4s,24
211	ushr	@vtmp[2].4s,$datx.4s,32-24
212	eor	$dat.16b,@vtmp[0].16b,@vtmp[1].16b
213	sli	@vtmp[2].4s,$datx.4s,24
214	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
215___
216}
217
218# sbox operation for a single word
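# Scalar variant: the four partial tbl results are combined in general-purpose
# registers and L(B) is built with eor+ror ("ror #(32-n)" == "rol #n").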
219sub sbox_1word () {
220	my $word = shift;
221
222$code.=<<___;
223	movi	@vtmp[1].16b,#64
224	movi	@vtmp[2].16b,#128
225	movi	@vtmp[3].16b,#192
226	mov	@vtmp[0].s[0],$word
227
228	sub	@vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
229	sub	@vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
230	sub	@vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
231
232	tbl	@vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
233	tbl	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
234	tbl	@vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
235	tbl	@vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
236
237	mov	$word,@vtmp[0].s[0]
238	mov	$wtmp0,@vtmp[1].s[0]
239	mov	$wtmp2,@vtmp[2].s[0]
240	add	$wtmp0,$word,$wtmp0
241	mov	$word,@vtmp[3].s[0]
242	add	$wtmp0,$wtmp0,$wtmp2
243	add	$wtmp0,$wtmp0,$word
244
245	eor	$word,$wtmp0,$wtmp0,ror #32-2
246	eor	$word,$word,$wtmp0,ror #32-10
247	eor	$word,$word,$wtmp0,ror #32-18
248	eor	$word,$word,$wtmp0,ror #32-24
249___
250}
251
252# sm4 for one block of data, in scalar registers word0/word1/word2/word3
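# Each call performs four SM4 rounds (one per word), as annotated below;
# callers loop eight times for the full 32 rounds.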
253sub sm4_1blk () {
254	my $kptr = shift;
255
256$code.=<<___;
257	ldp	$wtmp0,$wtmp1,[$kptr],8
258	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
259	eor	$tmpw,$word2,$word3
260	eor	$wtmp2,$wtmp0,$word1
261	eor	$tmpw,$tmpw,$wtmp2
262___
263	&sbox_1word($tmpw);
264$code.=<<___;
265	eor	$word0,$word0,$tmpw
266	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
267	eor	$tmpw,$word2,$word3
268	eor	$wtmp2,$word0,$wtmp1
269	eor	$tmpw,$tmpw,$wtmp2
270___
271	&sbox_1word($tmpw);
272$code.=<<___;
273	ldp	$wtmp0,$wtmp1,[$kptr],8
274	eor	$word1,$word1,$tmpw
275	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
276	eor	$tmpw,$word0,$word1
277	eor	$wtmp2,$wtmp0,$word3
278	eor	$tmpw,$tmpw,$wtmp2
279___
280	&sbox_1word($tmpw);
281$code.=<<___;
282	eor	$word2,$word2,$tmpw
283	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
284	eor	$tmpw,$word0,$word1
285	eor	$wtmp2,$word2,$wtmp1
286	eor	$tmpw,$tmpw,$wtmp2
287___
288	&sbox_1word($tmpw);
289$code.=<<___;
290	eor	$word3,$word3,$tmpw
291___
292}
293
294# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
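# Same four rounds as sm4_1blk, applied to four blocks at once; each 32-bit
# round key is broadcast across the lanes with dup.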
295sub sm4_4blks () {
296	my $kptr = shift;
297
298$code.=<<___;
299	ldp	$wtmp0,$wtmp1,[$kptr],8
300	dup	$rk0.4s,$wtmp0
301	dup	$rk1.4s,$wtmp1
302
303	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
304	eor	$rka.16b,@data[2].16b,@data[3].16b
305	eor	$rk0.16b,@data[1].16b,$rk0.16b
306	eor	$rk0.16b,$rka.16b,$rk0.16b
307___
308	&sbox($rk0);
309$code.=<<___;
310	eor	@data[0].16b,@data[0].16b,$rk0.16b
311
312	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
313	eor	$rka.16b,$rka.16b,@data[0].16b
314	eor	$rk1.16b,$rka.16b,$rk1.16b
315___
316	&sbox($rk1);
317$code.=<<___;
318	ldp	$wtmp0,$wtmp1,[$kptr],8
319	eor	@data[1].16b,@data[1].16b,$rk1.16b
320
321	dup	$rk0.4s,$wtmp0
322	dup	$rk1.4s,$wtmp1
323
324	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
325	eor	$rka.16b,@data[0].16b,@data[1].16b
326	eor	$rk0.16b,@data[3].16b,$rk0.16b
327	eor	$rk0.16b,$rka.16b,$rk0.16b
328___
329	&sbox($rk0);
330$code.=<<___;
331	eor	@data[2].16b,@data[2].16b,$rk0.16b
332
333	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
334	eor	$rka.16b,$rka.16b,@data[2].16b
335	eor	$rk1.16b,$rka.16b,$rk1.16b
336___
337	&sbox($rk1);
338$code.=<<___;
339	eor	@data[3].16b,@data[3].16b,$rk1.16b
340___
341}
342
343# sm4 for 8 lanes of data, in neon registers
344# data0/data1/data2/data3 datax0/datax1/datax2/datax3
345sub sm4_8blks () {
346	my $kptr = shift;
347
348$code.=<<___;
349	ldp	$wtmp0,$wtmp1,[$kptr],8
350	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
351	dup	$rk0.4s,$wtmp0
352	eor	$rka.16b,@data[2].16b,@data[3].16b
353	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
354	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
355	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
356	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
357	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
358___
359	&sbox_double($rk0,$rk1);
360$code.=<<___;
361	eor	@data[0].16b,@data[0].16b,$rk0.16b
362	eor	@datax[0].16b,@datax[0].16b,$rk1.16b
363
364	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
365	dup	$rk1.4s,$wtmp1
366	eor	$rka.16b,$rka.16b,@data[0].16b
367	eor	$rkb.16b,$rkb.16b,@datax[0].16b
368	eor	$rk0.16b,$rka.16b,$rk1.16b
369	eor	$rk1.16b,$rkb.16b,$rk1.16b
370___
371	&sbox_double($rk0,$rk1);
372$code.=<<___;
373	ldp	$wtmp0,$wtmp1,[$kptr],8
374	eor	@data[1].16b,@data[1].16b,$rk0.16b
375	eor	@datax[1].16b,@datax[1].16b,$rk1.16b
376
377	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
378	dup	$rk0.4s,$wtmp0
379	eor	$rka.16b,@data[0].16b,@data[1].16b
380	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
381	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
382	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
383	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
384	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
385___
386	&sbox_double($rk0,$rk1);
387$code.=<<___;
388	eor	@data[2].16b,@data[2].16b,$rk0.16b
389	eor	@datax[2].16b,@datax[2].16b,$rk1.16b
390
391	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
392	dup	$rk1.4s,$wtmp1
393	eor	$rka.16b,$rka.16b,@data[2].16b
394	eor	$rkb.16b,$rkb.16b,@datax[2].16b
395	eor	$rk0.16b,$rka.16b,$rk1.16b
396	eor	$rk1.16b,$rkb.16b,$rk1.16b
397___
398	&sbox_double($rk0,$rk1);
399$code.=<<___;
400	eor	@data[3].16b,@data[3].16b,$rk0.16b
401	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
402___
403}
404
405sub encrypt_1blk_norev() {
406	my $dat = shift;
407
408$code.=<<___;
409	mov	$ptr,$rks
410	mov	$counter,#8
411	mov	$word0,$dat.s[0]
412	mov	$word1,$dat.s[1]
413	mov	$word2,$dat.s[2]
414	mov	$word3,$dat.s[3]
41510:
416___
417	&sm4_1blk($ptr);
418$code.=<<___;
419	subs	$counter,$counter,#1
420	b.ne	10b
421	mov	$dat.s[0],$word3
422	mov	$dat.s[1],$word2
423	mov	$dat.s[2],$word1
424	mov	$dat.s[3],$word0
425___
426}
427
428sub encrypt_1blk() {
429	my $dat = shift;
430
431	&encrypt_1blk_norev($dat);
432	&rev32($dat,$dat);
433}
434
435sub encrypt_4blks() {
436$code.=<<___;
437	mov	$ptr,$rks
438	mov	$counter,#8
43910:
440___
441	&sm4_4blks($ptr);
442$code.=<<___;
443	subs	$counter,$counter,#1
444	b.ne	10b
445___
446	&rev32(@vtmp[3],@data[0]);
447	&rev32(@vtmp[2],@data[1]);
448	&rev32(@vtmp[1],@data[2]);
449	&rev32(@vtmp[0],@data[3]);
450}
451
452sub encrypt_8blks() {
453$code.=<<___;
454	mov	$ptr,$rks
455	mov	$counter,#8
45610:
457___
458	&sm4_8blks($ptr);
459$code.=<<___;
460	subs	$counter,$counter,#1
461	b.ne	10b
462___
463	&rev32(@vtmp[3],@data[0]);
464	&rev32(@vtmp[2],@data[1]);
465	&rev32(@vtmp[1],@data[2]);
466	&rev32(@vtmp[0],@data[3]);
467	&rev32(@data[3],@datax[0]);
468	&rev32(@data[2],@datax[1]);
469	&rev32(@data[1],@datax[2]);
470	&rev32(@data[0],@datax[3]);
471}
472
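# Load the 256-byte S-box at .Lsbox into v16-v31 for the tbl/tbx lookups.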
473sub load_sbox () {
474	my $data = shift;
475
476$code.=<<___;
477	adr	$ptr,.Lsbox
478	ld1	{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
479	ld1	{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
480	ld1	{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
481	ld1	{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
482___
483}
484
485
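# Move a 128-bit tweak between a pair of general-purpose registers and a
# vector register; the vector form is endian-corrected with rev32_armeb on
# big-endian builds.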
486sub mov_reg_to_vec() {
487	my $src0 = shift;
488	my $src1 = shift;
489	my $desv = shift;
490$code.=<<___;
491	mov $desv.d[0],$src0
492	mov $desv.d[1],$src1
493___
494	&rev32_armeb($desv,$desv);
495}
496
497sub mov_vec_to_reg() {
498	my $srcv = shift;
499	my $des0 = shift;
500	my $des1 = shift;
501$code.=<<___;
502	mov $des0,$srcv.d[0]
503	mov $des1,$srcv.d[1]
504___
505}
506
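# Advance the XTS tweak held in a GPR pair: multiply (src1:src0) by x in
# GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, i.e. shift left by one bit and
# conditionally xor 0x87 into the low byte when the carried-out bit was set.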
507sub compute_tweak() {
508	my $src0 = shift;
509	my $src1 = shift;
510	my $des0 = shift;
511	my $des1 = shift;
512$code.=<<___;
513	mov $wtmp0,0x87
514	extr	$xtmp2,$src1,$src1,#32
515	extr	$des1,$src1,$src0,#63
516	and	$wtmp1,$wtmp0,$wtmp2,asr#31
517	eor	$des0,$xtmp1,$src0,lsl#1
518___
519}
520
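# Vector form of the tweak update using the .Lxts_magic constant.  For the
# "_gb" variant the tweak is bit-reversed (rbit) before and after the
# multiplication, giving the bit-reversed tweak schedule used by the GB/T
# 17964 flavour of XTS.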
521sub compute_tweak_vec() {
522	my $src = shift;
523	my $des = shift;
524	my $std = shift;
525	&rbit(@vtmp[2],$src,$std);
526$code.=<<___;
527	ldr  @qtmp[0], .Lxts_magic
528	shl  $des.16b, @vtmp[2].16b, #1
529	ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
530	ushr @vtmp[1].16b, @vtmp[1].16b, #7
531	mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
532	eor  $des.16b, $des.16b, @vtmp[1].16b
533___
534	&rbit($des,$des,$std);
535}
536
537$code=<<___;
538#include "arm_arch.h"
539.arch	armv8-a
540.text
541
542.type	_vpsm4_consts,%object
543.align	7
544_vpsm4_consts:
545.Lsbox:
546	.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
547	.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
548	.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
549	.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
550	.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
551	.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
552	.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
553	.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
554	.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
555	.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
556	.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
557	.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
558	.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
559	.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
560	.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
561	.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
562.Lck:
563	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
564	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
565	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
566	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
567	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
568	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
569	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
570	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
571.Lfk:
572	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
573.Lshuffles:
574	.quad 0x0B0A090807060504,0x030201000F0E0D0C
575.Lxts_magic:
576	.quad 0x0101010101010187,0x0101010101010101
577
578.size	_vpsm4_consts,.-_vpsm4_consts
579___
580
581{{{
582my ($key,$keys,$enc)=("x0","x1","w2");
583my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
584my ($vkey,$vfk,$vmap)=("v5","v6","v7");
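# Key schedule: xor the 128-bit user key with the FK constants (.Lfk), then
# expand it into 32 round keys,
#   K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]),
# where T' applies the S-box followed by L'(B) = B ^ rol32(B,13) ^ rol32(B,23)
# (the "ror #19"/"ror #9" pair below).  Encryption schedules are stored
# forwards; decryption schedules are written backwards from offset 124.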
585$code.=<<___;
586.type	_vpsm4_set_key,%function
587.align	4
588_vpsm4_set_key:
589	AARCH64_VALID_CALL_TARGET
590	ld1	{$vkey.4s},[$key]
591___
592	&load_sbox();
593	&rev32($vkey,$vkey);
594$code.=<<___;
595	adr	$pointer,.Lshuffles
596	ld1	{$vmap.2d},[$pointer]
597	adr	$pointer,.Lfk
598	ld1	{$vfk.2d},[$pointer]
599	eor	$vkey.16b,$vkey.16b,$vfk.16b
600	mov	$schedules,#32
601	adr	$pointer,.Lck
602	movi	@vtmp[0].16b,#64
603	cbnz	$enc,1f
604	add	$keys,$keys,124
6051:
606	mov	$wtmp,$vkey.s[1]
607	ldr	$roundkey,[$pointer],#4
608	eor	$roundkey,$roundkey,$wtmp
609	mov	$wtmp,$vkey.s[2]
610	eor	$roundkey,$roundkey,$wtmp
611	mov	$wtmp,$vkey.s[3]
612	eor	$roundkey,$roundkey,$wtmp
613	// sbox lookup
614	mov	@data[0].s[0],$roundkey
615	tbl	@vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
616	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
617	tbx	@vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
618	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
619	tbx	@vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
620	sub	@data[0].16b,@data[0].16b,@vtmp[0].16b
621	tbx	@vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
622	mov	$wtmp,@vtmp[1].s[0]
623	eor	$roundkey,$wtmp,$wtmp,ror #19
624	eor	$roundkey,$roundkey,$wtmp,ror #9
625	mov	$wtmp,$vkey.s[0]
626	eor	$roundkey,$roundkey,$wtmp
627	mov	$vkey.s[0],$roundkey
628	cbz	$enc,2f
629	str	$roundkey,[$keys],#4
630	b	3f
6312:
632	str	$roundkey,[$keys],#-4
6333:
634	tbl	$vkey.16b,{$vkey.16b},$vmap.16b
635	subs	$schedules,$schedules,#1
636	b.ne	1b
637	ret
638.size	_vpsm4_set_key,.-_vpsm4_set_key
639___
640}}}
641
642
643{{{
644$code.=<<___;
645.type	_vpsm4_enc_4blks,%function
646.align	4
647_vpsm4_enc_4blks:
648	AARCH64_VALID_CALL_TARGET
649___
650	&encrypt_4blks();
651$code.=<<___;
652	ret
653.size	_vpsm4_enc_4blks,.-_vpsm4_enc_4blks
654___
655}}}
656
657{{{
658$code.=<<___;
659.type	_vpsm4_enc_8blks,%function
660.align	4
661_vpsm4_enc_8blks:
662	AARCH64_VALID_CALL_TARGET
663___
664	&encrypt_8blks();
665$code.=<<___;
666	ret
667.size	_vpsm4_enc_8blks,.-_vpsm4_enc_8blks
668___
669}}}
670
671
672{{{
673my ($key,$keys)=("x0","x1");
674$code.=<<___;
675.globl	${prefix}_set_encrypt_key
676.type	${prefix}_set_encrypt_key,%function
677.align	5
678${prefix}_set_encrypt_key:
679	AARCH64_SIGN_LINK_REGISTER
680	stp	x29,x30,[sp,#-16]!
681	mov	w2,1
682	bl	_vpsm4_set_key
683	ldp	x29,x30,[sp],#16
684	AARCH64_VALIDATE_LINK_REGISTER
685	ret
686.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
687___
688}}}
689
690{{{
691my ($key,$keys)=("x0","x1");
692$code.=<<___;
693.globl	${prefix}_set_decrypt_key
694.type	${prefix}_set_decrypt_key,%function
695.align	5
696${prefix}_set_decrypt_key:
697	AARCH64_SIGN_LINK_REGISTER
698	stp	x29,x30,[sp,#-16]!
699	mov	w2,0
700	bl	_vpsm4_set_key
701	ldp	x29,x30,[sp],#16
702	AARCH64_VALIDATE_LINK_REGISTER
703	ret
704.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
705___
706}}}
707
708{{{
709sub gen_block () {
710	my $dir = shift;
711	my ($inp,$outp,$rk)=map("x$_",(0..2));
712
713$code.=<<___;
714.globl	${prefix}_${dir}crypt
715.type	${prefix}_${dir}crypt,%function
716.align	5
717${prefix}_${dir}crypt:
718	AARCH64_VALID_CALL_TARGET
719	ld1	{@data[0].4s},[$inp]
720___
721	&load_sbox();
722	&rev32(@data[0],@data[0]);
723$code.=<<___;
724	mov	$rks,x2
725___
726	&encrypt_1blk(@data[0]);
727$code.=<<___;
728	st1	{@data[0].4s},[$outp]
729	ret
730.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
731___
732}
733&gen_block("en");
734&gen_block("de");
735}}}
736
737{{{
738my ($enc) = ("w4");
739my @dat=map("v$_",(16..23));
740
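# ECB: de-interleave the input with ld4 and process 8 or 4 blocks per
# iteration, with dedicated tails for 3, 2 and 1 remaining blocks.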
741$code.=<<___;
742.globl	${prefix}_ecb_encrypt
743.type	${prefix}_ecb_encrypt,%function
744.align	5
745${prefix}_ecb_encrypt:
746	AARCH64_SIGN_LINK_REGISTER
747	// convert length into blocks
748	lsr	x2,x2,4
749	stp	d8,d9,[sp,#-80]!
750	stp	d10,d11,[sp,#16]
751	stp	d12,d13,[sp,#32]
752	stp	d14,d15,[sp,#48]
753	stp	x29,x30,[sp,#64]
754___
755	&load_sbox();
756$code.=<<___;
757.Lecb_8_blocks_process:
758	cmp	$blocks,#8
759	b.lt	.Lecb_4_blocks_process
760	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
761	ld4	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
762___
763	&rev32(@data[0],@data[0]);
764	&rev32(@data[1],@data[1]);
765	&rev32(@data[2],@data[2]);
766	&rev32(@data[3],@data[3]);
767	&rev32(@datax[0],@datax[0]);
768	&rev32(@datax[1],@datax[1]);
769	&rev32(@datax[2],@datax[2]);
770	&rev32(@datax[3],@datax[3]);
771$code.=<<___;
772	bl	_vpsm4_enc_8blks
773	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
774	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
775	subs	$blocks,$blocks,#8
776	b.gt	.Lecb_8_blocks_process
777	b	100f
778.Lecb_4_blocks_process:
779	cmp	$blocks,#4
780	b.lt	1f
781	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
782___
783	&rev32(@data[0],@data[0]);
784	&rev32(@data[1],@data[1]);
785	&rev32(@data[2],@data[2]);
786	&rev32(@data[3],@data[3]);
787$code.=<<___;
788	bl	_vpsm4_enc_4blks
789	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
790	sub	$blocks,$blocks,#4
7911:
792	// process last block
793	cmp	$blocks,#1
794	b.lt	100f
795	b.gt	1f
796	ld1	{@data[0].4s},[$inp]
797___
798	&rev32(@data[0],@data[0]);
799	&encrypt_1blk(@data[0]);
800$code.=<<___;
801	st1	{@data[0].4s},[$outp]
802	b	100f
8031:	// process last 2 blocks
804	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
805	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
806	cmp	$blocks,#2
807	b.gt	1f
808___
809	&rev32(@data[0],@data[0]);
810	&rev32(@data[1],@data[1]);
811	&rev32(@data[2],@data[2]);
812	&rev32(@data[3],@data[3]);
813$code.=<<___;
814	bl	_vpsm4_enc_4blks
815	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
816	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
817	b	100f
8181:	// process last 3 blocks
819	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
820___
821	&rev32(@data[0],@data[0]);
822	&rev32(@data[1],@data[1]);
823	&rev32(@data[2],@data[2]);
824	&rev32(@data[3],@data[3]);
825$code.=<<___;
826	bl	_vpsm4_enc_4blks
827	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
828	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
829	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
830100:
831	ldp	d10,d11,[sp,#16]
832	ldp	d12,d13,[sp,#32]
833	ldp	d14,d15,[sp,#48]
834	ldp	x29,x30,[sp,#64]
835	ldp	d8,d9,[sp],#80
836	AARCH64_VALIDATE_LINK_REGISTER
837	ret
838.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
839___
840}}}
841
842{{{
843my ($len,$ivp,$enc)=("x2","x4","w5");
844my $ivec0=("v3");
845my $ivec1=("v15");
846
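# CBC: encryption is inherently serial, chaining each block through
# encrypt_1blk_norev; decryption runs 8 or 4 blocks in parallel and xors the
# previous ciphertext blocks afterwards.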
847$code.=<<___;
848.globl	${prefix}_cbc_encrypt
849.type	${prefix}_cbc_encrypt,%function
850.align	5
851${prefix}_cbc_encrypt:
852	AARCH64_VALID_CALL_TARGET
853	lsr	$len,$len,4
854___
855	&load_sbox();
856$code.=<<___;
857	cbz	$enc,.Ldec
858	ld1	{$ivec0.4s},[$ivp]
859.Lcbc_4_blocks_enc:
860	cmp	$blocks,#4
861	b.lt	1f
862	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
863	eor	@data[0].16b,@data[0].16b,$ivec0.16b
864___
865	&rev32(@data[1],@data[1]);
866	&rev32(@data[0],@data[0]);
867	&rev32(@data[2],@data[2]);
868	&rev32(@data[3],@data[3]);
869	&encrypt_1blk_norev(@data[0]);
870$code.=<<___;
871	eor	@data[1].16b,@data[1].16b,@data[0].16b
872___
873	&encrypt_1blk_norev(@data[1]);
874	&rev32(@data[0],@data[0]);
875
876$code.=<<___;
877	eor	@data[2].16b,@data[2].16b,@data[1].16b
878___
879	&encrypt_1blk_norev(@data[2]);
880	&rev32(@data[1],@data[1]);
881$code.=<<___;
882	eor	@data[3].16b,@data[3].16b,@data[2].16b
883___
884	&encrypt_1blk_norev(@data[3]);
885	&rev32(@data[2],@data[2]);
886	&rev32(@data[3],@data[3]);
887$code.=<<___;
888	orr	$ivec0.16b,@data[3].16b,@data[3].16b
889	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
890	subs	$blocks,$blocks,#4
891	b.ne	.Lcbc_4_blocks_enc
892	b	2f
8931:
894	subs	$blocks,$blocks,#1
895	b.lt	2f
896	ld1	{@data[0].4s},[$inp],#16
897	eor	$ivec0.16b,$ivec0.16b,@data[0].16b
898___
899	&rev32($ivec0,$ivec0);
900	&encrypt_1blk($ivec0);
901$code.=<<___;
902	st1	{$ivec0.4s},[$outp],#16
903	b	1b
9042:
905	// save back IV
906	st1	{$ivec0.4s},[$ivp]
907	ret
908
909.Ldec:
910	// decryption mode starts
911	AARCH64_SIGN_LINK_REGISTER
912	stp	d8,d9,[sp,#-80]!
913	stp	d10,d11,[sp,#16]
914	stp	d12,d13,[sp,#32]
915	stp	d14,d15,[sp,#48]
916	stp	x29,x30,[sp,#64]
917.Lcbc_8_blocks_dec:
918	cmp	$blocks,#8
919	b.lt	1f
920	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
921	add	$ptr,$inp,#64
922	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
923___
924	&rev32(@data[0],@data[0]);
925	&rev32(@data[1],@data[1]);
926	&rev32(@data[2],@data[2]);
927	&rev32(@data[3],$data[3]);
928	&rev32(@datax[0],@datax[0]);
929	&rev32(@datax[1],@datax[1]);
930	&rev32(@datax[2],@datax[2]);
931	&rev32(@datax[3],$datax[3]);
932$code.=<<___;
933	bl	_vpsm4_enc_8blks
934___
935	&transpose(@vtmp,@datax);
936	&transpose(@data,@datax);
937$code.=<<___;
938	ld1	{$ivec1.4s},[$ivp]
939	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
940	// note: ivec1 and vtmpx[3] reuse the same register (v15),
941	// so care must be taken to avoid a conflict
942	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
943	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
944	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
945	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
946	eor	@vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
947	// save back IV
948	st1	{$vtmpx[3].4s}, [$ivp]
949	eor	@data[0].16b,@data[0].16b,$datax[3].16b
950	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b
951	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b
952	eor	@data[3].16b,$data[3].16b,@vtmpx[2].16b
953	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
954	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
955	subs	$blocks,$blocks,#8
956	b.gt	.Lcbc_8_blocks_dec
957	b.eq	100f
9581:
959	ld1	{$ivec1.4s},[$ivp]
960.Lcbc_4_blocks_dec:
961	cmp	$blocks,#4
962	b.lt	1f
963	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
964___
965	&rev32(@data[0],@data[0]);
966	&rev32(@data[1],@data[1]);
967	&rev32(@data[2],@data[2]);
968	&rev32(@data[3],$data[3]);
969$code.=<<___;
970	bl	_vpsm4_enc_4blks
971	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
972___
973	&transpose(@vtmp,@datax);
974$code.=<<___;
975	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
976	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
977	orr	$ivec1.16b,@data[3].16b,@data[3].16b
978	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
979	eor	@vtmp[3].16b,$vtmp[3].16b,@data[2].16b
980	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
981	subs	$blocks,$blocks,#4
982	b.gt	.Lcbc_4_blocks_dec
983	// save back IV
984	st1	{@data[3].4s}, [$ivp]
985	b	100f
9861:	// last block
987	subs	$blocks,$blocks,#1
988	b.lt	100f
989	b.gt	1f
990	ld1	{@data[0].4s},[$inp],#16
991	// save back IV
992	st1	{$data[0].4s}, [$ivp]
993___
994	&rev32(@datax[0],@data[0]);
995	&encrypt_1blk(@datax[0]);
996$code.=<<___;
997	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b
998	st1	{@datax[0].4s},[$outp],#16
999	b	100f
10001:	// last two blocks
1001	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
1002	add	$ptr,$inp,#16
1003	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
1004	subs	$blocks,$blocks,1
1005	b.gt	1f
1006___
1007	&rev32(@data[0],@data[0]);
1008	&rev32(@data[1],@data[1]);
1009	&rev32(@data[2],@data[2]);
1010	&rev32(@data[3],@data[3]);
1011$code.=<<___;
1012	bl	_vpsm4_enc_4blks
1013	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
1014___
1015	&transpose(@vtmp,@datax);
1016$code.=<<___;
1017	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1018	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1019	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1020	// save back IV
1021	st1	{@data[1].4s}, [$ivp]
1022	b	100f
10231:	// last 3 blocks
1024	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1025___
1026	&rev32(@data[0],@data[0]);
1027	&rev32(@data[1],@data[1]);
1028	&rev32(@data[2],@data[2]);
1029	&rev32(@data[3],@data[3]);
1030$code.=<<___;
1031	bl	_vpsm4_enc_4blks
1032	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1033___
1034	&transpose(@vtmp,@datax);
1035$code.=<<___;
1036	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1037	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1038	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1039	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1040	// save back IV
1041	st1	{@data[2].4s}, [$ivp]
1042100:
1043	ldp	d10,d11,[sp,#16]
1044	ldp	d12,d13,[sp,#32]
1045	ldp	d14,d15,[sp,#48]
1046	ldp	x29,x30,[sp,#64]
1047	ldp	d8,d9,[sp],#80
1048	AARCH64_VALIDATE_LINK_REGISTER
1049	ret
1050.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1051___
1052}}}
1053
1054{{{
1055my ($ivp)=("x4");
1056my ($ctr)=("w5");
1057my $ivec=("v3");
1058
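# CTR: per the ctr32 convention only the last 32-bit word of the counter
# block (lane 3 of the IV) is incremented between blocks.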
1059$code.=<<___;
1060.globl	${prefix}_ctr32_encrypt_blocks
1061.type	${prefix}_ctr32_encrypt_blocks,%function
1062.align	5
1063${prefix}_ctr32_encrypt_blocks:
1064	AARCH64_VALID_CALL_TARGET
1065	ld1	{$ivec.4s},[$ivp]
1066___
1067	&rev32($ivec,$ivec);
1068	&load_sbox();
1069$code.=<<___;
1070	cmp	$blocks,#1
1071	b.ne	1f
1072	// fast path for a single block, avoiding the
1073	// register save/restore overhead
1074___
1075	&encrypt_1blk($ivec);
1076$code.=<<___;
1077	ld1	{@data[0].4s},[$inp]
1078	eor	@data[0].16b,@data[0].16b,$ivec.16b
1079	st1	{@data[0].4s},[$outp]
1080	ret
10811:
1082	AARCH64_SIGN_LINK_REGISTER
1083	stp	d8,d9,[sp,#-80]!
1084	stp	d10,d11,[sp,#16]
1085	stp	d12,d13,[sp,#32]
1086	stp	d14,d15,[sp,#48]
1087	stp	x29,x30,[sp,#64]
1088	mov	$word0,$ivec.s[0]
1089	mov	$word1,$ivec.s[1]
1090	mov	$word2,$ivec.s[2]
1091	mov	$ctr,$ivec.s[3]
1092.Lctr32_4_blocks_process:
1093	cmp	$blocks,#4
1094	b.lt	1f
1095	dup	@data[0].4s,$word0
1096	dup	@data[1].4s,$word1
1097	dup	@data[2].4s,$word2
1098	mov	@data[3].s[0],$ctr
1099	add	$ctr,$ctr,#1
1100	mov	$data[3].s[1],$ctr
1101	add	$ctr,$ctr,#1
1102	mov	@data[3].s[2],$ctr
1103	add	$ctr,$ctr,#1
1104	mov	@data[3].s[3],$ctr
1105	add	$ctr,$ctr,#1
1106	cmp	$blocks,#8
1107	b.ge	.Lctr32_8_blocks_process
1108	bl	_vpsm4_enc_4blks
1109	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1110	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1111	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1112	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1113	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1114	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1115	subs	$blocks,$blocks,#4
1116	b.ne	.Lctr32_4_blocks_process
1117	b	100f
1118.Lctr32_8_blocks_process:
1119	dup	@datax[0].4s,$word0
1120	dup	@datax[1].4s,$word1
1121	dup	@datax[2].4s,$word2
1122	mov	@datax[3].s[0],$ctr
1123	add	$ctr,$ctr,#1
1124	mov	$datax[3].s[1],$ctr
1125	add	$ctr,$ctr,#1
1126	mov	@datax[3].s[2],$ctr
1127	add	$ctr,$ctr,#1
1128	mov	@datax[3].s[3],$ctr
1129	add	$ctr,$ctr,#1
1130	bl	_vpsm4_enc_8blks
1131	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1132	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1133	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1134	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1135	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1136	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1137	eor	@data[0].16b,@data[0].16b,@datax[0].16b
1138	eor	@data[1].16b,@data[1].16b,@datax[1].16b
1139	eor	@data[2].16b,@data[2].16b,@datax[2].16b
1140	eor	@data[3].16b,@data[3].16b,@datax[3].16b
1141	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1142	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1143	subs	$blocks,$blocks,#8
1144	b.ne	.Lctr32_4_blocks_process
1145	b	100f
11461:	// last block processing
1147	subs	$blocks,$blocks,#1
1148	b.lt	100f
1149	b.gt	1f
1150	mov	$ivec.s[0],$word0
1151	mov	$ivec.s[1],$word1
1152	mov	$ivec.s[2],$word2
1153	mov	$ivec.s[3],$ctr
1154___
1155	&encrypt_1blk($ivec);
1156$code.=<<___;
1157	ld1	{@data[0].4s},[$inp]
1158	eor	@data[0].16b,@data[0].16b,$ivec.16b
1159	st1	{@data[0].4s},[$outp]
1160	b	100f
11611:	// last 2 blocks processing
1162	dup	@data[0].4s,$word0
1163	dup	@data[1].4s,$word1
1164	dup	@data[2].4s,$word2
1165	mov	@data[3].s[0],$ctr
1166	add	$ctr,$ctr,#1
1167	mov	@data[3].s[1],$ctr
1168	subs	$blocks,$blocks,#1
1169	b.ne	1f
1170	bl	_vpsm4_enc_4blks
1171	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1172	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1173	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1174	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1175	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1176	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1177	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1178	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1179	b	100f
11801:	// last 3 blocks processing
1181	add	$ctr,$ctr,#1
1182	mov	@data[3].s[2],$ctr
1183	bl	_vpsm4_enc_4blks
1184	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1185	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1186	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1187	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1188	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1189	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1190	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1191	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1192	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1193	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1194100:
1195	ldp	d10,d11,[sp,#16]
1196	ldp	d12,d13,[sp,#32]
1197	ldp	d14,d15,[sp,#48]
1198	ldp	x29,x30,[sp,#64]
1199	ldp	d8,d9,[sp],#80
1200	AARCH64_VALIDATE_LINK_REGISTER
1201	ret
1202.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1203___
1204}}}
1205
1206{{{
1207my ($blocks,$len)=("x2","x2");
1208my $ivp=("x5");
1209my @twx=map("x$_",(12..27));
1210my ($rks1,$rks2)=("x26","x27");
1211my $lastBlk=("x26");
1212my $enc=("w28");
1213my $remain=("x29");
1214
1215my @tweak=@datax;
1216
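# XTS: emitted twice, as ${prefix}_xts_encrypt (standard tweak schedule) and
# ${prefix}_xts_encrypt_gb (bit-reversed tweak multiplication).  The initial
# tweak is the IV encrypted under the second key schedule (rks2); input
# lengths that are not a multiple of 16 are finished with ciphertext
# stealing in .last_2blks_tweak/.only_2blks_tweak and the byte-swap loop.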
1217sub gen_xts_cipher() {
1218	my $std = shift;
1219$code.=<<___;
1220.globl	${prefix}_xts_encrypt${std}
1221.type	${prefix}_xts_encrypt${std},%function
1222.align	5
1223${prefix}_xts_encrypt${std}:
1224	AARCH64_SIGN_LINK_REGISTER
1225	stp	x15, x16, [sp, #-0x10]!
1226	stp	x17, x18, [sp, #-0x10]!
1227	stp	x19, x20, [sp, #-0x10]!
1228	stp	x21, x22, [sp, #-0x10]!
1229	stp	x23, x24, [sp, #-0x10]!
1230	stp	x25, x26, [sp, #-0x10]!
1231	stp	x27, x28, [sp, #-0x10]!
1232	stp	x29, x30, [sp, #-0x10]!
1233	stp	d8, d9, [sp, #-0x10]!
1234	stp	d10, d11, [sp, #-0x10]!
1235	stp	d12, d13, [sp, #-0x10]!
1236	stp	d14, d15, [sp, #-0x10]!
1237	mov	$rks1,x3
1238	mov	$rks2,x4
1239	mov	$enc,w6
1240	ld1	{@tweak[0].4s}, [$ivp]
1241	mov	$rks,$rks2
1242___
1243	&load_sbox();
1244	&rev32(@tweak[0],@tweak[0]);
1245	&encrypt_1blk(@tweak[0]);
1246$code.=<<___;
1247	mov	$rks,$rks1
1248	and	$remain,$len,#0x0F
1249	// convert length into blocks
1250	lsr	$blocks,$len,4
1251	cmp	$blocks,#1
1252	b.lt .return${std}
1253
1254	cmp $remain,0
1255	// If the encryption/decryption length is a multiple of 16,
1256	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1257	b.eq .xts_encrypt_blocks${std}
1258
1259	// If the encryption/decryption length is not a multiple of 16,
1260	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
1261	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1262	subs $blocks,$blocks,#1
1263	b.eq .only_2blks_tweak${std}
1264.xts_encrypt_blocks${std}:
1265___
1266	&rbit(@tweak[0],@tweak[0],$std);
1267	&rev32_armeb(@tweak[0],@tweak[0]);
1268	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1269	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1270	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1271	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1272	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1273	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1274	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1275	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1276$code.=<<___;
1277.Lxts_8_blocks_process${std}:
1278	cmp	$blocks,#8
1279	b.lt	.Lxts_4_blocks_process${std}
1280___
1281	&mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
1282	&mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
1283	&mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
1284	&mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
1285	&mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
1286	&mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
1287	&mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
1288	&mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
1289$code.=<<___;
1290	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1291___
1292	&rbit(@vtmp[0],@vtmp[0],$std);
1293	&rbit(@vtmp[1],@vtmp[1],$std);
1294	&rbit(@vtmp[2],@vtmp[2],$std);
1295	&rbit(@vtmp[3],@vtmp[3],$std);
1296$code.=<<___;
1297	eor @data[0].16b, @data[0].16b, @vtmp[0].16b
1298	eor @data[1].16b, @data[1].16b, @vtmp[1].16b
1299	eor @data[2].16b, @data[2].16b, @vtmp[2].16b
1300	eor @data[3].16b, @data[3].16b, @vtmp[3].16b
1301	ld1	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1302___
1303	&rbit(@vtmpx[0],@vtmpx[0],$std);
1304	&rbit(@vtmpx[1],@vtmpx[1],$std);
1305	&rbit(@vtmpx[2],@vtmpx[2],$std);
1306	&rbit(@vtmpx[3],@vtmpx[3],$std);
1307$code.=<<___;
1308	eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
1309	eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
1310	eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
1311	eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
1312___
1313	&rev32(@data[0],@data[0]);
1314	&rev32(@data[1],@data[1]);
1315	&rev32(@data[2],@data[2]);
1316	&rev32(@data[3],@data[3]);
1317	&rev32(@datax[0],@datax[0]);
1318	&rev32(@datax[1],@datax[1]);
1319	&rev32(@datax[2],@datax[2]);
1320	&rev32(@datax[3],@datax[3]);
1321	&transpose(@data,@vtmp);
1322	&transpose(@datax,@vtmp);
1323$code.=<<___;
1324	bl	_${prefix}_enc_8blks
1325___
1326	&transpose(@vtmp,@datax);
1327	&transpose(@data,@datax);
1328
1329	&mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
1330	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1331	&mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
1332	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1333	&mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
1334	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1335	&mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
1336	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1337	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1338	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1339	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1340	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1341	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1342	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1343	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
1344	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1345$code.=<<___;
1346	eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
1347	eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
1348	eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
1349	eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
1350	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1351	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1352	eor @data[2].16b, @data[2].16b, @tweak[2].16b
1353	eor @data[3].16b, @data[3].16b, @tweak[3].16b
1354
1355	// save the last tweak
1356	st1	{@tweak[3].4s},[$ivp]
1357	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1358	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1359	subs	$blocks,$blocks,#8
1360	b.gt	.Lxts_8_blocks_process${std}
1361	b	100f
1362.Lxts_4_blocks_process${std}:
1363___
1364	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1365	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1366	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1367	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1368$code.=<<___;
1369	cmp	$blocks,#4
1370	b.lt	1f
1371	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1372___
1373	&rbit(@tweak[0],@tweak[0],$std);
1374	&rbit(@tweak[1],@tweak[1],$std);
1375	&rbit(@tweak[2],@tweak[2],$std);
1376	&rbit(@tweak[3],@tweak[3],$std);
1377$code.=<<___;
1378	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1379	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1380	eor @data[2].16b, @data[2].16b, @tweak[2].16b
1381	eor @data[3].16b, @data[3].16b, @tweak[3].16b
1382___
1383	&rev32(@data[0],@data[0]);
1384	&rev32(@data[1],@data[1]);
1385	&rev32(@data[2],@data[2]);
1386	&rev32(@data[3],@data[3]);
1387	&transpose(@data,@vtmp);
1388$code.=<<___;
1389	bl	_${prefix}_enc_4blks
1390___
1391	&transpose(@vtmp,@data);
1392$code.=<<___;
1393	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1394	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1395	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1396	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1397	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1398	sub	$blocks,$blocks,#4
1399___
1400	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1401	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1402	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1403$code.=<<___;
1404	// save the last tweak
1405	st1	{@tweak[3].4s},[$ivp]
14061:
1407	// process last block
1408	cmp	$blocks,#1
1409	b.lt	100f
1410	b.gt	1f
1411	ld1	{@data[0].4s},[$inp],#16
1412___
1413	&rbit(@tweak[0],@tweak[0],$std);
1414$code.=<<___;
1415	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1416___
1417	&rev32(@data[0],@data[0]);
1418	&encrypt_1blk(@data[0]);
1419$code.=<<___;
1420	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1421	st1	{@data[0].4s},[$outp],#16
1422	// save the last tweak
1423	st1	{@tweak[0].4s},[$ivp]
1424	b	100f
14251:  // process last 2 blocks
1426	cmp	$blocks,#2
1427	b.gt	1f
1428	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
1429___
1430	&rbit(@tweak[0],@tweak[0],$std);
1431	&rbit(@tweak[1],@tweak[1],$std);
1432$code.=<<___;
1433	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1434	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1435___
1436	&rev32(@data[0],@data[0]);
1437	&rev32(@data[1],@data[1]);
1438	&transpose(@data,@vtmp);
1439$code.=<<___;
1440	bl	_${prefix}_enc_4blks
1441___
1442	&transpose(@vtmp,@data);
1443$code.=<<___;
1444	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1445	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1446	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1447	// save the last tweak
1448	st1	{@tweak[1].4s},[$ivp]
1449	b	100f
14501:  // process last 3 blocks
1451	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1452___
1453	&rbit(@tweak[0],@tweak[0],$std);
1454	&rbit(@tweak[1],@tweak[1],$std);
1455	&rbit(@tweak[2],@tweak[2],$std);
1456$code.=<<___;
1457	eor @data[0].16b, @data[0].16b, @tweak[0].16b
1458	eor @data[1].16b, @data[1].16b, @tweak[1].16b
1459	eor @data[2].16b, @data[2].16b, @tweak[2].16b
1460___
1461	&rev32(@data[0],@data[0]);
1462	&rev32(@data[1],@data[1]);
1463	&rev32(@data[2],@data[2]);
1464	&transpose(@data,@vtmp);
1465$code.=<<___;
1466	bl	_${prefix}_enc_4blks
1467___
1468	&transpose(@vtmp,@data);
1469$code.=<<___;
1470	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1471	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1472	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1473	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1474	// save the last tweak
1475	st1	{@tweak[2].4s},[$ivp]
1476100:
1477	cmp $remain,0
1478	b.eq .return${std}
1479
1480// This branch calculates the last two tweaks
1481// when the encryption/decryption length is larger than 32
1482.last_2blks_tweak${std}:
1483	ld1	{@tweak[0].4s},[$ivp]
1484___
1485	&rev32_armeb(@tweak[0],@tweak[0]);
1486	&compute_tweak_vec(@tweak[0],@tweak[1],$std);
1487	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
1488$code.=<<___;
1489	b .check_dec${std}
1490
1491
1492// This branch calculates the last two tweaks
1493// when the encryption/decryption length is exactly 32, which needs only these two tweaks
1494.only_2blks_tweak${std}:
1495	mov @tweak[1].16b,@tweak[0].16b
1496___
1497	&rev32_armeb(@tweak[1],@tweak[1]);
1498	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
1499$code.=<<___;
1500	b .check_dec${std}
1501
1502
1503// Determine whether encryption or decryption is required.
1504// The last two tweaks need to be swapped for decryption.
1505.check_dec${std}:
1506	// encryption:1 decryption:0
1507	cmp $enc,1
1508	b.eq .process_last_2blks${std}
1509	mov @vtmp[0].16b,@tweak[1].16b
1510	mov @tweak[1].16b,@tweak[2].16b
1511	mov @tweak[2].16b,@vtmp[0].16b
1512
1513.process_last_2blks${std}:
1514___
1515	&rev32_armeb(@tweak[1],@tweak[1]);
1516	&rev32_armeb(@tweak[2],@tweak[2]);
1517$code.=<<___;
1518	ld1	{@data[0].4s},[$inp],#16
1519	eor @data[0].16b, @data[0].16b, @tweak[1].16b
1520___
1521	&rev32(@data[0],@data[0]);
1522	&encrypt_1blk(@data[0]);
1523$code.=<<___;
1524	eor @data[0].16b, @data[0].16b, @tweak[1].16b
1525	st1	{@data[0].4s},[$outp],#16
1526
1527	sub $lastBlk,$outp,16
1528	.loop${std}:
1529		subs $remain,$remain,1
1530		ldrb	$wtmp0,[$lastBlk,$remain]
1531		ldrb	$wtmp1,[$inp,$remain]
1532		strb	$wtmp1,[$lastBlk,$remain]
1533		strb	$wtmp0,[$outp,$remain]
1534	b.gt .loop${std}
1535	ld1		{@data[0].4s}, [$lastBlk]
1536	eor @data[0].16b, @data[0].16b, @tweak[2].16b
1537___
1538	&rev32(@data[0],@data[0]);
1539	&encrypt_1blk(@data[0]);
1540$code.=<<___;
1541	eor @data[0].16b, @data[0].16b, @tweak[2].16b
1542	st1		{@data[0].4s}, [$lastBlk]
1543.return${std}:
1544	ldp		d14, d15, [sp], #0x10
1545	ldp		d12, d13, [sp], #0x10
1546	ldp		d10, d11, [sp], #0x10
1547	ldp		d8, d9, [sp], #0x10
1548	ldp		x29, x30, [sp], #0x10
1549	ldp		x27, x28, [sp], #0x10
1550	ldp		x25, x26, [sp], #0x10
1551	ldp		x23, x24, [sp], #0x10
1552	ldp		x21, x22, [sp], #0x10
1553	ldp		x19, x20, [sp], #0x10
1554	ldp		x17, x18, [sp], #0x10
1555	ldp		x15, x16, [sp], #0x10
1556	AARCH64_VALIDATE_LINK_REGISTER
1557	ret
1558.size	${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1559___
1560} # end of gen_xts_cipher
1561&gen_xts_cipher("_gb");
1562&gen_xts_cipher("");
1563}}}
1564########################################
1565open SELF,$0;
1566while(<SELF>) {
1567        next if (/^#!/);
1568        last if (!s/^#/\/\// and !/^$/);
1569        print;
1570}
1571close SELF;
1572
1573foreach(split("\n",$code)) {
1574	s/\`([^\`]*)\`/eval($1)/ge;
1575	print $_,"\n";
1576}
1577
1578close STDOUT or die "error closing STDOUT: $!";
1579