#! /usr/bin/env perl
# Copyright 2022-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD and AESE on AARCH64
#
# Dec 2022
#

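# Editorial note on the approach (summary, not part of the original header):
# the SM4 S-box is evaluated with the AES instruction AESE.  AESE with an
# all-zero round key performs AddRoundKey(0), SubBytes and ShiftRows; a tbl
# with an inverse-ShiftRows index vector applied beforehand cancels the
# ShiftRows step, and two table-driven affine transforms (see mul_matrix
# below) map between the SM4 and AES S-box representations.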
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="vpsm4_ex";
my @vtmp=map("v$_",(0..3));
my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my ($vtmp4,$vtmp5)=("v24","v25");
my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");

my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

sub rev32() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
$code.=<<___;
#ifndef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}

sub rev32_armeb() {
	my $dst = shift;
	my $src = shift;

	if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$src.16b
#else
	mov	$dst.16b,$src.16b
#endif
___
	} else {
$code.=<<___;
#ifdef __AARCH64EB__
	rev32	$dst.16b,$dst.16b
#endif
___
	}
}

sub rbit() {
	my $dst = shift;
	my $src = shift;
	my $std = shift;

	if ($src and ("$src" ne "$dst")) {
		if ($std eq "_gb") {
$code.=<<___;
			rbit $dst.16b,$src.16b
___
		} else {
$code.=<<___;
			mov $dst.16b,$src.16b
___
		}
	} else {
		if ($std eq "_gb") {
$code.=<<___;
			rbit $dst.16b,$src.16b
___
		}
	}
}

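# Note: rbit (bit reversal within each byte) is only emitted for the "_gb"
# XTS variant; for the standard variant this helper degenerates into a plain
# register move, or into nothing when source and destination are the same.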
sub transpose() {
	my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
	zip1	$vt0.4s,$dat0.4s,$dat1.4s
	zip2	$vt1.4s,$dat0.4s,$dat1.4s
	zip1	$vt2.4s,$dat2.4s,$dat3.4s
	zip2	$vt3.4s,$dat2.4s,$dat3.4s
	zip1	$dat0.2d,$vt0.2d,$vt2.2d
	zip2	$dat1.2d,$vt0.2d,$vt2.2d
	zip1	$dat2.2d,$vt1.2d,$vt3.2d
	zip2	$dat3.2d,$vt1.2d,$vt3.2d
___
}

# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
sub mul_matrix() {
	my $x = shift;
	my $higherMat = shift;
	my $lowerMat = shift;
	my $tmp = shift;
$code.=<<___;
	ushr	$tmp.16b, $x.16b, 4
	and		$x.16b, $x.16b, $ANDMaskV.16b
	tbl		$x.16b, {$lowerMat.16b}, $x.16b
	tbl		$tmp.16b, {$higherMat.16b}, $tmp.16b
	eor		$x.16b, $x.16b, $tmp.16b
___
}

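# Editorial note: the "matrix multiplication" above is a constant 8x8 binary
# matrix applied to every byte.  The byte is split into its two nibbles; the
# low nibble indexes the 16-entry lowerMat table, the high nibble (shifted
# down by ushr #4) indexes the higherMat table, and the two partial results
# are combined with eor.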
# sbox operation for 4 lanes of words
sub sbox() {
	my $dat = shift;

$code.=<<___;
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese @vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
$code.=<<___;
	mov	$dat.16b,@vtmp[0].16b

	// linear transformation
	ushr	@vtmp[0].4s,$dat.4s,32-2
	ushr	@vtmp[1].4s,$dat.4s,32-10
	ushr	@vtmp[2].4s,$dat.4s,32-18
	ushr	@vtmp[3].4s,$dat.4s,32-24
	sli	@vtmp[0].4s,$dat.4s,2
	sli	@vtmp[1].4s,$dat.4s,10
	sli	@vtmp[2].4s,$dat.4s,18
	sli	@vtmp[3].4s,$dat.4s,24
	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b
	eor	$vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$dat.16b,$dat.16b,$vtmp4.16b
___
}

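# Editorial note on the AESE trick: the tbl with $MaskV applies the inverse
# ShiftRows permutation, so after AESE (AddRoundKey with a zero key,
# SubBytes, ShiftRows) only the AES SubBytes effect remains; the two
# mul_matrix calls are the input and output affine mappings that turn the
# AES S-box into the SM4 S-box.  The ushr/sli pairs then compute the SM4
# linear transform L(B) = B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24).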
# sbox operation for 8 lanes of words
sub sbox_double() {
	my $dat = shift;
	my $datx = shift;

$code.=<<___;
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {$dat.16b}, $MaskV.16b
	tbl	@vtmp[1].16b, {$datx.16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
	&mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
$code.=<<___;
	eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
	aese @vtmp[0].16b,$vtmp5.16b
	aese @vtmp[1].16b,$vtmp5.16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
	&mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
$code.=<<___;
	mov	$dat.16b,@vtmp[0].16b
	mov	$datx.16b,@vtmp[1].16b

	// linear transformation
	ushr	@vtmp[0].4s,$dat.4s,32-2
	ushr	$vtmp5.4s,$datx.4s,32-2
	ushr	@vtmp[1].4s,$dat.4s,32-10
	ushr	@vtmp[2].4s,$dat.4s,32-18
	ushr	@vtmp[3].4s,$dat.4s,32-24
	sli	@vtmp[0].4s,$dat.4s,2
	sli	$vtmp5.4s,$datx.4s,2
	sli	@vtmp[1].4s,$dat.4s,10
	sli	@vtmp[2].4s,$dat.4s,18
	sli	@vtmp[3].4s,$dat.4s,24
	eor	$vtmp4.16b,@vtmp[0].16b,$dat.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$dat.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$dat.16b,$dat.16b,$vtmp4.16b
	ushr	@vtmp[1].4s,$datx.4s,32-10
	ushr	@vtmp[2].4s,$datx.4s,32-18
	ushr	@vtmp[3].4s,$datx.4s,32-24
	sli	@vtmp[1].4s,$datx.4s,10
	sli	@vtmp[2].4s,$datx.4s,18
	sli	@vtmp[3].4s,$datx.4s,24
	eor	$vtmp4.16b,$vtmp5.16b,$datx.16b
	eor	$vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
	eor	$datx.16b,@vtmp[2].16b,@vtmp[3].16b
	eor	$datx.16b,$datx.16b,$vtmp4.16b
___
}

# sbox operation for one single word
sub sbox_1word () {
	my $word = shift;

$code.=<<___;
	mov	@vtmp[3].s[0],$word
	// optimize sbox using AESE instruction
	tbl	@vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese @vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;

	mov	$wtmp0,@vtmp[0].s[0]
	eor	$word,$wtmp0,$wtmp0,ror #32-2
	eor	$word,$word,$wtmp0,ror #32-10
	eor	$word,$word,$wtmp0,ror #32-18
	eor	$word,$word,$wtmp0,ror #32-24
___
}

# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
	my $kptr = shift;

$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$wtmp0,$word1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor	$word0,$word0,$tmpw
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$tmpw,$word2,$word3
	eor	$wtmp2,$word0,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	$word1,$word1,$tmpw
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$wtmp0,$word3
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor	$word2,$word2,$tmpw
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$tmpw,$word0,$word1
	eor	$wtmp2,$word2,$wtmp1
	eor	$tmpw,$tmpw,$wtmp2
___
	&sbox_1word($tmpw);
$code.=<<___;
	eor	$word3,$word3,$tmpw
___
}

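# Editorial note: each ldp above fetches two round keys, and each
# eor/sbox_1word/eor group is one SM4 round updating one of the four state
# words; the callers execute this four-round sequence eight times, giving
# the 32 rounds of the cipher.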
# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
	my $kptr = shift;

$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rk0.16b,@data[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk1.16b

	dup	$rk0.4s,$wtmp0
	dup	$rk1.4s,$wtmp1

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rk0.16b,@data[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,$rk0.16b
___
	&sbox($rk0);
$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rk1.16b,$rka.16b,$rk1.16b
___
	&sbox($rk1);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
	my $kptr = shift;

$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[2].16b,@data[3].16b
	eor	$rkb.16b,@datax[2].16b,@datax[3].16b
	eor	@vtmp[0].16b,@data[1].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[1].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor	@data[0].16b,@data[0].16b,$rk0.16b
	eor	@datax[0].16b,@datax[0].16b,$rk1.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[0].16b
	eor	$rkb.16b,$rkb.16b,@datax[0].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	ldp	$wtmp0,$wtmp1,[$kptr],8
	eor	@data[1].16b,@data[1].16b,$rk0.16b
	eor	@datax[1].16b,@datax[1].16b,$rk1.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup	$rk0.4s,$wtmp0
	eor	$rka.16b,@data[0].16b,@data[1].16b
	eor	$rkb.16b,@datax[0].16b,@datax[1].16b
	eor	@vtmp[0].16b,@data[3].16b,$rk0.16b
	eor	@vtmp[1].16b,@datax[3].16b,$rk0.16b
	eor	$rk0.16b,$rka.16b,@vtmp[0].16b
	eor	$rk1.16b,$rkb.16b,@vtmp[1].16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor	@data[2].16b,@data[2].16b,$rk0.16b
	eor	@datax[2].16b,@datax[2].16b,$rk1.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup	$rk1.4s,$wtmp1
	eor	$rka.16b,$rka.16b,@data[2].16b
	eor	$rkb.16b,$rkb.16b,@datax[2].16b
	eor	$rk0.16b,$rka.16b,$rk1.16b
	eor	$rk1.16b,$rkb.16b,$rk1.16b
___
	&sbox_double($rk0,$rk1);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,$rk0.16b
	eor	@datax[3].16b,@datax[3].16b,$rk1.16b
___
}

sub encrypt_1blk_norev() {
	my $dat = shift;

$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
	mov	$word0,$dat.s[0]
	mov	$word1,$dat.s[1]
	mov	$word2,$dat.s[2]
	mov	$word3,$dat.s[3]
10:
___
	&sm4_1blk($ptr);
$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
	mov	$dat.s[0],$word3
	mov	$dat.s[1],$word2
	mov	$dat.s[2],$word1
	mov	$dat.s[3],$word0
___
}

sub encrypt_1blk() {
	my $dat = shift;

	&encrypt_1blk_norev($dat);
	&rev32($dat,$dat);
}

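# Editorial note: after the 32 rounds the four state words are written back
# in reverse order (word3..word0), which is the final reverse transform R of
# SM4; encrypt_1blk additionally converts the result back to little-endian
# byte order with rev32.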
sub encrypt_4blks() {
$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_4blks($ptr);
$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
	mov	$ptr,$rks
	mov	$counter,#8
10:
___
	&sm4_8blks($ptr);
$code.=<<___;
	subs	$counter,$counter,#1
	b.ne	10b
___
	&rev32(@vtmp[3],@data[0]);
	&rev32(@vtmp[2],@data[1]);
	&rev32(@vtmp[1],@data[2]);
	&rev32(@vtmp[0],@data[3]);
	&rev32(@data[3],@datax[0]);
	&rev32(@data[2],@datax[1]);
	&rev32(@data[1],@datax[2]);
	&rev32(@data[0],@datax[3]);
}

sub load_sbox () {
	my $data = shift;

$code.=<<___;
	ldr $MaskQ, .Lsbox_magic
	ldr $TAHMatQ, .Lsbox_magic+16
	ldr $TALMatQ, .Lsbox_magic+32
	ldr $ATAHMatQ, .Lsbox_magic+48
	ldr $ATALMatQ, .Lsbox_magic+64
	ldr $ANDMaskQ, .Lsbox_magic+80
___
}

sub mov_reg_to_vec() {
	my $src0 = shift;
	my $src1 = shift;
	my $desv = shift;
$code.=<<___;
	mov $desv.d[0],$src0
	mov $desv.d[1],$src1
___
	&rev32_armeb($desv,$desv);
}

sub mov_vec_to_reg() {
	my $srcv = shift;
	my $des0 = shift;
	my $des1 = shift;
$code.=<<___;
	mov $des0,$srcv.d[0]
	mov $des1,$srcv.d[1]
___
}

sub compute_tweak() {
	my $src0 = shift;
	my $src1 = shift;
	my $des0 = shift;
	my $des1 = shift;
$code.=<<___;
	mov $wtmp0,0x87
	extr	$xtmp2,$src1,$src1,#32
	extr	$des1,$src1,$src0,#63
	and	$wtmp1,$wtmp0,$wtmp2,asr#31
	eor	$des0,$xtmp1,$src0,lsl#1
___
}

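# Editorial note: this is the scalar XTS tweak update, i.e. multiplication
# by x in GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1.
# The 128-bit tweak in $src1:$src0 is shifted left by one bit, and 0x87 is
# XORed into the low byte when the bit shifted out of the top was set (the
# extr/asr pair turns that top bit into a 0x00-or-0x87 mask).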
sub compute_tweak_vec() {
	my $src = shift;
	my $des = shift;
	my $std = shift;
	&rbit(@vtmp[2],$src,$std);
$code.=<<___;
	ldr  @qtmp[0], .Lxts_magic
	shl  $des.16b, @vtmp[2].16b, #1
	ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
	ushr @vtmp[1].16b, @vtmp[1].16b, #7
	mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
	eor  $des.16b, $des.16b, @vtmp[1].16b
___
	&rbit($des,$des,$std);
}

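# Editorial note: vector variant of the same doubling, folding the per-byte
# carries with the .Lxts_magic constant.  For the "_gb" entry points the
# tweak is bit-reversed (rbit) before and after the multiplication to match
# the bit ordering used by the GB/T XTS variant; for the standard variant
# the rbit helper emits a plain mov instead.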
$code=<<___;
#include "arm_arch.h"
.arch	armv8-a+crypto
.text

.type	_${prefix}_consts,%object
.align	7
_${prefix}_consts:
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
	.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
	.quad 0x0101010101010187,0x0101010101010101
.Lsbox_magic:
	.quad 0x0b0e0104070a0d00,0x0306090c0f020508
	.quad 0x62185a2042387a00,0x22581a6002783a40
	.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
	.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
	.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
	.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f

.size	_${prefix}_consts,.-_${prefix}_consts
___

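# Editorial summary of the constants: .Lck holds the 32 SM4 key-schedule
# constants CK (each byte is 7*(4i+j) mod 256) and .Lfk the four FK words;
# .Lshuffles is the byte rotation used by the key schedule; .Lxts_magic is
# the XTS feedback constant; .Lsbox_magic supplies, in load_sbox order, the
# inverse-ShiftRows index vector ($MaskV), the nibble tables of the affine
# map applied before AESE ($TAHMatV/$TALMatV), the tables of the affine map
# applied after AESE ($ATAHMatV/$ATALMatV), and the 0x0f nibble mask
# ($ANDMaskV).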
{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
$code.=<<___;
.type	_${prefix}_set_key,%function
.align	4
_${prefix}_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$vkey.4s},[$key]
___
	&load_sbox();
	&rev32($vkey,$vkey);
$code.=<<___;
	adr	$pointer,.Lshuffles
	ld1	{$vmap.2d},[$pointer]
	adr	$pointer,.Lfk
	ld1	{$vfk.2d},[$pointer]
	eor	$vkey.16b,$vkey.16b,$vfk.16b
	mov	$schedules,#32
	adr	$pointer,.Lck
	movi	@vtmp[0].16b,#64
	cbnz	$enc,1f
	add	$keys,$keys,124
1:
	mov	$wtmp,$vkey.s[1]
	ldr	$roundkey,[$pointer],#4
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[2]
	eor	$roundkey,$roundkey,$wtmp
	mov	$wtmp,$vkey.s[3]
	eor	$roundkey,$roundkey,$wtmp
	// optimize sbox using AESE instruction
	mov	@data[0].s[0],$roundkey
	tbl	@vtmp[0].16b, {@data[0].16b}, $MaskV.16b
___
	&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
$code.=<<___;
	eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
	aese @vtmp[0].16b,@vtmp[1].16b
___
	&mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
$code.=<<___;
	mov	$wtmp,@vtmp[0].s[0]
	eor	$roundkey,$wtmp,$wtmp,ror #19
	eor	$roundkey,$roundkey,$wtmp,ror #9
	mov	$wtmp,$vkey.s[0]
	eor	$roundkey,$roundkey,$wtmp
	mov	$vkey.s[0],$roundkey
	cbz	$enc,2f
	str	$roundkey,[$keys],#4
	b	3f
2:
	str	$roundkey,[$keys],#-4
3:
	tbl	$vkey.16b,{$vkey.16b},$vmap.16b
	subs	$schedules,$schedules,#1
	b.ne	1b
	ret
.size	_${prefix}_set_key,.-_${prefix}_set_key
___
}}}

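# Editorial notes on the key schedule: the loop derives the 32 round keys;
# "ror #19" and "ror #9" on a 32-bit value are rotate-left by 13 and 23,
# i.e. the L' transform of the SM4 key schedule.  For encryption the keys
# are stored forwards; for decryption ($enc == 0) the output pointer is
# moved to the last slot (+124) and the keys are stored backwards, so the
# same round loop serves both directions.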

{{{
$code.=<<___;
.type	_${prefix}_enc_4blks,%function
.align	4
_${prefix}_enc_4blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_4blks();
$code.=<<___;
	ret
.size	_${prefix}_enc_4blks,.-_${prefix}_enc_4blks
___
}}}

{{{
$code.=<<___;
.type	_${prefix}_enc_8blks,%function
.align	4
_${prefix}_enc_8blks:
	AARCH64_VALID_CALL_TARGET
___
	&encrypt_8blks();
$code.=<<___;
	ret
.size	_${prefix}_enc_8blks,.-_${prefix}_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,1
	bl	_${prefix}_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	mov	w2,0
	bl	_${prefix}_set_key
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
sub gen_block () {
	my $dir = shift;
	my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@data[0].4s},[$inp]
___
	&load_sbox();
	&rev32(@data[0],@data[0]);
$code.=<<___;
	mov	$rks,$rk
___
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

{{{
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr	x2,x2,4
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
___
	&load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
	cmp	$blocks,#8
	b.lt	.Lecb_4_blocks_process
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	ld4	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
$code.=<<___;
	bl	_${prefix}_enc_8blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lecb_8_blocks_process
	b	100f
.Lecb_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// process last 2 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
	cmp	$blocks,#2
	b.gt	1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp]
	b	100f
1:	// process last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	st4	{@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

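# Editorial note on the ECB path: ld4/st4 load and store four blocks
# de-interleaved, so each vector register holds the same word position of
# four different blocks and the 4- and 8-lane round functions can process
# 4 or 8 blocks in parallel.  Tails of 1, 2 or 3 blocks are handled by the
# numeric local labels above, loading individual 32-bit lanes instead.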
{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr	$len,$len,4
___
	&load_sbox();
$code.=<<___;
	cbz	$enc,.Ldec
	ld1	{$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
	eor	@data[0].16b,@data[0].16b,$ivec0.16b
___
	&rev32(@data[1],@data[1]);
	&rev32(@data[0],@data[0]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&encrypt_1blk_norev(@data[0]);
$code.=<<___;
	eor	@data[1].16b,@data[1].16b,@data[0].16b
___
	&encrypt_1blk_norev(@data[1]);
	&rev32(@data[0],@data[0]);

$code.=<<___;
	eor	@data[2].16b,@data[2].16b,@data[1].16b
___
	&encrypt_1blk_norev(@data[2]);
	&rev32(@data[1],@data[1]);
$code.=<<___;
	eor	@data[3].16b,@data[3].16b,@data[2].16b
___
	&encrypt_1blk_norev(@data[3]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	orr	$ivec0.16b,@data[3].16b,@data[3].16b
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lcbc_4_blocks_enc
	b	2f
1:
	subs	$blocks,$blocks,#1
	b.lt	2f
	ld1	{@data[0].4s},[$inp],#16
	eor	$ivec0.16b,$ivec0.16b,@data[0].16b
___
	&rev32($ivec0,$ivec0);
	&encrypt_1blk($ivec0);
$code.=<<___;
	st1	{$ivec0.4s},[$outp],#16
	b	1b
2:
	// save back IV
	st1	{$ivec0.4s},[$ivp]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp	$blocks,#8
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
	add	$ptr,$inp,#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],$data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],$datax[3]);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	ld1	{$ivec1.4s},[$ivp]
	ld1	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	ld1	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
	eor	@vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
	// save back IV
	st1	{$vtmpx[3].4s}, [$ivp]
	eor	@data[0].16b,@data[0].16b,$datax[3].16b
	eor	@data[1].16b,@data[1].16b,@vtmpx[0].16b
	eor	@data[2].16b,@data[2].16b,@vtmpx[1].16b
	eor	@data[3].16b,$data[3].16b,@vtmpx[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lcbc_8_blocks_dec
	b.eq	100f
1:
	ld1	{$ivec1.4s},[$ivp]
.Lcbc_4_blocks_dec:
	cmp	$blocks,#4
	b.lt	1f
	ld4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],$data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	orr	$ivec1.16b,@data[3].16b,@data[3].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	eor	@vtmp[3].16b,$vtmp[3].16b,@data[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.gt	.Lcbc_4_blocks_dec
	// save back IV
	st1	{@data[3].4s}, [$ivp]
	b	100f
1:	// last block
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
	// save back IV
	st1	{$data[0].4s}, [$ivp]
___
	&rev32(@datax[0],@data[0]);
	&encrypt_1blk(@datax[0]);
$code.=<<___;
	eor	@datax[0].16b,@datax[0].16b,$ivec1.16b
	st1	{@datax[0].4s},[$outp],#16
	b	100f
1:	// last two blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
	add	$ptr,$inp,#16
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
	subs	$blocks,$blocks,1
	b.gt	1f
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save back IV
	st1	{@data[1].4s}, [$ivp]
	b	100f
1:	// last 3 blocks
	ld4	{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
$code.=<<___;
	bl	_${prefix}_enc_4blks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&transpose(@vtmp,@datax);
$code.=<<___;
	eor	@vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@data[0].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@data[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save back IV
	st1	{@data[2].4s}, [$ivp]
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

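# Editorial note on the CBC path: encryption is inherently serial (each
# block is chained through the previous ciphertext), so blocks are encrypted
# one at a time even inside the 4-block loop, which only batches the loads
# and stores.  Decryption has every ciphertext block available up front, so
# it uses the 8- and 4-lane round functions and XORs the preceding
# ciphertext blocks in afterwards.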
{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1	{$ivec.4s},[$ivp]
___
	&rev32($ivec,$ivec);
	&load_sbox();
$code.=<<___;
	cmp	$blocks,#1
	b.ne	1f
	// fast processing for one single block without
	// context saving overhead
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1	{@data[0].4s},[$inp]
	eor	@data[0].16b,@data[0].16b,$ivec.16b
	st1	{@data[0].4s},[$outp]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,#-80]!
	stp	d10,d11,[sp,#16]
	stp	d12,d13,[sp,#32]
	stp	d14,d15,[sp,#48]
	stp	x29,x30,[sp,#64]
	mov	$word0,$ivec.s[0]
	mov	$word1,$ivec.s[1]
	mov	$word2,$ivec.s[2]
	mov	$ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
	cmp	$blocks,#4
	b.lt	1f
	dup	@data[0].4s,$word0
	dup	@data[1].4s,$word1
	dup	@data[2].4s,$word2
	mov	@data[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	$data[3].s[1],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[2],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[3],$ctr
	add	$ctr,$ctr,#1
	cmp	$blocks,#8
	b.ge	.Lctr32_8_blocks_process
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	subs	$blocks,$blocks,#4
	b.ne	.Lctr32_4_blocks_process
	b	100f
.Lctr32_8_blocks_process:
	dup	@datax[0].4s,$word0
	dup	@datax[1].4s,$word1
	dup	@datax[2].4s,$word2
	mov	@datax[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	$datax[3].s[1],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[2],$ctr
	add	$ctr,$ctr,#1
	mov	@datax[3].s[3],$ctr
	add	$ctr,$ctr,#1
	bl	_${prefix}_enc_8blks
	ld4	{@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
	ld4	{@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	eor	@data[0].16b,@data[0].16b,@datax[0].16b
	eor	@data[1].16b,@data[1].16b,@datax[1].16b
	eor	@data[2].16b,@data[2].16b,@datax[2].16b
	eor	@data[3].16b,@data[3].16b,@datax[3].16b
	st4	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st4	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.ne	.Lctr32_4_blocks_process
	b	100f
1:	// last block processing
	subs	$blocks,$blocks,#1
	b.lt	100f
	b.gt	1f
	mov	$ivec.s[0],$word0
	mov	$ivec.s[1],$word1
	mov	$ivec.s[2],$word2
	mov	$ivec.s[3],$ctr
___
	&encrypt_1blk($ivec);
$code.=<<___;
	ld1	{@data[0].4s},[$inp]
	eor	@data[0].16b,@data[0].16b,$ivec.16b
	st1	{@data[0].4s},[$outp]
	b	100f
1:	// last 2 blocks processing
	dup	@data[0].4s,$word0
	dup	@data[1].4s,$word1
	dup	@data[2].4s,$word2
	mov	@data[3].s[0],$ctr
	add	$ctr,$ctr,#1
	mov	@data[3].s[1],$ctr
	subs	$blocks,$blocks,#1
	b.ne	1f
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	b	100f
1:	// last 3 blocks processing
	add	$ctr,$ctr,#1
	mov	@data[3].s[2],$ctr
	bl	_${prefix}_enc_4blks
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
	ld4	{@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
	eor	@vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
	eor	@vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
	eor	@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
	eor	@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
	st4	{@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
	ldp	d10,d11,[sp,#16]
	ldp	d12,d13,[sp,#32]
	ldp	d14,d15,[sp,#48]
	ldp	x29,x30,[sp,#64]
	ldp	d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
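# Editorial note on the CTR path: only the last 32-bit word of the counter
# block is incremented per block (hence "ctr32").  $word0..$word2 and $ctr
# hold the four counter words; keystream blocks are generated 8 or 4 at a
# time and XORed into the input, with 1-3 block tails handled separately.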


{{{
my ($blocks,$len)=("x2","x2");
my $ivp=("x5");
my @twx=map("x$_",(12..27));
my ($rks1,$rks2)=("x26","x27");
my $lastBlk=("x26");
my $enc=("w28");
my $remain=("x29");

my @tweak=map("v$_",(16..23));
my $lastTweak=("v25");

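# gen_xts_cipher emits two XTS entry points from one template:
# ${prefix}_xts_encrypt_gb and ${prefix}_xts_encrypt.  The only difference
# is the "_gb" bit reversal applied around the tweak multiplication (see
# rbit and compute_tweak_vec above).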
sub gen_xts_cipher() {
	my $std = shift;
$code.=<<___;
.globl	${prefix}_xts_encrypt${std}
.type	${prefix}_xts_encrypt${std},%function
.align	5
${prefix}_xts_encrypt${std}:
	AARCH64_SIGN_LINK_REGISTER
	stp	x15, x16, [sp, #-0x10]!
	stp	x17, x18, [sp, #-0x10]!
	stp	x19, x20, [sp, #-0x10]!
	stp	x21, x22, [sp, #-0x10]!
	stp	x23, x24, [sp, #-0x10]!
	stp	x25, x26, [sp, #-0x10]!
	stp	x27, x28, [sp, #-0x10]!
	stp	x29, x30, [sp, #-0x10]!
	stp	d8, d9, [sp, #-0x10]!
	stp	d10, d11, [sp, #-0x10]!
	stp	d12, d13, [sp, #-0x10]!
	stp	d14, d15, [sp, #-0x10]!
	mov	$rks1,x3
	mov	$rks2,x4
	mov	$enc,w6
	ld1	{@tweak[0].4s}, [$ivp]
	mov	$rks,$rks2
___
	&load_sbox();
	&rev32(@tweak[0],@tweak[0]);
	&encrypt_1blk(@tweak[0]);
$code.=<<___;
	mov	$rks,$rks1
	and	$remain,$len,#0x0F
	// convert length into blocks
	lsr	$blocks,$len,4
	cmp	$blocks,#1
	b.lt .return${std}

	cmp $remain,0
	// If the encryption/decryption length is a multiple of 16,
	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	b.eq .xts_encrypt_blocks${std}

	// If the encryption/decryption length is not a multiple of 16,
	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
	subs $blocks,$blocks,#1
	b.eq .only_2blks_tweak${std}
.xts_encrypt_blocks${std}:
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rev32_armeb(@tweak[0],@tweak[0]);
	&mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
.Lxts_8_blocks_process${std}:
	cmp	$blocks,#8
___
	&mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
	&compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
	&mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
	&compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
	&mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
	&compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
	&mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
	&compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
	&mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
	&compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
	&mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
	&compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
	&mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
	&compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
	&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
	&compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
$code.=<<___;
	b.lt	.Lxts_4_blocks_process${std}
	ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
	ld1	{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
	&rbit(@tweak[4],@tweak[4],$std);
	&rbit(@tweak[5],@tweak[5],$std);
	&rbit(@tweak[6],@tweak[6],$std);
	&rbit(@tweak[7],@tweak[7],$std);
$code.=<<___;
	eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
	eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
	eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
	eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&rev32(@datax[0],@datax[0]);
	&rev32(@datax[1],@datax[1]);
	&rev32(@datax[2],@datax[2]);
	&rev32(@datax[3],@datax[3]);
	&transpose(@data,@vtmp);
	&transpose(@datax,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_8blks
___
	&transpose(@vtmp,@datax);
	&transpose(@data,@datax);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	eor @data[0].16b, @data[0].16b, @tweak[4].16b
	eor @data[1].16b, @data[1].16b, @tweak[5].16b
	eor @data[2].16b, @data[2].16b, @tweak[6].16b
	eor @data[3].16b, @data[3].16b, @tweak[7].16b

	// save the last tweak
	mov $lastTweak.16b,@tweak[7].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	st1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
	subs	$blocks,$blocks,#8
	b.gt	.Lxts_8_blocks_process${std}
	b	100f
.Lxts_4_blocks_process${std}:
	cmp	$blocks,#4
	b.lt	1f
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
	&rbit(@tweak[3],@tweak[3],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
	eor @data[3].16b, @data[3].16b, @tweak[3].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&rev32(@data[3],@data[3]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
	sub	$blocks,$blocks,#4
	mov @tweak[0].16b,@tweak[4].16b
	mov @tweak[1].16b,@tweak[5].16b
	mov @tweak[2].16b,@tweak[6].16b
	// save the last tweak
	mov $lastTweak.16b,@tweak[3].16b
1:
	// process last block
	cmp	$blocks,#1
	b.lt	100f
	b.gt	1f
	ld1	{@data[0].4s},[$inp],#16
___
	&rbit(@tweak[0],@tweak[0],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	st1	{@data[0].4s},[$outp],#16
	// save the last tweak
	mov $lastTweak.16b,@tweak[0].16b
	b	100f
1:  // process last 2 blocks
	cmp	$blocks,#2
	b.gt	1f
	ld1	{@data[0].4s,@data[1].4s},[$inp],#32
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
	// save the last tweak
	mov $lastTweak.16b,@tweak[1].16b
	b	100f
1:  // process last 3 blocks
	ld1	{@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
	&rbit(@tweak[0],@tweak[0],$std);
	&rbit(@tweak[1],@tweak[1],$std);
	&rbit(@tweak[2],@tweak[2],$std);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[0].16b
	eor @data[1].16b, @data[1].16b, @tweak[1].16b
	eor @data[2].16b, @data[2].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&rev32(@data[1],@data[1]);
	&rev32(@data[2],@data[2]);
	&transpose(@data,@vtmp);
$code.=<<___;
	bl	_${prefix}_enc_4blks
___
	&transpose(@vtmp,@data);
$code.=<<___;
	eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
	// save the last tweak
	mov $lastTweak.16b,@tweak[2].16b
100:
	cmp $remain,0
	b.eq .return${std}

// This branch calculates the last two tweaks,
// used when the encryption/decryption length is larger than 32
.last_2blks_tweak${std}:
___
	&rev32_armeb($lastTweak,$lastTweak);
	&compute_tweak_vec($lastTweak,@tweak[1],$std);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// This branch calculates the last two tweaks,
// used when the encryption/decryption length is exactly 32 and only two tweaks are needed
.only_2blks_tweak${std}:
	mov @tweak[1].16b,@tweak[0].16b
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&compute_tweak_vec(@tweak[1],@tweak[2],$std);
$code.=<<___;
	b .check_dec${std}


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec${std}:
	// encryption:1 decryption:0
	cmp $enc,1
	b.eq .process_last_2blks${std}
	mov @vtmp[0].16B,@tweak[1].16b
	mov @tweak[1].16B,@tweak[2].16b
	mov @tweak[2].16B,@vtmp[0].16b

.process_last_2blks${std}:
___
	&rev32_armeb(@tweak[1],@tweak[1]);
	&rev32_armeb(@tweak[2],@tweak[2]);
$code.=<<___;
	ld1	{@data[0].4s},[$inp],#16
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[1].16b
	st1	{@data[0].4s},[$outp],#16

	sub $lastBlk,$outp,16
	.loop${std}:
		subs $remain,$remain,1
		ldrb	$wtmp0,[$lastBlk,$remain]
		ldrb	$wtmp1,[$inp,$remain]
		strb	$wtmp1,[$lastBlk,$remain]
		strb	$wtmp0,[$outp,$remain]
	b.gt .loop${std}
	ld1		{@data[0].4s}, [$lastBlk]
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
___
	&rev32(@data[0],@data[0]);
	&encrypt_1blk(@data[0]);
$code.=<<___;
	eor @data[0].16b, @data[0].16b, @tweak[2].16b
	st1		{@data[0].4s}, [$lastBlk]
.return${std}:
	ldp		d14, d15, [sp], #0x10
	ldp		d12, d13, [sp], #0x10
	ldp		d10, d11, [sp], #0x10
	ldp		d8, d9, [sp], #0x10
	ldp		x29, x30, [sp], #0x10
	ldp		x27, x28, [sp], #0x10
	ldp		x25, x26, [sp], #0x10
	ldp		x23, x24, [sp], #0x10
	ldp		x21, x22, [sp], #0x10
	ldp		x19, x20, [sp], #0x10
	ldp		x17, x18, [sp], #0x10
	ldp		x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
___
} # end of gen_xts_cipher
&gen_xts_cipher("_gb");
&gen_xts_cipher("");
}}}

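# Editorial note on the XTS tail: when the length is not a multiple of 16
# the code performs ciphertext stealing; the penultimate block is processed
# with @tweak[1], the trailing partial bytes are swapped with the tail of
# that output block (the .loop above), and the rebuilt final block is
# processed with @tweak[2].  For decryption the two tweaks are swapped in
# .check_dec, as noted in the comments.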
########################################
open SELF,$0;
while(<SELF>) {
		next if (/^#!/);
		last if (!s/^#/\/\// and !/^$/);
		print;
}
close SELF;

foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
