#! /usr/bin/env perl
# Copyright 2022-2023  The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
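
# Typical perlasm invocation: the flavour comes first and the output
# file last (file names here are illustrative only), e.g.
#   perl chacha-armv8-sve.pl linux64 chacha-armv8-sve.S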
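# Catch-all for mnemonics without a dedicated handler: the sub name
# becomes the instruction (underscores turned into dots) and a purely
# numeric final argument gets a '#' prefix, so a hypothetical call like
# &rev32("v0.16b","v0.16b") would emit "rev32 v0.16b,v0.16b".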
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

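# Register plan: @mx holds the 16 ChaCha state words as SVE vectors,
# one 64-byte block per 32-bit lane; @sx/@sxx are scalar mirrors of the
# same words, used when "mixin == 1" to compute one extra block in
# general-purpose registers alongside the vector blocks; @bak keeps
# copies of the initial state for the final state addition.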
my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen) = ("x5");
my ($counter) = ("x6");
my ($counter_w) = ("w6");
my @xx=(7..22);
my @sxx=map("x$_",@xx);
my @sx=map("w$_",@xx);
my @K=map("x$_",(23..30));
my @elem=(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
my @KL=map("w$_",(23..30));
my @mx=map("z$_",@elem);
my @vx=map("v$_",@elem);
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");
my @tt=(17..24);
my @xt=map("z$_",@tt);
my @vt=map("v$_",@tt);
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my @bak=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],@xt[4],@xt[5],@xt[6],@xt[7],@xt[0],@xt[1],$zctr,@xt[2],@xt[3],$rot8);
my $debug_encoder=0;

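# The SVE_* helpers below take a variadic list of state-word indices
# and recurse, emitting one vector instruction per pair plus, under
# ".if mixin == 1", the matching scalar instruction for the extra
# scalar block.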
sub SVE_ADD() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
	.if mixin == 1
		add	@sx[$x],@sx[$x],@sx[$y]
	.endif
___
	if (@_) {
		&SVE_ADD(@_);
	}
}

sub SVE_EOR() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
	.if mixin == 1
		eor	@sx[$x],@sx[$x],@sx[$y]
	.endif
___
	if (@_) {
		&SVE_EOR(@_);
	}
}

sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
	if (@_) {
		&SVE_LSL($bits,$next,@_);
	}
}

sub SVE_LSR() {
	my $bits = shift;
	my $x = shift;

$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
	.if mixin == 1
		ror	@sx[$x],@sx[$x],$bits
	.endif
___
	if (@_) {
		&SVE_LSR($bits,@_);
	}
}

sub SVE_ORR() {
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
	if (@_) {
		&SVE_ORR($next,@_);
	}
}

sub SVE_REV16() {
	my $x = shift;

$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
	.if mixin == 1
		ror	@sx[$x],@sx[$x],#16
	.endif
___
	if (@_) {
		&SVE_REV16(@_);
	}
}

sub SVE_ROT8() {
	my $x = shift;

$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
	.if mixin == 1
		ror	@sx[$x],@sx[$x],#24
	.endif
___
	if (@_) {
		&SVE_ROT8(@_);
	}
}

sub SVE2_XAR() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $rbits = 32-$bits;

$code.=<<___;
	.if mixin == 1
		eor	@sx[$x],@sx[$x],@sx[$y]
	.endif
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
	.if mixin == 1
		ror	@sx[$x],@sx[$x],$rbits
	.endif
___
	if (@_) {
		&SVE2_XAR($bits,@_);
	}
}

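# Four ChaCha20 quarter rounds in parallel.  Each quarter round is, on
# 32-bit words:
#	a += b; d ^= a; d <<<= 16;
#	c += d; b ^= c; b <<<= 12;
#	a += b; d ^= a; d <<<= 8;
#	c += d; b ^= c; b <<<= 7;
# SVE2's xar fuses the eor and the rotate (a right rotate by 32-n
# implements the left rotate by n).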
sub SVE2_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
}

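# Same quarter rounds without SVE2: plain SVE has no vector rotate, so
# <<<16 is revh (halfword swap within each word), <<<8 is a tbl byte
# shuffle through $rot8, and <<<12 / <<<7 are an lsl/lsr/orr triple.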
sub SVE_QR_GROUP() {
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_REV16($d0,$d1,$d2,$d3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(12,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(20,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	&SVE_ROT8($d0,$d1,$d2,$d3);

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	&SVE_LSL(7,0,$b0,$b1,$b2,$b3);
	&SVE_LSR(25,$b0,$b1,$b2,$b3);
	&SVE_ORR(0,$b0,$b1,$b2,$b3);
}

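# Ten iterations of a column round followed by a diagonal round give
# the standard 20 ChaCha20 rounds.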
sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
10:
.align	5
___
	&SVE2_QR_GROUP(0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE2_QR_GROUP(0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	sub	$counter,$counter,1
	cbnz	$counter,10b
___
}

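# Vector-length-agnostic block I/O: ld1w/st1w move one full vector per
# register and addvl advances the pointer by the consumed vector
# lengths; on big-endian, revb restores 32-bit word order.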
sub load_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
	ld1w	{$reg.s},p0/z,[$inp,#$offset,MUL VL]
#ifdef  __AARCH64EB__
	revb    $reg.s,p0/m,$reg.s
#endif
___
	if (@_) {
		&load_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$inp,$inp,$next_offset
___
	}
}

sub load() {
	if (@_) {
		&load_regs(0, @_);
	}
}

sub store_regs() {
	my $offset = shift;
	my $reg = shift;
	my $next_offset = $offset + 1;
$code.=<<___;
#ifdef  __AARCH64EB__
	revb	$reg.s,p0/m,$reg.s
#endif
	st1w	{$reg.s},p0,[$outp,#$offset,MUL VL]
___
	if (@_) {
		&store_regs($next_offset, @_);
	} else {
$code.=<<___;
	addvl	$outp,$outp,$next_offset
___
	}
}

sub store() {
	if (@_) {
		&store_regs(0, @_);
	}
}

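# Two zip stages (32-bit, then 64-bit interleave) transpose a group of
# four state vectors from word-sliced order (one state word across all
# blocks) towards block-contiguous order, ready to be xored against
# the input stream.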
sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;
	my $xa1 = shift;
	my $xb1 = shift;
	my $xc1 = shift;
	my $xd1 = shift;
$code.=<<___;
	zip1	@xt[0].s,$xa.s,$xb.s
	zip2	@xt[1].s,$xa.s,$xb.s
	zip1	@xt[2].s,$xc.s,$xd.s
	zip2	@xt[3].s,$xc.s,$xd.s

	zip1	@xt[4].s,$xa1.s,$xb1.s
	zip2	@xt[5].s,$xa1.s,$xb1.s
	zip1	@xt[6].s,$xc1.s,$xd1.s
	zip2	@xt[7].s,$xc1.s,$xd1.s

	zip1	$xa.d,@xt[0].d,@xt[2].d
	zip2	$xb.d,@xt[0].d,@xt[2].d
	zip1	$xc.d,@xt[1].d,@xt[3].d
	zip2	$xd.d,@xt[1].d,@xt[3].d

	zip1	$xa1.d,@xt[4].d,@xt[6].d
	zip2	$xb1.d,@xt[4].d,@xt[6].d
	zip1	$xc1.d,@xt[5].d,@xt[7].d
	zip2	$xd1.d,@xt[5].d,@xt[7].d
___
}

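# Add the saved initial state back into a pair of state words.  On the
# scalar (mixin) path the odd word is packed into the upper half of the
# even 64-bit register so that results can later be stored with stp.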
sub ACCUM() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;
	my ($tmp,$tmpw) = ($counter,$counter_w);
	my $bk0 = @_ ? shift : @bak[$idx0];
	my $bk1 = @_ ? shift : @bak[$idx1];

$code.=<<___;
	.if mixin == 1
		add	@sx[$idx0],@sx[$idx0],@KL[$d]
	.endif
	add	@mx[$idx0].s,@mx[$idx0].s,$bk0.s
	.if mixin == 1
		add	@sxx[$idx1],@sxx[$idx1],@K[$d],lsr #32
	.endif
	add	@mx[$idx1].s,@mx[$idx1].s,$bk1.s
	.if mixin == 1
		add	@sxx[$idx0],@sxx[$idx0],$sxx[$idx1],lsl #32  // pack
	.endif
___
}

sub SCA_INP() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 2;
$code.=<<___;
	.if mixin == 1
		ldp	@sxx[$idx0],@sxx[$idx1],[$inp],#16
	.endif
___
}

sub SVE_ACCUM_STATES() {
	my ($tmp,$tmpw) = ($counter,$counter_w);

$code.=<<___;
	lsr	$tmp,@K[5],#32
	dup	@bak[10].s,@KL[5]
	dup	@bak[11].s,$tmpw
	lsr	$tmp,@K[6],#32
	dup	@bak[13].s,$tmpw
	lsr	$tmp,@K[7],#32
___
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
$code.=<<___;
	dup	@bak[14].s,@KL[7]
	dup	@bak[0].s,$tmpw	// bak[15] not available for SVE
___
	&ACCUM(12);
	&ACCUM(14, @bak[14],@bak[0]);
	&SCA_INP(13);
}

sub SVE2_ACCUM_STATES() {
	&ACCUM(0);
	&ACCUM(2);
	&SCA_INP(1);
	&ACCUM(4);
	&ACCUM(6);
	&SCA_INP(5);
	&ACCUM(8);
	&ACCUM(10);
	&SCA_INP(9);
	&ACCUM(12);
	&ACCUM(14);
	&SCA_INP(13);
}

sub SCA_EOR() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
$code.=<<___;
	.if mixin == 1
		eor	@sxx[$idx0],@sxx[$idx0],@sxx[$idx1]
	.endif
___
}

sub SCA_SAVE() {
	my $idx0 = shift;
	my $idx1 = shift;
$code.=<<___;
	.if mixin == 1
		stp	@sxx[$idx0],@sxx[$idx1],[$outp],#16
	.endif
___
}

sub SVE_VL128_TRANSFORMS() {
	&SCA_EOR(0);
	&SCA_EOR(2);
	&SCA_EOR(4);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(6);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
$code.=<<___;
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xb0.d,$xb0.d,@xt[1].d
	eor	$xc0.d,$xc0.d,@xt[2].d
	eor	$xd0.d,$xd0.d,@xt[3].d
	eor	$xa1.d,$xa1.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xc1.d,$xc1.d,@xt[6].d
	eor	$xd1.d,$xd1.d,@xt[7].d
	ld1	{@vt[0].4s-@vt[3].4s},[$inp],#64
	ld1	{@vt[4].4s-@vt[7].4s},[$inp],#64
___
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xa2.d,$xa2.d,@xt[0].d
	eor	$xb2.d,$xb2.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xd2.d,$xd2.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xa3.d,$xa3.d,@xt[4].d
	eor	$xb3.d,$xb3.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xc3.d,$xc3.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
	st1	{@vx[0].4s-@vx[12].4s},[$outp],#64
	st1	{@vx[1].4s-@vx[13].4s},[$outp],#64
	st1	{@vx[2].4s-@vx[14].4s},[$outp],#64
	st1	{@vx[3].4s-@vx[15].4s},[$outp],#64
___
}

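# Combine keystream with input: transpose the state to block order,
# xor it with the input stream (vector blocks, plus the packed scalar
# words when mixin == 1), store, and finally advance the 32-bit block
# counter in @K[6] with incw, i.e. by the number of blocks produced
# per pass.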
sub SVE_TRANSFORMS() {
$code.=<<___;
#ifdef	__AARCH64EB__
	rev	@sxx[0],@sxx[0]
	rev	@sxx[2],@sxx[2]
	rev	@sxx[4],@sxx[4]
	rev	@sxx[6],@sxx[6]
	rev	@sxx[8],@sxx[8]
	rev	@sxx[10],@sxx[10]
	rev	@sxx[12],@sxx[12]
	rev	@sxx[14],@sxx[14]
#endif
	.if mixin == 1
		add	@K[6],@K[6],#1
	.endif
	cmp	$veclen,4
	b.ne	200f
___
	&SVE_VL128_TRANSFORMS();
$code.=<<___;
	b	210f
200:
___
	&transpose($xa0,$xb0,$xc0,$xd0,$xa1,$xb1,$xc1,$xd1);
	&SCA_EOR(0);
	&SCA_EOR(2);
	&transpose($xa2,$xb2,$xc2,$xd2,$xa3,$xb3,$xc3,$xd3);
	&SCA_EOR(4);
	&SCA_EOR(6);
	&transpose($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&SCA_EOR(8);
	&SCA_EOR(10);
	&transpose($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
	&SCA_EOR(12);
	&SCA_EOR(14);
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
$code.=<<___;
	eor	$xa0.d,$xa0.d,@xt[0].d
	eor	$xa1.d,$xa1.d,@xt[1].d
	eor	$xa2.d,$xa2.d,@xt[2].d
	eor	$xa3.d,$xa3.d,@xt[3].d
	eor	$xb0.d,$xb0.d,@xt[4].d
	eor	$xb1.d,$xb1.d,@xt[5].d
	eor	$xb2.d,$xb2.d,@xt[6].d
	eor	$xb3.d,$xb3.d,@xt[7].d
___
	&load(@xt[0],@xt[1],@xt[2],@xt[3],@xt[4],@xt[5],@xt[6],@xt[7]);
	&SCA_SAVE(0,2);
$code.=<<___;
	eor	$xc0.d,$xc0.d,@xt[0].d
	eor	$xc1.d,$xc1.d,@xt[1].d
___
	&SCA_SAVE(4,6);
$code.=<<___;
	eor	$xc2.d,$xc2.d,@xt[2].d
	eor	$xc3.d,$xc3.d,@xt[3].d
___
	&SCA_SAVE(8,10);
$code.=<<___;
	eor	$xd0.d,$xd0.d,@xt[4].d
	eor	$xd1.d,$xd1.d,@xt[5].d
___
	&SCA_SAVE(12,14);
$code.=<<___;
	eor	$xd2.d,$xd2.d,@xt[6].d
	eor	$xd3.d,$xd3.d,@xt[7].d
___
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
210:
	incw	@K[6], ALL, MUL #1
___
}

sub SET_STATE_BAK() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	dup	@bak[$idx0].s,@KL[$d]
	.if mixin == 1
		mov	$x0,@KL[$d]
	.endif
	dup	@mx[$idx1].s,$x1
	dup	@bak[$idx1].s,$x1
___
}

sub SET_STATE() {
	my $idx0 = shift;
	my $idx1 = $idx0 + 1;
	my $x0 = @sx[$idx0];
	my $xx0 = @sxx[$idx0];
	my $x1 = @sx[$idx1];
	my $xx1 = @sxx[$idx1];
	my $d = $idx0/2;

$code.=<<___;
	lsr	$xx1,@K[$d],#32
	dup	@mx[$idx0].s,@KL[$d]
	.if mixin == 1
		mov	$x0,@KL[$d]
	.endif
	dup	@mx[$idx1].s,$x1
___
}

sub SVE_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE(10);
	&SET_STATE(14);
$code.=<<___;
	.if mixin == 1
		add	@sx[13],@KL[6],#1
		mov	@sx[12],@KL[6]
		index	$zctr.s,@sx[13],1
		index	@mx[12].s,@sx[13],1
	.else
		index	$zctr.s,@KL[6],1
		index	@mx[12].s,@KL[6],1
	.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
___
}

sub SVE2_LOAD_STATES() {
	&SET_STATE_BAK(0);
	&SET_STATE_BAK(2);
	&SET_STATE_BAK(4);
	&SET_STATE_BAK(6);
	&SET_STATE_BAK(8);
	&SET_STATE_BAK(10);
	&SET_STATE_BAK(14);

$code.=<<___;
	.if mixin == 1
		add	@sx[13],@KL[6],#1
		mov	@sx[12],@KL[6]
		index	$zctr.s,@sx[13],1
		index	@mx[12].s,@sx[13],1
	.else
		index	$zctr.s,@KL[6],1
		index	@mx[12].s,@KL[6],1
	.endif
	lsr	@sxx[13],@K[6],#32
	dup	@mx[13].s,@sx[13]
	dup	@bak[13].s,@sx[13]
___
}

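# Driver loop: each pass produces $veclen 64-byte blocks in vector
# registers; when at least one further whole block remains, the
# "mixin=1" variant computes an extra block in scalar registers at the
# same time, otherwise the "mixin=0" variant handles vector blocks
# only.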
sub chacha20_sve() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}

sub chacha20_sve2() {
	my ($tmp) = (@sxx[0]);

$code.=<<___;
.align	5
100:
	subs	$tmp,$len,$veclen,lsl #6
	b.lt	110f
	mov	$len,$tmp
	b.eq	101f
	cmp	$len,64
	b.lt	101f
	mixin=1
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
	subs	$len,$len,64
	b.gt	100b
	b	110f
101:
	mixin=0
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ACCUM_STATES();
	&SVE_TRANSFORMS();
$code.=<<___;
110:
___
}


{{{
	my ($tmp,$tmpw) = ("x6", "w6");
	my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
	my ($sve2flag) = ("x7");

$code.=<<___;
#include "arm_arch.h"

.arch   armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text
.align	5
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
	.word 0x02010003,0x04040404,0x02010003,0x04040404
.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	cmp	$len,$veclen,lsl #6
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adr	$tmp,.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	AARCH64_SIGN_LINK_REGISTER
	stp	d8,d9,[sp,-192]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x16,x17,[sp,64]
	stp	x18,x19,[sp,80]
	stp	x20,x21,[sp,96]
	stp	x22,x23,[sp,112]
	stp	x24,x25,[sp,128]
	stp	x26,x27,[sp,144]
	stp	x28,x29,[sp,160]
	str	x30,[sp,176]

	adr	$tmp,.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ptrues	p0.s,ALL
#ifdef	__AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
	cbz	$sve2flag, 1f
___
	&chacha20_sve2();
$code.=<<___;
	b	2f
1:
___
	&chacha20_sve();
$code.=<<___;
2:
	str	@KL[6],[$ctr]
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x16,x17,[sp,64]
	ldp	x18,x19,[sp,80]
	ldp	x20,x21,[sp,96]
	ldp	x22,x23,[sp,112]
	ldp	x24,x25,[sp,128]
	ldp	x26,x27,[sp,144]
	ldp	x28,x29,[sp,160]
	ldr	x30,[sp,176]
	ldp	d8,d9,[sp],192
	AARCH64_VALIDATE_LINK_REGISTER
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}

########################################
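# What follows is a small hand assembler: the SVE/SVE2 mnemonics used
# above are matched by the regexps at the bottom of the file and
# encoded as raw .inst words from the opcode tables below, so the
# generated source still assembles on toolchains without SVE support.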
{
my  %opcode_unpred = (
	"movprfx"      => 0x0420BC00,
	"eor"          => 0x04a03000,
	"add"          => 0x04200000,
	"orr"          => 0x04603000,
	"lsl"          => 0x04209C00,
	"lsr"          => 0x04209400,
	"incw"         => 0x04B00000,
	"xar"          => 0x04203400,
	"zip1"         => 0x05206000,
	"zip2"         => 0x05206400,
	"uzp1"         => 0x05206800,
	"uzp2"         => 0x05206C00,
	"index"        => 0x04204C00,
	"mov"          => 0x05203800,
	"dup"          => 0x05203800,
	"cntw"         => 0x04A0E000,
	"tbl"          => 0x05203000);

my  %opcode_imm_unpred = (
	"dup"          => 0x2538C000,
	"index"        => 0x04204400);

my %opcode_scalar_pred = (
	"mov"          => 0x0528A000,
	"cpy"          => 0x0528A000,
	"st4w"         => 0xE5606000,
	"st1w"         => 0xE5004000,
	"ld1w"         => 0xA5404000);

my %opcode_gather_pred = (
	"ld1w"         => 0x85204000);

my  %opcode_pred = (
	"eor"          => 0x04190000,
	"add"          => 0x04000000,
	"orr"          => 0x04180000,
	"whilelo"      => 0x25200C00,
	"whilelt"      => 0x25200400,
	"cntp"         => 0x25208000,
	"addvl"        => 0x04205000,
	"lsl"          => 0x04038000,
	"lsr"          => 0x04018000,
	"sel"          => 0x0520C000,
	"mov"          => 0x0520C000,
	"ptrue"        => 0x2518E000,
	"pfalse"       => 0x2518E400,
	"ptrues"       => 0x2519E000,
	"pnext"        => 0x2519C400,
	"ld4w"         => 0xA560E000,
	"st4w"         => 0xE570E000,
	"st1w"         => 0xE500E000,
	"ld1w"         => 0xA540A000,
	"ld1rw"        => 0x8540C000,
	"lasta"        => 0x0520A000,
	"revh"         => 0x05258000,
	"revb"         => 0x05248000);

my  %tsize = (
	'b'          => 0,
	'h'          => 1,
	's'          => 2,
	'd'          => 3);

my %sf = (
	"w"          => 0,
	"x"          => 1);

my %pattern = (
	"POW2"       => 0,
	"VL1"        => 1,
	"VL2"        => 2,
	"VL3"        => 3,
	"VL4"        => 4,
	"VL5"        => 5,
	"VL6"        => 6,
	"VL7"        => 7,
	"VL8"        => 8,
	"VL16"       => 9,
	"VL32"       => 10,
	"VL64"       => 11,
	"VL128"      => 12,
	"VL256"      => 13,
	"MUL4"       => 29,
	"MUL3"       => 30,
	"ALL"        => 31);

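# Debug-only helper: writes compile_sve.sh, which assembles a single
# instruction with a real toolchain and prints its encoding, letting
# verify_inst cross-check the hand encoder when $debug_encoder is set.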
sub create_verifier {
	my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# need gcc-10 and above to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	open(FH, '>', $filename) or die $!;
	print FH $scripts;
	close(FH);
	system("chmod a+x ./compile_sve.sh");
}

sub compile_sve {
	return `./compile_sve.sh '@_'`
}

sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

sub reg_code {
	my $code = shift;

	if ($code eq "zr") {
		return "31";
	}
	return $code;
}

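# SVE encodes a shift amount together with the element size in one
# tsz:imm field: esize + shift for left shifts, 2*esize - shift for
# right shifts and rotates (xar); e.g. "lsr z0.s,z0.s,20" encodes
# 2*32 - 20 = 44.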
sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}

sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}

sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
					$inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
					$inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			my $encoded_size = 0;
			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
				$1 == $regd) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
					$inst)
	}
	sprintf "%s // fail to parse", $inst;
}

sub sve_pred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& $regd == $1
				&& $mod eq 'm'
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}

	sprintf "%s // fail to parse", $inst;
}

sub sve_other {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($arg =~ m/(x|w)([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*z([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$5}<<22)|$1|($3<<10)|($4<<5)|$2, $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16)|0xE000, $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16)|0xC000, $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16)|0xE000, $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
}

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

if ($debug_encoder == 1) {
	&create_verifier();
}

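# Final pass: run every generated line through the SVE operand
# patterns, most specific first, replacing matches with their .inst
# encodings; anything that does not match is passed through unchanged
# for the regular assembler.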
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	s/\b(movprfx|lasta|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z|w).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";