#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# ChaCha20 for ARMv8 via SVE
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
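# Illustrative example (hypothetical call, not made below): &subs("x6","x6","x5")
# would be caught here and emitted verbatim as "\tsubs\tx6,x6,x5"; a trailing
# numeric argument, as in &mov("x7",1), gains a "#" prefix first.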

my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
my ($sve2flag) = ("x7");
my ($wctr, $xctr) = ("w8", "x8");
my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
my ($tmp,$tmpw) = ("x10", "w10");
my ($counter) = ("x11");
my @K=map("x$_",(12..15,19..22));
my @KL=map("w$_",(12..15,19..22));
my @mx=map("z$_",(0..15));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
my ($zctr) = ("z16");
my @xt=map("z$_",(17..24));
my @perm=map("z$_",(25..30));
my ($rot8) = ("z31");
my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
# In SVE mode only bak0~bak9 are usable as backup registers (the rest are
# needed as scratch); in SVE2 mode all 15 backup registers are used.
my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
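# Layout of the backup mapping above, as a reading aid (not consumed by the
# code): bak0-bak5 live in z25-z30 (@perm), bak6-bak9 in z21-z24, and
# bak10/bak11/bak13/bak14 in z17-z20, with bak15 in z31 ($rot8).  Slot 12
# needs no backup because the block counter is kept in $zctr (z16) throughout.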
my $debug_encoder=0;

sub SVE_ADD() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	add	@mx[$x].s,@mx[$x].s,@mx[$y].s
___
	if (@_) {
		&SVE_ADD(@_);
	}
}

sub SVE_EOR() {
	my $x = shift;
	my $y = shift;

$code.=<<___;
	eor	@mx[$x].d,@mx[$x].d,@mx[$y].d
___
	if (@_) {
		&SVE_EOR(@_);
	}
}

sub SVE_LSL() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	lsl	@xt[$x].s,@mx[$y].s,$bits
___
	if (@_) {
		&SVE_LSL($bits,$next,@_);
	}
}

sub SVE_LSR() {
	my $bits = shift;
	my $x = shift;

$code.=<<___;
	lsr	@mx[$x].s,@mx[$x].s,$bits
___
	if (@_) {
		&SVE_LSR($bits,@_);
	}
}

sub SVE_ORR() {
	my $x = shift;
	my $y = shift;
	my $next = $x + 1;

$code.=<<___;
	orr	@mx[$y].d,@mx[$y].d,@xt[$x].d
___
	if (@_) {
		&SVE_ORR($next,@_);
	}
}

sub SVE_REV16() {
	my $x = shift;

$code.=<<___;
	revh	@mx[$x].s,p0/m,@mx[$x].s
___
	if (@_) {
		&SVE_REV16(@_);
	}
}

sub SVE_ROT8() {
	my $x = shift;

$code.=<<___;
	tbl	@mx[$x].b,{@mx[$x].b},$rot8.b
___
	if (@_) {
		&SVE_ROT8(@_);
	}
}

sub SVE2_XAR() {
	my $bits = shift;
	my $x = shift;
	my $y = shift;
	my $rbits = 32-$bits;

$code.=<<___;
	xar	@mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
___
	if (@_) {
		&SVE2_XAR($bits,@_);
	}
}

sub SVE_QR_GROUP() {
	my $have_sve2 = shift;
	my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	if ($have_sve2 == 0) {
		&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
		&SVE_REV16($d0,$d1,$d2,$d3);
	} else {
		&SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	}

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	if ($have_sve2 == 0) {
		&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
		&SVE_LSL(12,0,$b0,$b1,$b2,$b3);
		&SVE_LSR(20,$b0,$b1,$b2,$b3);
		&SVE_ORR(0,$b0,$b1,$b2,$b3);
	} else {
		&SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	}

	&SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
	if ($have_sve2 == 0) {
		&SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
		&SVE_ROT8($d0,$d1,$d2,$d3);
	} else {
		&SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
	}

	&SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
	if ($have_sve2 == 0) {
		&SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
		&SVE_LSL(7,0,$b0,$b1,$b2,$b3);
		&SVE_LSR(25,$b0,$b1,$b2,$b3);
		&SVE_ORR(0,$b0,$b1,$b2,$b3);
	} else {
		&SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
	}
}
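# For reference, each of the four interleaved columns above is the standard
# ChaCha20 quarter round, per 32-bit lane:
#	a += b; d ^= a; d = rotl32(d,16);
#	c += d; b ^= c; b = rotl32(b,12);
#	a += b; d ^= a; d = rotl32(d,8);
#	c += d; b ^= c; b = rotl32(b,7);
# Without SVE2 the rotates are synthesized (revh for 16, lsl/lsr/orr for 12
# and 7, a tbl byte shuffle for 8); with SVE2, xar fuses the eor and the
# rotate, taking a right-rotate immediate of 32-$bits.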

sub SVE_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
1:
.align	5
___
	&SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	subs	$counter,$counter,1
	b.ne	1b
___
}

sub SVE2_INNER_BLOCK() {
$code.=<<___;
	mov	$counter,#10
1:
.align	5
___
	&SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
	&SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
$code.=<<___;
	subs	$counter,$counter,1
	b.ne	1b
___
}

sub load() {
	my $x0 = shift;
	my $x1 = shift;
	my $x2 = shift;
	my $x3 = shift;
	my $x4 = shift;
	my $x5 = shift;
	my $x6 = shift;
	my $x7 = shift;

$code.=<<___;
	ld1w	{$x0.s},p0/z,[$inp]
	ld1w	{$x1.s},p0/z,[$inp, #1, MUL VL]
	ld1w	{$x2.s},p0/z,[$inp, #2, MUL VL]
	ld1w	{$x3.s},p0/z,[$inp, #3, MUL VL]
	ld1w	{$x4.s},p0/z,[$inp, #4, MUL VL]
	ld1w	{$x5.s},p0/z,[$inp, #5, MUL VL]
	ld1w	{$x6.s},p0/z,[$inp, #6, MUL VL]
	ld1w	{$x7.s},p0/z,[$inp, #7, MUL VL]
	addvl	$inp,$inp,#8
___
}

sub store() {
	my $x0 = shift;
	my $x1 = shift;
	my $x2 = shift;
	my $x3 = shift;
	my $x4 = shift;
	my $x5 = shift;
	my $x6 = shift;
	my $x7 = shift;

$code.=<<___;
	st1w	{$x0.s},p0,[$outp]
	st1w	{$x1.s},p0,[$outp, #1, MUL VL]
	st1w	{$x2.s},p0,[$outp, #2, MUL VL]
	st1w	{$x3.s},p0,[$outp, #3, MUL VL]
	st1w	{$x4.s},p0,[$outp, #4, MUL VL]
	st1w	{$x5.s},p0,[$outp, #5, MUL VL]
	st1w	{$x6.s},p0,[$outp, #6, MUL VL]
	st1w	{$x7.s},p0,[$outp, #7, MUL VL]
	addvl	$outp,$outp,#8
___
}

sub transpose() {
	my $xa = shift;
	my $xb = shift;
	my $xc = shift;
	my $xd = shift;

$code.=<<___;
	zip1	$xt0.s,$xa.s,$xb.s
	zip2	$xt1.s,$xa.s,$xb.s
	zip1	$xt2.s,$xc.s,$xd.s
	zip2	$xt3.s,$xc.s,$xd.s
	zip1	$xa.d,$xt0.d,$xt2.d
	zip2	$xb.d,$xt0.d,$xt2.d
	zip1	$xc.d,$xt1.d,$xt3.d
	zip2	$xd.d,$xt1.d,$xt3.d
___
}
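# Worked 128-bit example of the transpose above: with rows (a0 a1 a2 a3),
# (b0 b1 b2 b3), (c0 c1 c2 c3), (d0 d1 d2 d3), the word-wide zips give
# (a0 b0 a1 b1), (a2 b2 a3 b3), (c0 d0 c1 d1), (c2 d2 c3 d3), and the
# doubleword zips then yield the columns (a0 b0 c0 d0) ... (a3 b3 c3 d3).
# With longer vectors the (vector-length-agnostic) zips apply the analogous
# whole-vector permutation.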

sub SVE_ADD_STATES() {
$code.=<<___;
	lsr	$tmp1,@K[5],#32
	dup	$xt0.s,@KL[5]
	dup	$xt1.s,$tmpw1
	add	@mx[0].s,@mx[0].s,$bak0.s
	add	@mx[1].s,@mx[1].s,$bak1.s
	add	@mx[2].s,@mx[2].s,$bak2.s
	add	@mx[3].s,@mx[3].s,$bak3.s
	add	@mx[4].s,@mx[4].s,$bak4.s
	add	@mx[5].s,@mx[5].s,$bak5.s
	add	@mx[6].s,@mx[6].s,$bak6.s
	add	@mx[7].s,@mx[7].s,$bak7.s
	add	@mx[8].s,@mx[8].s,$bak8.s
	add	@mx[9].s,@mx[9].s,$bak9.s
	lsr	$tmp0,@K[6],#32
	dup	$xt4.s,$tmpw0
	lsr	$tmp1,@K[7],#32
	dup	$xt5.s,@KL[7]
	dup	$xt6.s,$tmpw1
	add	@mx[10].s,@mx[10].s,$xt0.s
	add	@mx[11].s,@mx[11].s,$xt1.s
	add	@mx[12].s,@mx[12].s,$zctr.s
	add	@mx[13].s,@mx[13].s,$xt4.s
	add	@mx[14].s,@mx[14].s,$xt5.s
	add	@mx[15].s,@mx[15].s,$xt6.s
___
}

sub SVE2_ADD_STATES() {
$code.=<<___;
	add	@mx[0].s,@mx[0].s,$bak0.s
	add	@mx[1].s,@mx[1].s,$bak1.s
	add	@mx[2].s,@mx[2].s,$bak2.s
	add	@mx[3].s,@mx[3].s,$bak3.s
	add	@mx[4].s,@mx[4].s,$bak4.s
	add	@mx[5].s,@mx[5].s,$bak5.s
	add	@mx[6].s,@mx[6].s,$bak6.s
	add	@mx[7].s,@mx[7].s,$bak7.s
	add	@mx[8].s,@mx[8].s,$bak8.s
	add	@mx[9].s,@mx[9].s,$bak9.s
	add	@mx[10].s,@mx[10].s,$bak10.s
	add	@mx[11].s,@mx[11].s,$bak11.s
	add	@mx[12].s,@mx[12].s,$zctr.s
	add	@mx[13].s,@mx[13].s,$bak13.s
	add	@mx[14].s,@mx[14].s,$bak14.s
	add	@mx[15].s,@mx[15].s,$bak15.s
___
}

sub SVE_TRANSFORMS() {
	&transpose($xa0,$xb0,$xc0,$xd0);
	&transpose($xa1,$xb1,$xc1,$xd1);
	&transpose($xa2,$xb2,$xc2,$xd2);
	&transpose($xa3,$xb3,$xc3,$xd3);
	&transpose($xa0,$xa1,$xa2,$xa3);
	&transpose($xb0,$xb1,$xb2,$xb3);
	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
	eor	$xa0.d,$xa0.d,$xt0.d
	eor	$xa1.d,$xa1.d,$xt1.d
	eor	$xa2.d,$xa2.d,$xt2.d
	eor	$xa3.d,$xa3.d,$xt3.d
	eor	$xb0.d,$xb0.d,$xt4.d
	eor	$xb1.d,$xb1.d,$xt5.d
	eor	$xb2.d,$xb2.d,$xt6.d
	eor	$xb3.d,$xb3.d,$xt7.d
___
	&transpose($xc0,$xc1,$xc2,$xc3);
	&store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
	&transpose($xd0,$xd1,$xd2,$xd3);
	&load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
$code.=<<___;
	eor	$xc0.d,$xc0.d,$xt0.d
	eor	$xc1.d,$xc1.d,$xt1.d
	eor	$xc2.d,$xc2.d,$xt2.d
	eor	$xc3.d,$xc3.d,$xt3.d
	eor	$xd0.d,$xd0.d,$xt4.d
	eor	$xd1.d,$xd1.d,$xt5.d
	eor	$xd2.d,$xd2.d,$xt6.d
	eor	$xd3.d,$xd3.d,$xt7.d
___
	&store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
$code.=<<___;
	incw	$xctr, ALL, MUL #1
	incw	$zctr.s, ALL, MUL #1
___
}

sub SVE_LOAD_STATES() {
$code.=<<___;
	lsr	$tmp0,@K[0],#32
	dup	@mx[0].s,@KL[0]
	dup	$bak0.s,@KL[0]
	dup	@mx[1].s,$tmpw0
	dup	$bak1.s,$tmpw0
	lsr	$tmp1,@K[1],#32
	dup	@mx[2].s,@KL[1]
	dup	$bak2.s,@KL[1]
	dup	@mx[3].s,$tmpw1
	dup	$bak3.s,$tmpw1
	lsr	$tmp0,@K[2],#32
	dup	@mx[4].s,@KL[2]
	dup	$bak4.s,@KL[2]
	dup	@mx[5].s,$tmpw0
	dup	$bak5.s,$tmpw0
	lsr	$tmp1,@K[3],#32
	dup	@mx[6].s,@KL[3]
	dup	$bak6.s,@KL[3]
	dup	@mx[7].s,$tmpw1
	dup	$bak7.s,$tmpw1
	lsr	$tmp0,@K[4],#32
	dup	@mx[8].s,@KL[4]
	dup	$bak8.s,@KL[4]
	dup	@mx[9].s,$tmpw0
	dup	$bak9.s,$tmpw0
	lsr	$tmp1,@K[5],#32
	dup	@mx[10].s,@KL[5]
	dup	@mx[11].s,$tmpw1
	orr	@mx[12].d,$zctr.d,$zctr.d
	lsr	$tmp0,@K[6],#32
	dup	@mx[13].s,$tmpw0
	lsr	$tmp1,@K[7],#32
	dup	@mx[14].s,@KL[7]
	dup	@mx[15].s,$tmpw1
___
}

sub SVE2_LOAD_STATES() {
$code.=<<___;
	lsr	$tmp0,@K[0],#32
	dup	@mx[0].s,@KL[0]
	dup	$bak0.s,@KL[0]
	dup	@mx[1].s,$tmpw0
	dup	$bak1.s,$tmpw0
	lsr	$tmp1,@K[1],#32
	dup	@mx[2].s,@KL[1]
	dup	$bak2.s,@KL[1]
	dup	@mx[3].s,$tmpw1
	dup	$bak3.s,$tmpw1
	lsr	$tmp0,@K[2],#32
	dup	@mx[4].s,@KL[2]
	dup	$bak4.s,@KL[2]
	dup	@mx[5].s,$tmpw0
	dup	$bak5.s,$tmpw0
	lsr	$tmp1,@K[3],#32
	dup	@mx[6].s,@KL[3]
	dup	$bak6.s,@KL[3]
	dup	@mx[7].s,$tmpw1
	dup	$bak7.s,$tmpw1
	lsr	$tmp0,@K[4],#32
	dup	@mx[8].s,@KL[4]
	dup	$bak8.s,@KL[4]
	dup	@mx[9].s,$tmpw0
	dup	$bak9.s,$tmpw0
	lsr	$tmp1,@K[5],#32
	dup	@mx[10].s,@KL[5]
	dup	$bak10.s,@KL[5]
	dup	@mx[11].s,$tmpw1
	dup	$bak11.s,$tmpw1
	orr	@mx[12].d,$zctr.d,$zctr.d
	lsr	$tmp0,@K[6],#32
	dup	@mx[13].s,$tmpw0
	dup	$bak13.s,$tmpw0
	lsr	$tmp1,@K[7],#32
	dup	@mx[14].s,@KL[7]
	dup	$bak14.s,@KL[7]
	dup	@mx[15].s,$tmpw1
	dup	$bak15.s,$tmpw1
___
}

sub sve_handle_blocks() {
$code.=<<___;
	cbz	$sve2flag,.sve_inner
___
	&SVE2_LOAD_STATES();
	&SVE2_INNER_BLOCK();
	&SVE2_ADD_STATES();
$code.=<<___;
	b	.fini_inner
.sve_inner:
___
	&SVE_LOAD_STATES();
	&SVE_INNER_BLOCK();
	&SVE_ADD_STATES();
$code.=<<___;
.fini_inner:
___
	&SVE_TRANSFORMS();
}

sub chacha20_process() {
$code.=<<___;
.align	5
.Loop:
	cmp	$blocks,$veclen
	b.lt	.Lexit
___
	&sve_handle_blocks();
$code.=<<___;
	subs	$blocks,$blocks,$veclen
	b.gt	.Loop
.Lexit:
___
}
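# Each trip through .Loop consumes $veclen 64-byte ChaCha20 blocks (one block
# per 32-bit lane), so this path runs only while a full vector's worth of
# blocks remains; the epilogue recomputes the unprocessed byte count from the
# leftover $blocks so the tail can be finished elsewhere.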

{{{
$code.=<<___;
#include "arm_arch.h"

.arch   armv8-a

.extern	OPENSSL_armcap_P
.hidden	OPENSSL_armcap_P

.text
.align	5
.Lchacha20_consts:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lrot8:
	.word	0x02010003,0x04040404,0x02010003,0x04040404
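// The two words above seed the "index" instruction in the prologue: lane i
// of the table becomes 0x02010003 + i*0x04040404, a tbl byte-shuffle mask
// that rotates every 32-bit lane left by 8 bits (destination bytes taken
// from source bytes 3,0,1,2 of the same lane).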
.globl	ChaCha20_ctr32_sve
.type	ChaCha20_ctr32_sve,%function
.align	5
ChaCha20_ctr32_sve:
	AARCH64_VALID_CALL_TARGET
	cntw	$veclen, ALL, MUL #1
	lsr	$blocks,$len,#6
	cmp	$blocks,$veclen
	b.lt	.Lreturn
	mov	$sve2flag,0
	adrp	$tmp,OPENSSL_armcap_P
	ldr	$tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
	tst	$tmpw,#ARMV8_SVE2
	b.eq	1f
	mov	$sve2flag,1
	b	2f
1:
	cmp	$veclen,4
	b.le	.Lreturn
	adr	$tmp,.Lrot8
	ldp	$tmpw0,$tmpw1,[$tmp]
	index	$rot8.s,$tmpw0,$tmpw1
2:
	stp	d8,d9,[sp,-96]!
	stp	d10,d11,[sp,16]
	stp	d12,d13,[sp,32]
	stp	d14,d15,[sp,48]
	stp	x19,x20,[sp,64]
	stp	x21,x22,[sp,80]
	adr	$tmp,.Lchacha20_consts
	ldp	@K[0],@K[1],[$tmp]
	ldp	@K[2],@K[3],[$key]
	ldp	@K[4],@K[5],[$key, 16]
	ldp	@K[6],@K[7],[$ctr]
	ldr	$wctr,[$ctr]
	index	$zctr.s,$wctr,1
	ptrues	p0.s,ALL
#ifdef	__AARCH64EB__
	ror	@K[2],@K[2],#32
	ror	@K[3],@K[3],#32
	ror	@K[4],@K[4],#32
	ror	@K[5],@K[5],#32
	ror	@K[6],@K[6],#32
	ror	@K[7],@K[7],#32
#endif
___
	&chacha20_process();
$code.=<<___;
	ldp	d10,d11,[sp,16]
	ldp	d12,d13,[sp,32]
	ldp	d14,d15,[sp,48]
	ldp	x19,x20,[sp,64]
	ldp	x21,x22,[sp,80]
	ldp	d8,d9,[sp],96
	str	$wctr,[$ctr]
	and	$len,$len,#63
	add	$len,$len,$blocks,lsl #6
.Lreturn:
	ret
.size	ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
___

}}}

########################################
{
my  %opcode_unpred = (
	"movprfx"      => 0x0420BC00,
	"eor"          => 0x04a03000,
	"add"          => 0x04200000,
	"orr"          => 0x04603000,
	"lsl"          => 0x04209C00,
	"lsr"          => 0x04209400,
	"incw"         => 0x04B0C000,
	"xar"          => 0x04203400,
	"zip1"         => 0x05206000,
	"zip2"         => 0x05206400,
	"uzp1"         => 0x05206800,
	"uzp2"         => 0x05206C00,
	"index"        => 0x04204C00,
	"mov"          => 0x05203800,
	"dup"          => 0x05203800,
	"cntw"         => 0x04A0E000,
	"tbl"          => 0x05203000);

my  %opcode_imm_unpred = (
	"dup"          => 0x2538C000,
	"index"        => 0x04204400);

my %opcode_scalar_pred = (
	"mov"          => 0x0528A000,
	"cpy"          => 0x0528A000,
	"st4w"         => 0xE5606000,
	"st1w"         => 0xE5004000,
	"ld1w"         => 0xA5404000);

my %opcode_gather_pred = (
	"ld1w"         => 0x85204000);

my  %opcode_pred = (
	"eor"          => 0x04190000,
	"add"          => 0x04000000,
	"orr"          => 0x04180000,
	"whilelo"      => 0x25200C00,
	"whilelt"      => 0x25200400,
	"cntp"         => 0x25208000,
	"addvl"        => 0x04205000,
	"lsl"          => 0x04038000,
	"lsr"          => 0x04018000,
	"sel"          => 0x0520C000,
	"mov"          => 0x0520C000,
	"ptrue"        => 0x2518E000,
	"pfalse"       => 0x2518E400,
	"ptrues"       => 0x2519E000,
	"pnext"        => 0x2519C400,
	"ld4w"         => 0xA560E000,
	"st4w"         => 0xE570E000,
	"st1w"         => 0xE500E000,
	"ld1w"         => 0xA540A000,
	"ld1rw"        => 0x8540C000,
	"revh"         => 0x05258000);

my  %tsize = (
	'b'          => 0,
	'h'          => 1,
	's'          => 2,
	'd'          => 3);

my %sf = (
	"w"          => 0,
	"x"          => 1);

my %pattern = (
	"POW2"       => 0,
	"VL1"        => 1,
	"VL2"        => 2,
	"VL3"        => 3,
	"VL4"        => 4,
	"VL5"        => 5,
	"VL6"        => 6,
	"VL7"        => 7,
	"VL8"        => 8,
	"VL16"       => 9,
	"VL32"       => 10,
	"VL64"       => 11,
	"VL128"      => 12,
	"VL256"      => 13,
	"MUL4"       => 29,
	"MUL3"       => 30,
	"ALL"        => 31);

sub create_verifier {
	my $filename="./compile_sve.sh";

$scripts = <<___;
#! /bin/bash
set -e
CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}

[ -z "\$1" ] && exit 1
ARCH=`uname -p | xargs echo -n`

# gcc-10 or later is needed to compile SVE code
# change this according to your system during debugging
if [ \$ARCH == 'aarch64' ]; then
	CC=gcc-11
	OBJDUMP=objdump
else
	CC=\${CROSS_COMPILE}gcc
	OBJDUMP=\${CROSS_COMPILE}objdump
fi
TMPFILE=/tmp/\$\$
cat > \$TMPFILE.c << EOF
extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
{
	asm("\$@\\t\\n");
}
int main(int argc, char *argv[])
{
}
EOF
\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
rm \$TMPFILE.c \$TMPFILE.out
___
	open(FH, '>', $filename) or die $!;
	print FH $scripts;
	close(FH);
	system("chmod a+x ./compile_sve.sh");
}

sub compile_sve {
	return `./compile_sve.sh '@_'`
}
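# For example, &compile_sve("eor z0.d,z0.d,z1.d") runs the generated script,
# which assembles the instruction with a real toolchain and prints its hex
# encoding; verify_inst() below compares that reference against the
# hand-assembled word whenever $debug_encoder is enabled.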

sub verify_inst {
	my ($code,$inst)=@_;
	my $hexcode = (sprintf "%08x", $code);

	if ($debug_encoder == 1) {
		my $expect=&compile_sve($inst);
		if ($expect ne $hexcode) {
			return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
		}
	}
	return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
}

sub reg_code {
	my $code = shift;

	if ($code eq "zr") {
		return "31";
	}
	return $code;
}

sub encode_size_imm() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
}
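# Worked example: for "lsl z17.s,z4.s,#12" the element size is 32, so
# $tsize_imm = 32 + 12 = 44 = 0b101100; the bits above the low five ("1")
# land in tszh at bit 22 and the low five bits ("01100") in tszl:imm3 at
# bit 16.  lsr and xar encode 2*esize - shift instead, following the SVE
# shift-right immediate convention.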

sub encode_shift_pred() {
	my ($mnemonic, $isize, $const)=@_;
	my $esize = (8<<$tsize{$isize});
	my $tsize_imm = $esize + $const;

	if ($mnemonic eq "lsr") {
		$tsize_imm = 2*$esize - $const;
	}
	return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
}

sub sve_unpred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
					$inst)
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $regs=$3;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
					$inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
		} elsif ($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
		} else {
			my $encoded_size = 0;
			if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
				$encoded_size = ($tsize{$isize}<<22);
			}
			if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
				$1 == $regd) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
			} elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
				return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
			}
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
					$inst)
	}
	sprintf "%s // fail to parse", $inst;
}

sub sve_pred {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
		my $zt = $1;
		my $size = $tsize{$2};
		my $pg = $3;
		my $addr = $5;
		my $xn = 31;

		if ($addr =~ m/x([0-9]+)\s*/o) {
			$xn = $1;
		}

		if ($mnemonic =~m/ld1r[bhwd]/o) {
			$size = 0;
		}
		if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
			my $xs = ($2 eq "SXTW") ? 1 : 0;
			return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
		}
	} elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
		my $regd = $1;
		my $isize = $2;
		my $pg = $3;
		my $mod = $4;
		my $regs = $5;

		if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
			if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
				&& $regd == $1
				&& $mod eq 'm'
				&& ((8<<$tsize{$isize}) > $2)) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
			}
		} elsif($regs =~ m/[wx]([0-9]+)/o) {
			return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
		} elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
			if ($mnemonic eq "sel") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
			} elsif ($mnemonic eq "mov") {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
			} elsif (length $2 > 0) {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
			} else {
				return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
			}
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
		my $pg = $1;
		my $isize = $2;
		my $regs = $3;

		if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
		} elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
		} else {
			return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
		}
	} elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
	}

	sprintf "%s // fail to parse", $inst;
}

sub sve_other {
	my ($mnemonic,$arg)=@_;
	my $inst = (sprintf "%s %s", $mnemonic,$arg);

	if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
		return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
	} elsif ($mnemonic =~ /inc[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16), $inst);
		} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		} elsif ($arg =~ m/x([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
		}
	} elsif ($mnemonic =~ /cnt[bhdw]/) {
		if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
			return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
		}
	} elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
		return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
	} elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
		return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
	}
	sprintf "%s // fail to parse", $inst;
}
}

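# Replay this script's leading "#" comment block (the license header) into
# the generated file as "//" comments, stopping at the first line that is
# neither a comment nor blank.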
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

if ($debug_encoder == 1) {
	&create_verifier();
}

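# Final pass over the accumulated $code: evaluate any `...` blocks, then
# rewrite SVE/SVE2 mnemonics that stock assemblers may not accept into raw
# .inst words via the encoders above; everything else passes through
# arm-xlate.pl unchanged.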
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
	s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
	s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
	s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";