#! /usr/bin/env perl
# Author: Min Zhou <zhoumin@loongson.cn>
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $code;

# Here is the scalar register layout for LoongArch.
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$r$_",(23..31));

# The saved floating-point registers in the LP64D ABI.  In LoongArch
# with vector extension, the low 64 bits of a vector register alias with
# the corresponding FPR.  So we must save and restore the corresponding
# FPR if we'll write into a vector register.  The ABI only requires
# saving and restoring the FPR (i.e. 64 bits of the corresponding vector
# register), not the entire vector register.
my ($fs0,$fs1,$fs2,$fs3,$fs4,$fs5,$fs6,$fs7)=map("\$f$_",(24..31));

# Here is the 128-bit vector register layout for LSX extension.
my ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,
    $vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19,
    $vr20,$vr21,$vr22,$vr23,$vr24,$vr25,$vr26,$vr27,$vr28,
    $vr29,$vr30,$vr31)=map("\$vr$_",(0..31));

# Here is the 256-bit vector register layout for LASX extension.
my ($xr0,$xr1,$xr2,$xr3,$xr4,$xr5,$xr6,$xr7,$xr8,$xr9,$xr10,
    $xr11,$xr12,$xr13,$xr14,$xr15,$xr16,$xr17,$xr18,$xr19,
    $xr20,$xr21,$xr22,$xr23,$xr24,$xr25,$xr26,$xr27,$xr28,
    $xr29,$xr30,$xr31)=map("\$xr$_",(0..31));

my $output;
for (@ARGV) {	$output=$_ if (/\w[\w\-]*\.\w+$/);	}
open STDOUT,">$output" or die "can't open $output: $!";

# Input parameter block
my ($out, $inp, $len, $key, $counter) = ($a0, $a1, $a2, $a3, $a4);
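# The C prototype this routine is expected to match (as declared in
# OpenSSL's chacha.h) is roughly:
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);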

$code .= <<EOF;
#include "loongarch_arch.h"

.text

.extern OPENSSL_loongarch_hwcap_P

.align 6
.Lsigma:
.ascii	"expand 32-byte k"
.Linc8x:
.long	0,1,2,3,4,5,6,7
.Linc4x:
.long	0,1,2,3

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32 function

.align 6
ChaCha20_ctr32:
	# $a0 = arg #1 (out pointer)
	# $a1 = arg #2 (inp pointer)
	# $a2 = arg #3 (len)
	# $a3 = arg #4 (key array)
	# $a4 = arg #5 (counter array)

	beqz		$len,.Lno_data
	ori			$t3,$zero,64
	la.pcrel	$t0,OPENSSL_loongarch_hwcap_P
	ld.w		$t0,$t0,0

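	# Dispatch: lengths of at most 64 bytes always take the scalar 1x path;
	# otherwise prefer the 256-bit LASX 8x path when available, then the
	# 128-bit LSX 4x path, falling back to 1x when neither is present.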
	bleu		$len,$t3,.LChaCha20_1x  # goto 1x when len <= 64

	andi		$t0,$t0,LOONGARCH_HWCAP_LASX | LOONGARCH_HWCAP_LSX
	beqz		$t0,.LChaCha20_1x

	addi.d		$sp,$sp,-64
	fst.d		$fs0,$sp,0
	fst.d		$fs1,$sp,8
	fst.d		$fs2,$sp,16
	fst.d		$fs3,$sp,24
	fst.d		$fs4,$sp,32
	fst.d		$fs5,$sp,40
	fst.d		$fs6,$sp,48
	fst.d		$fs7,$sp,56

	andi		$t1,$t0,LOONGARCH_HWCAP_LASX
	bnez		$t1,.LChaCha20_8x

	b		.LChaCha20_4x

EOF

########################################################################
# Scalar code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly
my @x = ($t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7,
         $s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7);

sub ROUND {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
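	# Rotating each index forward by one within its group of four yields
	# the remaining quarter-rounds, so ROUND(0, 4, 8, 12) emits the four
	# column rounds and ROUND(0, 5, 10, 15) the four diagonal rounds.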

$code .= <<EOF;
	add.w		@x[$a0],@x[$a0],@x[$b0]
	xor			@x[$d0],@x[$d0],@x[$a0]
	rotri.w		@x[$d0],@x[$d0],16      # rotate left 16 bits
	add.w		@x[$a1],@x[$a1],@x[$b1]
	xor			@x[$d1],@x[$d1],@x[$a1]
	rotri.w		@x[$d1],@x[$d1],16

	add.w		@x[$c0],@x[$c0],@x[$d0]
	xor			@x[$b0],@x[$b0],@x[$c0]
	rotri.w		@x[$b0],@x[$b0],20      # rotate left 12 bits
	add.w		@x[$c1],@x[$c1],@x[$d1]
	xor			@x[$b1],@x[$b1],@x[$c1]
	rotri.w		@x[$b1],@x[$b1],20

	add.w		@x[$a0],@x[$a0],@x[$b0]
	xor			@x[$d0],@x[$d0],@x[$a0]
	rotri.w		@x[$d0],@x[$d0],24      # rotate left 8 bits
	add.w		@x[$a1],@x[$a1],@x[$b1]
	xor			@x[$d1],@x[$d1],@x[$a1]
	rotri.w		@x[$d1],@x[$d1],24

	add.w		@x[$c0],@x[$c0],@x[$d0]
	xor			@x[$b0],@x[$b0],@x[$c0]
	rotri.w		@x[$b0],@x[$b0],25      # rotate left 7 bits
	add.w		@x[$c1],@x[$c1],@x[$d1]
	xor			@x[$b1],@x[$b1],@x[$c1]
	rotri.w		@x[$b1],@x[$b1],25

	add.w		@x[$a2],@x[$a2],@x[$b2]
	xor			@x[$d2],@x[$d2],@x[$a2]
	rotri.w		@x[$d2],@x[$d2],16
	add.w		@x[$a3],@x[$a3],@x[$b3]
	xor			@x[$d3],@x[$d3],@x[$a3]
	rotri.w		@x[$d3],@x[$d3],16

	add.w		@x[$c2],@x[$c2],@x[$d2]
	xor			@x[$b2],@x[$b2],@x[$c2]
	rotri.w		@x[$b2],@x[$b2],20
	add.w		@x[$c3],@x[$c3],@x[$d3]
	xor			@x[$b3],@x[$b3],@x[$c3]
	rotri.w		@x[$b3],@x[$b3],20

	add.w		@x[$a2],@x[$a2],@x[$b2]
	xor			@x[$d2],@x[$d2],@x[$a2]
	rotri.w		@x[$d2],@x[$d2],24
	add.w		@x[$a3],@x[$a3],@x[$b3]
	xor			@x[$d3],@x[$d3],@x[$a3]
	rotri.w		@x[$d3],@x[$d3],24

	add.w		@x[$c2],@x[$c2],@x[$d2]
	xor			@x[$b2],@x[$b2],@x[$c2]
	rotri.w		@x[$b2],@x[$b2],25
	add.w		@x[$c3],@x[$c3],@x[$d3]
	xor			@x[$b3],@x[$b3],@x[$c3]
	rotri.w		@x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_1x:
	addi.d		$sp,$sp,-256
	st.d		$s0,$sp,0
	st.d		$s1,$sp,8
	st.d		$s2,$sp,16
	st.d		$s3,$sp,24
	st.d		$s4,$sp,32
	st.d		$s5,$sp,40
	st.d		$s6,$sp,48
	st.d		$s7,$sp,56
	st.d		$s8,$sp,64

	# Save the initial block counter in $s8
	ld.w		$s8,$counter,0
	b			.Loop_outer_1x

.align 5
.Loop_outer_1x:
	# Load constants
	la.local	$t8,.Lsigma
	ld.w		@x[0],$t8,0		  # 'expa'
	ld.w		@x[1],$t8,4		  # 'nd 3'
	ld.w		@x[2],$t8,8		  # '2-by'
	ld.w		@x[3],$t8,12	  # 'te k'

	# Load key
	ld.w		@x[4],$key,4*0
	ld.w		@x[5],$key,4*1
	ld.w		@x[6],$key,4*2
	ld.w		@x[7],$key,4*3
	ld.w		@x[8],$key,4*4
	ld.w		@x[9],$key,4*5
	ld.w		@x[10],$key,4*6
	ld.w		@x[11],$key,4*7

	# Load block counter
	move		@x[12],$s8

	# Load nonce
	ld.w		@x[13],$counter,4*1
	ld.w		@x[14],$counter,4*2
	ld.w		@x[15],$counter,4*3

	# Update states in \@x[*] for 20 rounds
	ori			$t8,$zero,10
	b			.Loop_1x

.align 5
.Loop_1x:
EOF

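# Column quarter-rounds followed by diagonal quarter-rounds: one ChaCha
# double round per iteration of .Loop_1x.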
&ROUND (0, 4, 8, 12);
&ROUND (0, 5, 10, 15);

$code .= <<EOF;
	addi.w		$t8,$t8,-1
	bnez		$t8,.Loop_1x

	# Get the final states by adding the initial states
	la.local	$t8,.Lsigma
	ld.w		$a7,$t8,4*0
	ld.w		$a6,$t8,4*1
	ld.w		$a5,$t8,4*2
	add.w		@x[0],@x[0],$a7
	add.w		@x[1],@x[1],$a6
	add.w		@x[2],@x[2],$a5
	ld.w		$a7,$t8,4*3
	add.w		@x[3],@x[3],$a7

	ld.w		$t8,$key,4*0
	ld.w		$a7,$key,4*1
	ld.w		$a6,$key,4*2
	ld.w		$a5,$key,4*3
	add.w		@x[4],@x[4],$t8
	add.w		@x[5],@x[5],$a7
	add.w		@x[6],@x[6],$a6
	add.w		@x[7],@x[7],$a5

	ld.w		$t8,$key,4*4
	ld.w		$a7,$key,4*5
	ld.w		$a6,$key,4*6
	ld.w		$a5,$key,4*7
	add.w		@x[8],@x[8],$t8
	add.w		@x[9],@x[9],$a7
	add.w		@x[10],@x[10],$a6
	add.w		@x[11],@x[11],$a5

	add.w		@x[12],@x[12],$s8

	ld.w		$t8,$counter,4*1
	ld.w		$a7,$counter,4*2
	ld.w		$a6,$counter,4*3
	add.w		@x[13],@x[13],$t8
	add.w		@x[14],@x[14],$a7
	add.w		@x[15],@x[15],$a6

	ori			$t8,$zero,64
	bltu		$len,$t8,.Ltail_1x

	# Get the encrypted message by xoring the states with the plaintext
	ld.w		$t8,$inp,4*0
	ld.w		$a7,$inp,4*1
	ld.w		$a6,$inp,4*2
	ld.w		$a5,$inp,4*3
	xor			$t8,$t8,@x[0]
	xor			$a7,$a7,@x[1]
	xor			$a6,$a6,@x[2]
	xor			$a5,$a5,@x[3]
	st.w		$t8,$out,4*0
	st.w		$a7,$out,4*1
	st.w		$a6,$out,4*2
	st.w		$a5,$out,4*3

	ld.w		$t8,$inp,4*4
	ld.w		$a7,$inp,4*5
	ld.w		$a6,$inp,4*6
	ld.w		$a5,$inp,4*7
	xor			$t8,$t8,@x[4]
	xor			$a7,$a7,@x[5]
	xor			$a6,$a6,@x[6]
	xor			$a5,$a5,@x[7]
	st.w		$t8,$out,4*4
	st.w		$a7,$out,4*5
	st.w		$a6,$out,4*6
	st.w		$a5,$out,4*7

	ld.w		$t8,$inp,4*8
	ld.w		$a7,$inp,4*9
	ld.w		$a6,$inp,4*10
	ld.w		$a5,$inp,4*11
	xor			$t8,$t8,@x[8]
	xor			$a7,$a7,@x[9]
	xor			$a6,$a6,@x[10]
	xor			$a5,$a5,@x[11]
	st.w		$t8,$out,4*8
	st.w		$a7,$out,4*9
	st.w		$a6,$out,4*10
	st.w		$a5,$out,4*11

	ld.w		$t8,$inp,4*12
	ld.w		$a7,$inp,4*13
	ld.w		$a6,$inp,4*14
	ld.w		$a5,$inp,4*15
	xor			$t8,$t8,@x[12]
	xor			$a7,$a7,@x[13]
	xor			$a6,$a6,@x[14]
	xor			$a5,$a5,@x[15]
	st.w		$t8,$out,4*12
	st.w		$a7,$out,4*13
	st.w		$a6,$out,4*14
	st.w		$a5,$out,4*15

	addi.d		$len,$len,-64
	beqz		$len,.Ldone_1x
	addi.d		$inp,$inp,64
	addi.d		$out,$out,64
	addi.w		$s8,$s8,1
	b			.Loop_outer_1x

.align 4
.Ltail_1x:
	# Handle the tail for 1x (1 <= tail_len <= 63)
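	# Spill the whole 64-byte keystream block to the stack, then xor it
	# into the remaining bytes one at a time in .Loop_tail_1x.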
	addi.d		$a7,$sp,72
	st.w		@x[0],$a7,4*0
	st.w		@x[1],$a7,4*1
	st.w		@x[2],$a7,4*2
	st.w		@x[3],$a7,4*3
	st.w		@x[4],$a7,4*4
	st.w		@x[5],$a7,4*5
	st.w		@x[6],$a7,4*6
	st.w		@x[7],$a7,4*7
	st.w		@x[8],$a7,4*8
	st.w		@x[9],$a7,4*9
	st.w		@x[10],$a7,4*10
	st.w		@x[11],$a7,4*11
	st.w		@x[12],$a7,4*12
	st.w		@x[13],$a7,4*13
	st.w		@x[14],$a7,4*14
	st.w		@x[15],$a7,4*15

	move		$t8,$zero

.Loop_tail_1x:
	# Xor input with states byte by byte
	ldx.bu		$a6,$inp,$t8
	ldx.bu		$a5,$a7,$t8
	xor			$a6,$a6,$a5
	stx.b		$a6,$out,$t8
	addi.w		$t8,$t8,1
	addi.d		$len,$len,-1
	bnez		$len,.Loop_tail_1x
	b			.Ldone_1x

.Ldone_1x:
	ld.d		$s0,$sp,0
	ld.d		$s1,$sp,8
	ld.d		$s2,$sp,16
	ld.d		$s3,$sp,24
	ld.d		$s4,$sp,32
	ld.d		$s5,$sp,40
	ld.d		$s6,$sp,48
	ld.d		$s7,$sp,56
	ld.d		$s8,$sp,64
	addi.d		$sp,$sp,256

	b			.Lend

EOF
}

########################################################################
# 128-bit LSX code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly.
my @x = ($vr0, $vr1, $vr2, $vr3, $vr4, $vr5, $vr6, $vr7,
         $vr8, $vr9, $vr10, $vr11, $vr12, $vr13, $vr14, $vr15);

# Save the initial states in array @y[*]
my @y = ($vr16, $vr17, $vr18, $vr19, $vr20, $vr21, $vr22, $vr23,
         $vr24, $vr25, $vr26, $vr27, $vr28, $vr29, $vr30, $vr31);

sub ROUND_4x {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

$code .= <<EOF;
	vadd.w		@x[$a0],@x[$a0],@x[$b0]
	vxor.v		@x[$d0],@x[$d0],@x[$a0]
	vrotri.w	@x[$d0],@x[$d0],16      # rotate left 16 bits
	vadd.w		@x[$a1],@x[$a1],@x[$b1]
	vxor.v		@x[$d1],@x[$d1],@x[$a1]
	vrotri.w	@x[$d1],@x[$d1],16

	vadd.w		@x[$c0],@x[$c0],@x[$d0]
	vxor.v		@x[$b0],@x[$b0],@x[$c0]
	vrotri.w	@x[$b0],@x[$b0],20      # rotate left 12 bits
	vadd.w		@x[$c1],@x[$c1],@x[$d1]
	vxor.v		@x[$b1],@x[$b1],@x[$c1]
	vrotri.w	@x[$b1],@x[$b1],20

	vadd.w		@x[$a0],@x[$a0],@x[$b0]
	vxor.v		@x[$d0],@x[$d0],@x[$a0]
	vrotri.w	@x[$d0],@x[$d0],24      # rotate left 8 bits
	vadd.w		@x[$a1],@x[$a1],@x[$b1]
	vxor.v		@x[$d1],@x[$d1],@x[$a1]
	vrotri.w	@x[$d1],@x[$d1],24

	vadd.w		@x[$c0],@x[$c0],@x[$d0]
	vxor.v		@x[$b0],@x[$b0],@x[$c0]
	vrotri.w	@x[$b0],@x[$b0],25      # rotate left 7 bits
	vadd.w		@x[$c1],@x[$c1],@x[$d1]
	vxor.v		@x[$b1],@x[$b1],@x[$c1]
	vrotri.w	@x[$b1],@x[$b1],25

	vadd.w		@x[$a2],@x[$a2],@x[$b2]
	vxor.v		@x[$d2],@x[$d2],@x[$a2]
	vrotri.w	@x[$d2],@x[$d2],16
	vadd.w		@x[$a3],@x[$a3],@x[$b3]
	vxor.v		@x[$d3],@x[$d3],@x[$a3]
	vrotri.w	@x[$d3],@x[$d3],16

	vadd.w		@x[$c2],@x[$c2],@x[$d2]
	vxor.v		@x[$b2],@x[$b2],@x[$c2]
	vrotri.w	@x[$b2],@x[$b2],20
	vadd.w		@x[$c3],@x[$c3],@x[$d3]
	vxor.v		@x[$b3],@x[$b3],@x[$c3]
	vrotri.w	@x[$b3],@x[$b3],20

	vadd.w		@x[$a2],@x[$a2],@x[$b2]
	vxor.v		@x[$d2],@x[$d2],@x[$a2]
	vrotri.w	@x[$d2],@x[$d2],24
	vadd.w		@x[$a3],@x[$a3],@x[$b3]
	vxor.v		@x[$d3],@x[$d3],@x[$a3]
	vrotri.w	@x[$d3],@x[$d3],24

	vadd.w		@x[$c2],@x[$c2],@x[$d2]
	vxor.v		@x[$b2],@x[$b2],@x[$c2]
	vrotri.w	@x[$b2],@x[$b2],25
	vadd.w		@x[$c3],@x[$c3],@x[$d3]
	vxor.v		@x[$b3],@x[$b3],@x[$c3]
	vrotri.w	@x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_4x:
	addi.d		$sp,$sp,-128

	# Save the initial block counter in $t4
	ld.w		$t4,$counter,0
	b			.Loop_outer_4x

.align 5
.Loop_outer_4x:
	# Load constant
	la.local		$t8,.Lsigma
	vldrepl.w		@x[0],$t8,4*0		# 'expa'
	vldrepl.w		@x[1],$t8,4*1		# 'nd 3'
	vldrepl.w		@x[2],$t8,4*2		# '2-by'
	vldrepl.w		@x[3],$t8,4*3		# 'te k'

	# Load key
	vldrepl.w		@x[4],$key,4*0
	vldrepl.w		@x[5],$key,4*1
	vldrepl.w		@x[6],$key,4*2
	vldrepl.w		@x[7],$key,4*3
	vldrepl.w		@x[8],$key,4*4
	vldrepl.w		@x[9],$key,4*5
	vldrepl.w		@x[10],$key,4*6
	vldrepl.w		@x[11],$key,4*7

	# Load block counter
	vreplgr2vr.w	@x[12],$t4

	# Load nonce
	vldrepl.w		@x[13],$counter,4*1
	vldrepl.w		@x[14],$counter,4*2
	vldrepl.w		@x[15],$counter,4*3

	# Get the correct block counter for each block
	la.local		$t8,.Linc4x
	vld				@y[0],$t8,0
	vadd.w			@x[12],@x[12],@y[0]

	# Copy the initial states from \@x[*] to \@y[*]
	vori.b			@y[0],@x[0],0
	vori.b			@y[1],@x[1],0
	vori.b			@y[2],@x[2],0
	vori.b			@y[3],@x[3],0
	vori.b			@y[4],@x[4],0
	vori.b			@y[5],@x[5],0
	vori.b			@y[6],@x[6],0
	vori.b			@y[7],@x[7],0
	vori.b			@y[8],@x[8],0
	vori.b			@y[9],@x[9],0
	vori.b			@y[10],@x[10],0
	vori.b			@y[11],@x[11],0
	vori.b			@y[12],@x[12],0
	vori.b			@y[13],@x[13],0
	vori.b			@y[14],@x[14],0
	vori.b			@y[15],@x[15],0

	# Update states in \@x[*] for 20 rounds
	ori				$t8,$zero,10
	b				.Loop_4x

.align 5
.Loop_4x:
EOF

&ROUND_4x (0, 4, 8, 12);
&ROUND_4x (0, 5, 10, 15);

$code .= <<EOF;
	addi.w		$t8,$t8,-1
	bnez		$t8,.Loop_4x

	# Get the final states by adding the initial states
	vadd.w		@x[0],@x[0],@y[0]
	vadd.w		@x[1],@x[1],@y[1]
	vadd.w		@x[2],@x[2],@y[2]
	vadd.w		@x[3],@x[3],@y[3]
	vadd.w		@x[4],@x[4],@y[4]
	vadd.w		@x[5],@x[5],@y[5]
	vadd.w		@x[6],@x[6],@y[6]
	vadd.w		@x[7],@x[7],@y[7]
	vadd.w		@x[8],@x[8],@y[8]
	vadd.w		@x[9],@x[9],@y[9]
	vadd.w		@x[10],@x[10],@y[10]
	vadd.w		@x[11],@x[11],@y[11]
	vadd.w		@x[12],@x[12],@y[12]
	vadd.w		@x[13],@x[13],@y[13]
	vadd.w		@x[14],@x[14],@y[14]
	vadd.w		@x[15],@x[15],@y[15]

	# Get the transpose of \@x[*] and save them in \@x[*]
	vilvl.w		@y[0],@x[1],@x[0]
	vilvh.w		@y[1],@x[1],@x[0]
	vilvl.w		@y[2],@x[3],@x[2]
	vilvh.w		@y[3],@x[3],@x[2]
	vilvl.w		@y[4],@x[5],@x[4]
	vilvh.w		@y[5],@x[5],@x[4]
	vilvl.w		@y[6],@x[7],@x[6]
	vilvh.w		@y[7],@x[7],@x[6]
	vilvl.w		@y[8],@x[9],@x[8]
	vilvh.w		@y[9],@x[9],@x[8]
	vilvl.w		@y[10],@x[11],@x[10]
	vilvh.w		@y[11],@x[11],@x[10]
	vilvl.w		@y[12],@x[13],@x[12]
	vilvh.w		@y[13],@x[13],@x[12]
	vilvl.w		@y[14],@x[15],@x[14]
	vilvh.w		@y[15],@x[15],@x[14]

	vilvl.d		@x[0],@y[2],@y[0]
	vilvh.d		@x[1],@y[2],@y[0]
	vilvl.d		@x[2],@y[3],@y[1]
	vilvh.d		@x[3],@y[3],@y[1]
	vilvl.d		@x[4],@y[6],@y[4]
	vilvh.d		@x[5],@y[6],@y[4]
	vilvl.d		@x[6],@y[7],@y[5]
	vilvh.d		@x[7],@y[7],@y[5]
	vilvl.d		@x[8],@y[10],@y[8]
	vilvh.d		@x[9],@y[10],@y[8]
	vilvl.d		@x[10],@y[11],@y[9]
	vilvh.d		@x[11],@y[11],@y[9]
	vilvl.d		@x[12],@y[14],@y[12]
	vilvh.d		@x[13],@y[14],@y[12]
	vilvl.d		@x[14],@y[15],@y[13]
	vilvh.d		@x[15],@y[15],@y[13]
EOF

# Adjust the order of elements in @x[*] for ease of use.
@x = (@x[0],@x[4],@x[8],@x[12],@x[1],@x[5],@x[9],@x[13],
      @x[2],@x[6],@x[10],@x[14],@x[3],@x[7],@x[11],@x[15]);
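# After the transpose, @x[0],@x[4],@x[8],@x[12] hold the four 16-byte rows of
# block 0 (and likewise for blocks 1-3), so after this reordering @x[i] is
# simply the i-th 16-byte chunk of the 256-byte keystream.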

$code .= <<EOF;
	ori			$t8,$zero,64*4
	bltu		$len,$t8,.Ltail_4x

	# Get the encrypted message by xoring the states with the plaintext
	vld			@y[0],$inp,16*0
	vld			@y[1],$inp,16*1
	vld			@y[2],$inp,16*2
	vld			@y[3],$inp,16*3
	vxor.v		@y[0],@y[0],@x[0]
	vxor.v		@y[1],@y[1],@x[1]
	vxor.v		@y[2],@y[2],@x[2]
	vxor.v		@y[3],@y[3],@x[3]
	vst			@y[0],$out,16*0
	vst			@y[1],$out,16*1
	vst			@y[2],$out,16*2
	vst			@y[3],$out,16*3

	vld			@y[0],$inp,16*4
	vld			@y[1],$inp,16*5
	vld			@y[2],$inp,16*6
	vld			@y[3],$inp,16*7
	vxor.v		@y[0],@y[0],@x[4]
	vxor.v		@y[1],@y[1],@x[5]
	vxor.v		@y[2],@y[2],@x[6]
	vxor.v		@y[3],@y[3],@x[7]
	vst			@y[0],$out,16*4
	vst			@y[1],$out,16*5
	vst			@y[2],$out,16*6
	vst			@y[3],$out,16*7

	vld			@y[0],$inp,16*8
	vld			@y[1],$inp,16*9
	vld			@y[2],$inp,16*10
	vld			@y[3],$inp,16*11
	vxor.v		@y[0],@y[0],@x[8]
	vxor.v		@y[1],@y[1],@x[9]
	vxor.v		@y[2],@y[2],@x[10]
	vxor.v		@y[3],@y[3],@x[11]
	vst			@y[0],$out,16*8
	vst			@y[1],$out,16*9
	vst			@y[2],$out,16*10
	vst			@y[3],$out,16*11

	vld			@y[0],$inp,16*12
	vld			@y[1],$inp,16*13
	vld			@y[2],$inp,16*14
	vld			@y[3],$inp,16*15
	vxor.v		@y[0],@y[0],@x[12]
	vxor.v		@y[1],@y[1],@x[13]
	vxor.v		@y[2],@y[2],@x[14]
	vxor.v		@y[3],@y[3],@x[15]
	vst			@y[0],$out,16*12
	vst			@y[1],$out,16*13
	vst			@y[2],$out,16*14
	vst			@y[3],$out,16*15

	addi.d		$len,$len,-64*4
	beqz		$len,.Ldone_4x
	addi.d		$inp,$inp,64*4
	addi.d		$out,$out,64*4
	addi.w		$t4,$t4,4
	b			.Loop_outer_4x

.Ltail_4x:
	# Handle the tail for 4x (1 <= tail_len <= 255)
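	# Copy out as many whole 64-byte blocks as still fit, spill the next
	# 64 bytes of keystream to the stack, and finish the remaining bytes
	# in .Loop_tail_4x.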
	ori			$t8,$zero,192
	bgeu		$len,$t8,.L192_or_more4x
	ori			$t8,$zero,128
	bgeu		$len,$t8,.L128_or_more4x
	ori			$t8,$zero,64
	bgeu		$len,$t8,.L64_or_more4x

	vst			@x[0],$sp,16*0
	vst			@x[1],$sp,16*1
	vst			@x[2],$sp,16*2
	vst			@x[3],$sp,16*3
	move		$t8,$zero
	b			.Loop_tail_4x

.align 5
.L64_or_more4x:
	vld			@y[0],$inp,16*0
	vld			@y[1],$inp,16*1
	vld			@y[2],$inp,16*2
	vld			@y[3],$inp,16*3
	vxor.v		@y[0],@y[0],@x[0]
	vxor.v		@y[1],@y[1],@x[1]
	vxor.v		@y[2],@y[2],@x[2]
	vxor.v		@y[3],@y[3],@x[3]
	vst			@y[0],$out,16*0
	vst			@y[1],$out,16*1
	vst			@y[2],$out,16*2
	vst			@y[3],$out,16*3

	addi.d		$len,$len,-64
	beqz		$len,.Ldone_4x
	addi.d		$inp,$inp,64
	addi.d		$out,$out,64
	vst			@x[4],$sp,16*0
	vst			@x[5],$sp,16*1
	vst			@x[6],$sp,16*2
	vst			@x[7],$sp,16*3
	move		$t8,$zero
	b			.Loop_tail_4x

.align 5
.L128_or_more4x:
	vld			@y[0],$inp,16*0
	vld			@y[1],$inp,16*1
	vld			@y[2],$inp,16*2
	vld			@y[3],$inp,16*3
	vxor.v		@y[0],@y[0],@x[0]
	vxor.v		@y[1],@y[1],@x[1]
	vxor.v		@y[2],@y[2],@x[2]
	vxor.v		@y[3],@y[3],@x[3]
	vst			@y[0],$out,16*0
	vst			@y[1],$out,16*1
	vst			@y[2],$out,16*2
	vst			@y[3],$out,16*3

	vld			@y[0],$inp,16*4
	vld			@y[1],$inp,16*5
	vld			@y[2],$inp,16*6
	vld			@y[3],$inp,16*7
	vxor.v		@y[0],@y[0],@x[4]
	vxor.v		@y[1],@y[1],@x[5]
	vxor.v		@y[2],@y[2],@x[6]
	vxor.v		@y[3],@y[3],@x[7]
	vst			@y[0],$out,16*4
	vst			@y[1],$out,16*5
	vst			@y[2],$out,16*6
	vst			@y[3],$out,16*7

	addi.d		$len,$len,-128
	beqz		$len,.Ldone_4x
	addi.d		$inp,$inp,128
	addi.d		$out,$out,128
	vst			@x[8],$sp,16*0
	vst			@x[9],$sp,16*1
	vst			@x[10],$sp,16*2
	vst			@x[11],$sp,16*3
	move		$t8,$zero
	b			.Loop_tail_4x

.align 5
.L192_or_more4x:
	vld			@y[0],$inp,16*0
	vld			@y[1],$inp,16*1
	vld			@y[2],$inp,16*2
	vld			@y[3],$inp,16*3
	vxor.v		@y[0],@y[0],@x[0]
	vxor.v		@y[1],@y[1],@x[1]
	vxor.v		@y[2],@y[2],@x[2]
	vxor.v		@y[3],@y[3],@x[3]
	vst			@y[0],$out,16*0
	vst			@y[1],$out,16*1
	vst			@y[2],$out,16*2
	vst			@y[3],$out,16*3

	vld			@y[0],$inp,16*4
	vld			@y[1],$inp,16*5
	vld			@y[2],$inp,16*6
	vld			@y[3],$inp,16*7
	vxor.v		@y[0],@y[0],@x[4]
	vxor.v		@y[1],@y[1],@x[5]
	vxor.v		@y[2],@y[2],@x[6]
	vxor.v		@y[3],@y[3],@x[7]
	vst			@y[0],$out,16*4
	vst			@y[1],$out,16*5
	vst			@y[2],$out,16*6
	vst			@y[3],$out,16*7

	vld			@y[0],$inp,16*8
	vld			@y[1],$inp,16*9
	vld			@y[2],$inp,16*10
	vld			@y[3],$inp,16*11
	vxor.v		@y[0],@y[0],@x[8]
	vxor.v		@y[1],@y[1],@x[9]
	vxor.v		@y[2],@y[2],@x[10]
	vxor.v		@y[3],@y[3],@x[11]
	vst			@y[0],$out,16*8
	vst			@y[1],$out,16*9
	vst			@y[2],$out,16*10
	vst			@y[3],$out,16*11

	addi.d		$len,$len,-192
	beqz		$len,.Ldone_4x
	addi.d		$inp,$inp,192
	addi.d		$out,$out,192
	vst			@x[12],$sp,16*0
	vst			@x[13],$sp,16*1
	vst			@x[14],$sp,16*2
	vst			@x[15],$sp,16*3
	move		$t8,$zero
	b			.Loop_tail_4x

.Loop_tail_4x:
	# Xor input with states byte by byte
	ldx.bu		$t5,$inp,$t8
	ldx.bu		$t6,$sp,$t8
	xor			$t5,$t5,$t6
	stx.b		$t5,$out,$t8
	addi.w		$t8,$t8,1
	addi.d		$len,$len,-1
	bnez		$len,.Loop_tail_4x
	b			.Ldone_4x

.Ldone_4x:
	addi.d		$sp,$sp,128
	b			.Lrestore_saved_fpr

EOF
}

########################################################################
# 256-bit LASX code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly.
my @x = ($xr0, $xr1, $xr2, $xr3, $xr4, $xr5, $xr6, $xr7,
         $xr8, $xr9, $xr10, $xr11, $xr12, $xr13, $xr14, $xr15);

# Save the initial states in array @y[*]
my @y = ($xr16, $xr17, $xr18, $xr19, $xr20, $xr21, $xr22, $xr23,
         $xr24, $xr25, $xr26, $xr27, $xr28, $xr29, $xr30, $xr31);

sub ROUND_8x {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

$code .= <<EOF;
	xvadd.w		@x[$a0],@x[$a0],@x[$b0]
	xvxor.v		@x[$d0],@x[$d0],@x[$a0]
	xvrotri.w	@x[$d0],@x[$d0],16      # rotate left 16 bits
	xvadd.w		@x[$a1],@x[$a1],@x[$b1]
	xvxor.v		@x[$d1],@x[$d1],@x[$a1]
	xvrotri.w	@x[$d1],@x[$d1],16

	xvadd.w		@x[$c0],@x[$c0],@x[$d0]
	xvxor.v		@x[$b0],@x[$b0],@x[$c0]
	xvrotri.w	@x[$b0],@x[$b0],20      # rotate left 12 bits
	xvadd.w		@x[$c1],@x[$c1],@x[$d1]
	xvxor.v		@x[$b1],@x[$b1],@x[$c1]
	xvrotri.w	@x[$b1],@x[$b1],20

	xvadd.w		@x[$a0],@x[$a0],@x[$b0]
	xvxor.v		@x[$d0],@x[$d0],@x[$a0]
	xvrotri.w	@x[$d0],@x[$d0],24      # rotate left 8 bits
	xvadd.w		@x[$a1],@x[$a1],@x[$b1]
	xvxor.v		@x[$d1],@x[$d1],@x[$a1]
	xvrotri.w	@x[$d1],@x[$d1],24

	xvadd.w		@x[$c0],@x[$c0],@x[$d0]
	xvxor.v		@x[$b0],@x[$b0],@x[$c0]
	xvrotri.w	@x[$b0],@x[$b0],25      # rotate left 7 bits
	xvadd.w		@x[$c1],@x[$c1],@x[$d1]
	xvxor.v		@x[$b1],@x[$b1],@x[$c1]
	xvrotri.w	@x[$b1],@x[$b1],25

	xvadd.w		@x[$a2],@x[$a2],@x[$b2]
	xvxor.v		@x[$d2],@x[$d2],@x[$a2]
	xvrotri.w	@x[$d2],@x[$d2],16
	xvadd.w		@x[$a3],@x[$a3],@x[$b3]
	xvxor.v		@x[$d3],@x[$d3],@x[$a3]
	xvrotri.w	@x[$d3],@x[$d3],16

	xvadd.w		@x[$c2],@x[$c2],@x[$d2]
	xvxor.v		@x[$b2],@x[$b2],@x[$c2]
	xvrotri.w	@x[$b2],@x[$b2],20
	xvadd.w		@x[$c3],@x[$c3],@x[$d3]
	xvxor.v		@x[$b3],@x[$b3],@x[$c3]
	xvrotri.w	@x[$b3],@x[$b3],20

	xvadd.w		@x[$a2],@x[$a2],@x[$b2]
	xvxor.v		@x[$d2],@x[$d2],@x[$a2]
	xvrotri.w	@x[$d2],@x[$d2],24
	xvadd.w		@x[$a3],@x[$a3],@x[$b3]
	xvxor.v		@x[$d3],@x[$d3],@x[$a3]
	xvrotri.w	@x[$d3],@x[$d3],24

	xvadd.w		@x[$c2],@x[$c2],@x[$d2]
	xvxor.v		@x[$b2],@x[$b2],@x[$c2]
	xvrotri.w	@x[$b2],@x[$b2],25
	xvadd.w		@x[$c3],@x[$c3],@x[$d3]
	xvxor.v		@x[$b3],@x[$b3],@x[$c3]
	xvrotri.w	@x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_8x:
	addi.d		$sp,$sp,-128

	# Save the initial block counter in $t4
	ld.w		$t4,$counter,0
	b			.Loop_outer_8x

.align 5
.Loop_outer_8x:
	# Load constant
	la.local		$t8,.Lsigma
	xvldrepl.w		@x[0],$t8,4*0		# 'expa'
	xvldrepl.w		@x[1],$t8,4*1		# 'nd 3'
	xvldrepl.w		@x[2],$t8,4*2		# '2-by'
	xvldrepl.w		@x[3],$t8,4*3		# 'te k'

	# Load key
	xvldrepl.w		@x[4],$key,4*0
	xvldrepl.w		@x[5],$key,4*1
	xvldrepl.w		@x[6],$key,4*2
	xvldrepl.w		@x[7],$key,4*3
	xvldrepl.w		@x[8],$key,4*4
	xvldrepl.w		@x[9],$key,4*5
	xvldrepl.w		@x[10],$key,4*6
	xvldrepl.w		@x[11],$key,4*7

	# Load block counter
	xvreplgr2vr.w	@x[12],$t4

	# Load nonce
	xvldrepl.w		@x[13],$counter,4*1
	xvldrepl.w		@x[14],$counter,4*2
	xvldrepl.w		@x[15],$counter,4*3

	# Get the correct block counter for each block
	la.local		$t8,.Linc8x
	xvld			@y[0],$t8,0
	xvadd.w			@x[12],@x[12],@y[0]

	# Copy the initial states from \@x[*] to \@y[*]
	xvori.b			@y[0],@x[0],0
	xvori.b			@y[1],@x[1],0
	xvori.b			@y[2],@x[2],0
	xvori.b			@y[3],@x[3],0
	xvori.b			@y[4],@x[4],0
	xvori.b			@y[5],@x[5],0
	xvori.b			@y[6],@x[6],0
	xvori.b			@y[7],@x[7],0
	xvori.b			@y[8],@x[8],0
	xvori.b			@y[9],@x[9],0
	xvori.b			@y[10],@x[10],0
	xvori.b			@y[11],@x[11],0
	xvori.b			@y[12],@x[12],0
	xvori.b			@y[13],@x[13],0
	xvori.b			@y[14],@x[14],0
	xvori.b			@y[15],@x[15],0

	# Update states in \@x[*] for 20 rounds
	ori				$t8,$zero,10
	b				.Loop_8x

.align 5
.Loop_8x:
EOF

&ROUND_8x (0, 4, 8, 12);
&ROUND_8x (0, 5, 10, 15);

$code .= <<EOF;
	addi.w		$t8,$t8,-1
	bnez		$t8,.Loop_8x

	# Get the final states by adding the initial states
	xvadd.w		@x[0],@x[0],@y[0]
	xvadd.w		@x[1],@x[1],@y[1]
	xvadd.w		@x[2],@x[2],@y[2]
	xvadd.w		@x[3],@x[3],@y[3]
	xvadd.w		@x[4],@x[4],@y[4]
	xvadd.w		@x[5],@x[5],@y[5]
	xvadd.w		@x[6],@x[6],@y[6]
	xvadd.w		@x[7],@x[7],@y[7]
	xvadd.w		@x[8],@x[8],@y[8]
	xvadd.w		@x[9],@x[9],@y[9]
	xvadd.w		@x[10],@x[10],@y[10]
	xvadd.w		@x[11],@x[11],@y[11]
	xvadd.w		@x[12],@x[12],@y[12]
	xvadd.w		@x[13],@x[13],@y[13]
	xvadd.w		@x[14],@x[14],@y[14]
	xvadd.w		@x[15],@x[15],@y[15]

	# Get the transpose of \@x[*] and save them in \@y[*]
	xvilvl.w	@y[0],@x[1],@x[0]
	xvilvh.w	@y[1],@x[1],@x[0]
	xvilvl.w	@y[2],@x[3],@x[2]
	xvilvh.w	@y[3],@x[3],@x[2]
	xvilvl.w	@y[4],@x[5],@x[4]
	xvilvh.w	@y[5],@x[5],@x[4]
	xvilvl.w	@y[6],@x[7],@x[6]
	xvilvh.w	@y[7],@x[7],@x[6]
	xvilvl.w	@y[8],@x[9],@x[8]
	xvilvh.w	@y[9],@x[9],@x[8]
	xvilvl.w	@y[10],@x[11],@x[10]
	xvilvh.w	@y[11],@x[11],@x[10]
	xvilvl.w	@y[12],@x[13],@x[12]
	xvilvh.w	@y[13],@x[13],@x[12]
	xvilvl.w	@y[14],@x[15],@x[14]
	xvilvh.w	@y[15],@x[15],@x[14]

	xvilvl.d	@x[0],@y[2],@y[0]
	xvilvh.d	@x[1],@y[2],@y[0]
	xvilvl.d	@x[2],@y[3],@y[1]
	xvilvh.d	@x[3],@y[3],@y[1]
	xvilvl.d	@x[4],@y[6],@y[4]
	xvilvh.d	@x[5],@y[6],@y[4]
	xvilvl.d	@x[6],@y[7],@y[5]
	xvilvh.d	@x[7],@y[7],@y[5]
	xvilvl.d	@x[8],@y[10],@y[8]
	xvilvh.d	@x[9],@y[10],@y[8]
	xvilvl.d	@x[10],@y[11],@y[9]
	xvilvh.d	@x[11],@y[11],@y[9]
	xvilvl.d	@x[12],@y[14],@y[12]
	xvilvh.d	@x[13],@y[14],@y[12]
	xvilvl.d	@x[14],@y[15],@y[13]
	xvilvh.d	@x[15],@y[15],@y[13]

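	# Merge the 128-bit lanes of the transposed rows so that each \@y[*]
	# register below holds 32 consecutive bytes of a single block's
	# keystream.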
	xvori.b		@y[0],@x[4],0
	xvpermi.q	@y[0],@x[0],0x20
	xvori.b		@y[1],@x[5],0
	xvpermi.q	@y[1],@x[1],0x20
	xvori.b		@y[2],@x[6],0
	xvpermi.q	@y[2],@x[2],0x20
	xvori.b		@y[3],@x[7],0
	xvpermi.q	@y[3],@x[3],0x20
	xvori.b		@y[4],@x[4],0
	xvpermi.q	@y[4],@x[0],0x31
	xvori.b		@y[5],@x[5],0
	xvpermi.q	@y[5],@x[1],0x31
	xvori.b		@y[6],@x[6],0
	xvpermi.q	@y[6],@x[2],0x31
	xvori.b		@y[7],@x[7],0
	xvpermi.q	@y[7],@x[3],0x31
	xvori.b		@y[8],@x[12],0
	xvpermi.q	@y[8],@x[8],0x20
	xvori.b		@y[9],@x[13],0
	xvpermi.q	@y[9],@x[9],0x20
	xvori.b		@y[10],@x[14],0
	xvpermi.q	@y[10],@x[10],0x20
	xvori.b		@y[11],@x[15],0
	xvpermi.q	@y[11],@x[11],0x20
	xvori.b		@y[12],@x[12],0
	xvpermi.q	@y[12],@x[8],0x31
	xvori.b		@y[13],@x[13],0
	xvpermi.q	@y[13],@x[9],0x31
	xvori.b		@y[14],@x[14],0
	xvpermi.q	@y[14],@x[10],0x31
	xvori.b		@y[15],@x[15],0
	xvpermi.q	@y[15],@x[11],0x31

EOF

# Adjust the order of elements in @y[*] for ease of use.
@y = (@y[0],@y[8],@y[1],@y[9],@y[2],@y[10],@y[3],@y[11],
      @y[4],@y[12],@y[5],@y[13],@y[6],@y[14],@y[7],@y[15]);
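# After the lane merge, @y[0] and @y[8] hold the two 32-byte halves of block 0,
# @y[1] and @y[9] those of block 1, and so on; after this reordering @y[i] is
# simply the i-th 32-byte chunk of the 512-byte keystream.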

$code .= <<EOF;
	ori			$t8,$zero,64*8
	bltu		$len,$t8,.Ltail_8x

	# Get the encrypted message by xoring the states with the plaintext
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	xvld		@x[0],$inp,32*4
	xvld		@x[1],$inp,32*5
	xvld		@x[2],$inp,32*6
	xvld		@x[3],$inp,32*7
	xvxor.v		@x[0],@x[0],@y[4]
	xvxor.v		@x[1],@x[1],@y[5]
	xvxor.v		@x[2],@x[2],@y[6]
	xvxor.v		@x[3],@x[3],@y[7]
	xvst		@x[0],$out,32*4
	xvst		@x[1],$out,32*5
	xvst		@x[2],$out,32*6
	xvst		@x[3],$out,32*7

	xvld		@x[0],$inp,32*8
	xvld		@x[1],$inp,32*9
	xvld		@x[2],$inp,32*10
	xvld		@x[3],$inp,32*11
	xvxor.v		@x[0],@x[0],@y[8]
	xvxor.v		@x[1],@x[1],@y[9]
	xvxor.v		@x[2],@x[2],@y[10]
	xvxor.v		@x[3],@x[3],@y[11]
	xvst		@x[0],$out,32*8
	xvst		@x[1],$out,32*9
	xvst		@x[2],$out,32*10
	xvst		@x[3],$out,32*11

	xvld		@x[0],$inp,32*12
	xvld		@x[1],$inp,32*13
	xvld		@x[2],$inp,32*14
	xvld		@x[3],$inp,32*15
	xvxor.v		@x[0],@x[0],@y[12]
	xvxor.v		@x[1],@x[1],@y[13]
	xvxor.v		@x[2],@x[2],@y[14]
	xvxor.v		@x[3],@x[3],@y[15]
	xvst		@x[0],$out,32*12
	xvst		@x[1],$out,32*13
	xvst		@x[2],$out,32*14
	xvst		@x[3],$out,32*15

	addi.d		$len,$len,-64*8
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,64*8
	addi.d		$out,$out,64*8
	addi.w		$t4,$t4,8
	b			.Loop_outer_8x

.Ltail_8x:
	# Handle the tail for 8x (1 <= tail_len <= 511)
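	# Same approach as the 4x tail: copy out whole 64-byte blocks while
	# they fit, spill the next 64 bytes of keystream to the stack, and
	# finish byte by byte in .Loop_tail_8x.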
	ori			$t8,$zero,448
	bgeu		$len,$t8,.L448_or_more8x
	ori			$t8,$zero,384
	bgeu		$len,$t8,.L384_or_more8x
	ori			$t8,$zero,320
	bgeu		$len,$t8,.L320_or_more8x
	ori			$t8,$zero,256
	bgeu		$len,$t8,.L256_or_more8x
	ori			$t8,$zero,192
	bgeu		$len,$t8,.L192_or_more8x
	ori			$t8,$zero,128
	bgeu		$len,$t8,.L128_or_more8x
	ori			$t8,$zero,64
	bgeu		$len,$t8,.L64_or_more8x

	xvst		@y[0],$sp,32*0
	xvst		@y[1],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L64_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1

	addi.d		$len,$len,-64
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,64
	addi.d		$out,$out,64
	xvst		@y[2],$sp,32*0
	xvst		@y[3],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L128_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	addi.d		$len,$len,-128
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,128
	addi.d		$out,$out,128
	xvst		@y[4],$sp,32*0
	xvst		@y[5],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L192_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	xvld		@x[0],$inp,32*4
	xvld		@x[1],$inp,32*5
	xvxor.v		@x[0],@x[0],@y[4]
	xvxor.v		@x[1],@x[1],@y[5]
	xvst		@x[0],$out,32*4
	xvst		@x[1],$out,32*5

	addi.d		$len,$len,-192
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,192
	addi.d		$out,$out,192
	xvst		@y[6],$sp,32*0
	xvst		@y[7],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L256_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	xvld		@x[0],$inp,32*4
	xvld		@x[1],$inp,32*5
	xvld		@x[2],$inp,32*6
	xvld		@x[3],$inp,32*7
	xvxor.v		@x[0],@x[0],@y[4]
	xvxor.v		@x[1],@x[1],@y[5]
	xvxor.v		@x[2],@x[2],@y[6]
	xvxor.v		@x[3],@x[3],@y[7]
	xvst		@x[0],$out,32*4
	xvst		@x[1],$out,32*5
	xvst		@x[2],$out,32*6
	xvst		@x[3],$out,32*7

	addi.d		$len,$len,-256
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,256
	addi.d		$out,$out,256
	xvst		@y[8],$sp,32*0
	xvst		@y[9],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L320_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	xvld		@x[0],$inp,32*4
	xvld		@x[1],$inp,32*5
	xvld		@x[2],$inp,32*6
	xvld		@x[3],$inp,32*7
	xvxor.v		@x[0],@x[0],@y[4]
	xvxor.v		@x[1],@x[1],@y[5]
	xvxor.v		@x[2],@x[2],@y[6]
	xvxor.v		@x[3],@x[3],@y[7]
	xvst		@x[0],$out,32*4
	xvst		@x[1],$out,32*5
	xvst		@x[2],$out,32*6
	xvst		@x[3],$out,32*7

	xvld		@x[0],$inp,32*8
	xvld		@x[1],$inp,32*9
	xvxor.v		@x[0],@x[0],@y[8]
	xvxor.v		@x[1],@x[1],@y[9]
	xvst		@x[0],$out,32*8
	xvst		@x[1],$out,32*9

	addi.d		$len,$len,-320
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,320
	addi.d		$out,$out,320
	xvst		@y[10],$sp,32*0
	xvst		@y[11],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L384_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	xvld		@x[0],$inp,32*4
	xvld		@x[1],$inp,32*5
	xvld		@x[2],$inp,32*6
	xvld		@x[3],$inp,32*7
	xvxor.v		@x[0],@x[0],@y[4]
	xvxor.v		@x[1],@x[1],@y[5]
	xvxor.v		@x[2],@x[2],@y[6]
	xvxor.v		@x[3],@x[3],@y[7]
	xvst		@x[0],$out,32*4
	xvst		@x[1],$out,32*5
	xvst		@x[2],$out,32*6
	xvst		@x[3],$out,32*7

	xvld		@x[0],$inp,32*8
	xvld		@x[1],$inp,32*9
	xvld		@x[2],$inp,32*10
	xvld		@x[3],$inp,32*11
	xvxor.v		@x[0],@x[0],@y[8]
	xvxor.v		@x[1],@x[1],@y[9]
	xvxor.v		@x[2],@x[2],@y[10]
	xvxor.v		@x[3],@x[3],@y[11]
	xvst		@x[0],$out,32*8
	xvst		@x[1],$out,32*9
	xvst		@x[2],$out,32*10
	xvst		@x[3],$out,32*11

	addi.d		$len,$len,-384
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,384
	addi.d		$out,$out,384
	xvst		@y[12],$sp,32*0
	xvst		@y[13],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.align 5
.L448_or_more8x:
	xvld		@x[0],$inp,32*0
	xvld		@x[1],$inp,32*1
	xvld		@x[2],$inp,32*2
	xvld		@x[3],$inp,32*3
	xvxor.v		@x[0],@x[0],@y[0]
	xvxor.v		@x[1],@x[1],@y[1]
	xvxor.v		@x[2],@x[2],@y[2]
	xvxor.v		@x[3],@x[3],@y[3]
	xvst		@x[0],$out,32*0
	xvst		@x[1],$out,32*1
	xvst		@x[2],$out,32*2
	xvst		@x[3],$out,32*3

	xvld		@x[0],$inp,32*4
	xvld		@x[1],$inp,32*5
	xvld		@x[2],$inp,32*6
	xvld		@x[3],$inp,32*7
	xvxor.v		@x[0],@x[0],@y[4]
	xvxor.v		@x[1],@x[1],@y[5]
	xvxor.v		@x[2],@x[2],@y[6]
	xvxor.v		@x[3],@x[3],@y[7]
	xvst		@x[0],$out,32*4
	xvst		@x[1],$out,32*5
	xvst		@x[2],$out,32*6
	xvst		@x[3],$out,32*7

	xvld		@x[0],$inp,32*8
	xvld		@x[1],$inp,32*9
	xvld		@x[2],$inp,32*10
	xvld		@x[3],$inp,32*11
	xvxor.v		@x[0],@x[0],@y[8]
	xvxor.v		@x[1],@x[1],@y[9]
	xvxor.v		@x[2],@x[2],@y[10]
	xvxor.v		@x[3],@x[3],@y[11]
	xvst		@x[0],$out,32*8
	xvst		@x[1],$out,32*9
	xvst		@x[2],$out,32*10
	xvst		@x[3],$out,32*11

	xvld		@x[0],$inp,32*12
	xvld		@x[1],$inp,32*13
	xvxor.v		@x[0],@x[0],@y[12]
	xvxor.v		@x[1],@x[1],@y[13]
	xvst		@x[0],$out,32*12
	xvst		@x[1],$out,32*13

	addi.d		$len,$len,-448
	beqz		$len,.Ldone_8x
	addi.d		$inp,$inp,448
	addi.d		$out,$out,448
	xvst		@y[14],$sp,32*0
	xvst		@y[15],$sp,32*1
	move		$t8,$zero
	b			.Loop_tail_8x

.Loop_tail_8x:
	# Xor input with states byte by byte
	ldx.bu		$t5,$inp,$t8
	ldx.bu		$t6,$sp,$t8
	xor			$t5,$t5,$t6
	stx.b		$t5,$out,$t8
	addi.w		$t8,$t8,1
	addi.d		$len,$len,-1
	bnez		$len,.Loop_tail_8x
	b			.Ldone_8x

.Ldone_8x:
	addi.d		$sp,$sp,128
	b			.Lrestore_saved_fpr

EOF
}

$code .= <<EOF;
.Lrestore_saved_fpr:
	fld.d		$fs0,$sp,0
	fld.d		$fs1,$sp,8
	fld.d		$fs2,$sp,16
	fld.d		$fs3,$sp,24
	fld.d		$fs4,$sp,32
	fld.d		$fs5,$sp,40
	fld.d		$fs6,$sp,48
	fld.d		$fs7,$sp,56
	addi.d		$sp,$sp,64
.Lno_data:
.Lend:
	jr	$ra
.size ChaCha20_ctr32,.-ChaCha20_ctr32
EOF

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;