1#! /usr/bin/env perl
2# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# November 2014
18#
19# ChaCha20 for x86_64.
20#
21# December 2016
22#
23# Add AVX512F code path.
24#
25# December 2017
26#
27# Add AVX512VL code path.
28#
# Performance in cycles per byte out of a large buffer.
30#
31#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
32#
33# P4		9.48/+99%	-		-
34# Core2		7.83/+55%	7.90/5.76	4.35
35# Westmere	7.19/+50%	5.60/4.50	3.00
36# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
37# Ivy Bridge	6.71/+46%	5.40/?		2.41
38# Haswell	5.92/+43%	5.20/3.45	2.42        1.23
39# Skylake[-X]	5.87/+39%	4.70/3.22	2.31        1.19[0.80(vi)]
40# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
41# Knights L	11.7/-		?		9.60(iii)   0.80
42# Goldmont	10.6/+17%	5.10/3.52	3.28
43# Sledgehammer	7.28/+52%	-		-
44# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
45# Ryzen		5.96/+50%	5.19/3.00	2.40        2.09
46# VIA Nano	10.5/+46%	6.72/6.88	6.05
47#
48# (i)	compared to older gcc 3.x one can observe >2x improvement on
49#	most platforms;
# (ii)	2xSSSE3 is a code path optimized specifically for the 128-byte
#	inputs used by chacha20_poly1305_tls_cipher, results are EVP-free;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations, SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers
#	2.20 and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever applies best;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in a single thread, the corresponding capability is suppressed;
60
61# $output is the last argument if it looks like a file (it has an extension)
62# $flavour is the first argument if it doesn't look like a file
63$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
64$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
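
# For reference, a typical invocation passes the perlasm flavour first
# and the output file last, e.g. (paths and names illustrative only):
#
#	perl chacha-x86_64.pl elf chacha-x86_64.s
#	perl chacha-x86_64.pl nasm chacha-x86_64.asm
#
# Either argument may be omitted, in which case the corresponding
# variable above simply stays undefined.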
65
66$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
67
68$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
69( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
70( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
71die "can't locate x86_64-xlate.pl";
72
73if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
74		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
75	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
76}
77
78if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
79	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
80	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
81	$avx += 1 if ($1==2.11 && $2>=8);
82}
83
84if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
85	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
86	$avx = ($1>=10) + ($1>=11);
87}
88
89if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
90	$avx = ($2>=3.0) + ($2>3.0);
91}
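
# The probes above leave $avx somewhere in 0..3, reflecting how capable
# the assembler is: 1 enables the AVX/XOP code, 2 additionally the AVX2
# code and 3 the AVX512 code. A rough sketch of the gating used further
# down (the *_code names here are purely illustrative, the real code
# simply wraps the generators in "if ($avx) {...}" style blocks):
#
#	$code .= $avx512_code	if ($avx>2);
#	$code .= $avx2_code	if ($avx>1);
#	$code .= $xop_code	if ($avx);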
92
93open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
94    or die "can't call $xlate: $!";
95*STDOUT=*OUT;
96
97# input parameter block
98($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
99
100$code.=<<___;
101.text
102
103.extern OPENSSL_ia32cap_P
104
105.section .rodata align=64
106.align	64
107.Lzero:
108.long	0,0,0,0
109.Lone:
110.long	1,0,0,0
111.Linc:
112.long	0,1,2,3
113.Lfour:
114.long	4,4,4,4
115.Lincy:
116.long	0,2,4,6,1,3,5,7
117.Leight:
118.long	8,8,8,8,8,8,8,8
119.Lrot16:
120.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
121.Lrot24:
122.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
123.Ltwoy:
124.long	2,0,0,0, 2,0,0,0
125.align	64
126.Lzeroz:
127.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
128.Lfourz:
129.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
130.Lincz:
131.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
132.Lsixteen:
133.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
134.Lsigma:
135.asciz	"expand 32-byte k"
136.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
137.previous
138___
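
# A quick guide to the .rodata constants above: .Lsigma is the standard
# "expand 32-byte k" ChaCha20 constant; .Lrot16 and .Lrot24 are pshufb
# masks implementing the byte-granular rotates; the remaining .Lone/
# .Lfour/.Linc/.Leight/.Lincy/.Lincz/... entries provide the per-lane
# counter offsets and increments for the 1x/4x/8x/16x code paths below.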
139
140sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
141{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
142  my $arg = pop;
143    $arg = "\$$arg" if ($arg*1 eq $arg);
144    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
145}
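
# A hypothetical illustration of the AUTOLOAD thunk above: a perlasm
# call such as
#
#	&add	(@x[0],@x[4]);		# @x[0] eq "%eax", @x[4] eq "%r8d"
#
# appends AT&T-syntax assembly to $code with the operand order flipped
# (destination comes first in the perlasm call, last in the output):
#
#	add	%r8d,%eax
#
# and a purely numeric last argument, e.g. &rol(@x[12],16), is turned
# into an immediate, i.e. "rol $16,%r12d".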
146
147@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
148    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
149@t=("%esi","%edi");
150
151sub ROUND {			# critical path is 24 cycles per round
152my ($a0,$b0,$c0,$d0)=@_;
153my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
154my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
155my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
156my ($xc,$xc_)=map("\"$_\"",@t);
157my @x=map("\"$_\"",@x);
158
159	# Consider order in which variables are addressed by their
160	# index:
161	#
162	#	a   b   c   d
163	#
164	#	0   4   8  12 < even round
165	#	1   5   9  13
166	#	2   6  10  14
167	#	3   7  11  15
168	#	0   5  10  15 < odd round
169	#	1   6  11  12
170	#	2   7   8  13
171	#	3   4   9  14
172	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that each pair of
	# 'c's is invariant between rounds. This means that they have
	# to be reloaded only once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.

	# Normally instructions would be interleaved to favour in-order
	# execution. Out-of-order cores generally manage it gracefully,
	# but not this time, for some reason. As in-order cores are a
	# dying breed and old Atom is the only one still around, the
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing the 1xSSSE3 code anyway...
187
188	(
189	"&add	(@x[$a0],@x[$b0])",	# Q1
190	"&xor	(@x[$d0],@x[$a0])",
191	"&rol	(@x[$d0],16)",
192	 "&add	(@x[$a1],@x[$b1])",	# Q2
193	 "&xor	(@x[$d1],@x[$a1])",
194	 "&rol	(@x[$d1],16)",
195
196	"&add	($xc,@x[$d0])",
197	"&xor	(@x[$b0],$xc)",
198	"&rol	(@x[$b0],12)",
199	 "&add	($xc_,@x[$d1])",
200	 "&xor	(@x[$b1],$xc_)",
201	 "&rol	(@x[$b1],12)",
202
203	"&add	(@x[$a0],@x[$b0])",
204	"&xor	(@x[$d0],@x[$a0])",
205	"&rol	(@x[$d0],8)",
206	 "&add	(@x[$a1],@x[$b1])",
207	 "&xor	(@x[$d1],@x[$a1])",
208	 "&rol	(@x[$d1],8)",
209
210	"&add	($xc,@x[$d0])",
211	"&xor	(@x[$b0],$xc)",
212	"&rol	(@x[$b0],7)",
213	 "&add	($xc_,@x[$d1])",
214	 "&xor	(@x[$b1],$xc_)",
215	 "&rol	(@x[$b1],7)",
216
217	"&mov	(\"4*$c0(%rsp)\",$xc)",	# reload pair of 'c's
218	 "&mov	(\"4*$c1(%rsp)\",$xc_)",
219	"&mov	($xc,\"4*$c2(%rsp)\")",
220	 "&mov	($xc_,\"4*$c3(%rsp)\")",
221
222	"&add	(@x[$a2],@x[$b2])",	# Q3
223	"&xor	(@x[$d2],@x[$a2])",
224	"&rol	(@x[$d2],16)",
225	 "&add	(@x[$a3],@x[$b3])",	# Q4
226	 "&xor	(@x[$d3],@x[$a3])",
227	 "&rol	(@x[$d3],16)",
228
229	"&add	($xc,@x[$d2])",
230	"&xor	(@x[$b2],$xc)",
231	"&rol	(@x[$b2],12)",
232	 "&add	($xc_,@x[$d3])",
233	 "&xor	(@x[$b3],$xc_)",
234	 "&rol	(@x[$b3],12)",
235
236	"&add	(@x[$a2],@x[$b2])",
237	"&xor	(@x[$d2],@x[$a2])",
238	"&rol	(@x[$d2],8)",
239	 "&add	(@x[$a3],@x[$b3])",
240	 "&xor	(@x[$d3],@x[$a3])",
241	 "&rol	(@x[$d3],8)",
242
243	"&add	($xc,@x[$d2])",
244	"&xor	(@x[$b2],$xc)",
245	"&rol	(@x[$b2],7)",
246	 "&add	($xc_,@x[$d3])",
247	 "&xor	(@x[$b3],$xc_)",
248	 "&rol	(@x[$b3],7)"
249	);
250}
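
# For reference, the interleaved instruction stream above amounts to
# four independent ChaCha20 quarter-rounds. A plain-Perl sketch of one
# quarter-round on 32-bit words (illustrative only, nothing below uses
# it):
#
#	sub quarter_round {			# arguments by reference
#	my ($a,$b,$c,$d)=@_;
#	my $rol=sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n)))&0xffffffff };
#	    $$a=($$a+$$b)&0xffffffff; $$d^=$$a; $$d=&$rol($$d,16);
#	    $$c=($$c+$$d)&0xffffffff; $$b^=$$c; $$b=&$rol($$b,12);
#	    $$a=($$a+$$b)&0xffffffff; $$d^=$$a; $$d=&$rol($$d, 8);
#	    $$c=($$c+$$d)&0xffffffff; $$b^=$$c; $$b=&$rol($$b, 7);
#	}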
251
252########################################################################
253# Generic code path that handles all lengths on pre-SSSE3 processors.
254$code.=<<___;
255.globl	ChaCha20_ctr32
256.type	ChaCha20_ctr32,\@function,5
257.align	64
258ChaCha20_ctr32:
259.cfi_startproc
260	cmp	\$0,$len
261	je	.Lno_data
262	mov	OPENSSL_ia32cap_P+4(%rip),%r10
263___
264$code.=<<___	if ($avx>2);
265	bt	\$48,%r10		# check for AVX512F
266	jc	.LChaCha20_avx512
267	test	%r10,%r10		# check for AVX512VL
268	js	.LChaCha20_avx512vl
269___
270$code.=<<___;
271	test	\$`1<<(41-32)`,%r10d
272	jnz	.LChaCha20_ssse3
273
274	push	%rbx
275.cfi_push	%rbx
276	push	%rbp
277.cfi_push	%rbp
278	push	%r12
279.cfi_push	%r12
280	push	%r13
281.cfi_push	%r13
282	push	%r14
283.cfi_push	%r14
284	push	%r15
285.cfi_push	%r15
286	sub	\$64+24,%rsp
287.cfi_adjust_cfa_offset	64+24
288.Lctr32_body:
289
290	#movdqa	.Lsigma(%rip),%xmm0
291	movdqu	($key),%xmm1
292	movdqu	16($key),%xmm2
293	movdqu	($counter),%xmm3
294	movdqa	.Lone(%rip),%xmm4
295
296	#movdqa	%xmm0,4*0(%rsp)		# key[0]
297	movdqa	%xmm1,4*4(%rsp)		# key[1]
298	movdqa	%xmm2,4*8(%rsp)		# key[2]
299	movdqa	%xmm3,4*12(%rsp)	# key[3]
300	mov	$len,%rbp		# reassign $len
301	jmp	.Loop_outer
302
303.align	32
304.Loop_outer:
305	mov	\$0x61707865,@x[0]      # 'expa'
306	mov	\$0x3320646e,@x[1]      # 'nd 3'
307	mov	\$0x79622d32,@x[2]      # '2-by'
308	mov	\$0x6b206574,@x[3]      # 'te k'
309	mov	4*4(%rsp),@x[4]
310	mov	4*5(%rsp),@x[5]
311	mov	4*6(%rsp),@x[6]
312	mov	4*7(%rsp),@x[7]
313	movd	%xmm3,@x[12]
314	mov	4*13(%rsp),@x[13]
315	mov	4*14(%rsp),@x[14]
316	mov	4*15(%rsp),@x[15]
317
318	mov	%rbp,64+0(%rsp)		# save len
319	mov	\$10,%ebp
320	mov	$inp,64+8(%rsp)		# save inp
321	movq	%xmm2,%rsi		# "@x[8]"
322	mov	$out,64+16(%rsp)	# save out
323	mov	%rsi,%rdi
324	shr	\$32,%rdi		# "@x[9]"
325	jmp	.Loop
326
327.align	32
328.Loop:
329___
330	foreach (&ROUND (0, 4, 8,12)) { eval; }
331	foreach (&ROUND	(0, 5,10,15)) { eval; }
332	&dec	("%ebp");
333	&jnz	(".Loop");
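
# The loop above thus executes ten "double rounds", i.e. 20 ChaCha20
# rounds: &ROUND(0,4,8,12) covers the four columns and &ROUND(0,5,10,15)
# the four diagonals of the 4x4 state. The other three index quadruples
# are derived inside ROUND by the ($_&~3)+(($_+1)&3) map, e.g.
#
#	map { ($_&~3)+(($_+1)&3) } (0,4,8,12);	# yields (1,5,9,13)
#
# and applying it twice more gives (2,6,10,14) and (3,7,11,15).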
334
335$code.=<<___;
336	mov	@t[1],4*9(%rsp)		# modulo-scheduled
337	mov	@t[0],4*8(%rsp)
338	mov	64(%rsp),%rbp		# load len
339	movdqa	%xmm2,%xmm1
340	mov	64+8(%rsp),$inp		# load inp
341	paddd	%xmm4,%xmm3		# increment counter
342	mov	64+16(%rsp),$out	# load out
343
344	add	\$0x61707865,@x[0]      # 'expa'
345	add	\$0x3320646e,@x[1]      # 'nd 3'
346	add	\$0x79622d32,@x[2]      # '2-by'
347	add	\$0x6b206574,@x[3]      # 'te k'
348	add	4*4(%rsp),@x[4]
349	add	4*5(%rsp),@x[5]
350	add	4*6(%rsp),@x[6]
351	add	4*7(%rsp),@x[7]
352	add	4*12(%rsp),@x[12]
353	add	4*13(%rsp),@x[13]
354	add	4*14(%rsp),@x[14]
355	add	4*15(%rsp),@x[15]
356	paddd	4*8(%rsp),%xmm1
357
358	cmp	\$64,%rbp
359	jb	.Ltail
360
361	xor	4*0($inp),@x[0]		# xor with input
362	xor	4*1($inp),@x[1]
363	xor	4*2($inp),@x[2]
364	xor	4*3($inp),@x[3]
365	xor	4*4($inp),@x[4]
366	xor	4*5($inp),@x[5]
367	xor	4*6($inp),@x[6]
368	xor	4*7($inp),@x[7]
369	movdqu	4*8($inp),%xmm0
370	xor	4*12($inp),@x[12]
371	xor	4*13($inp),@x[13]
372	xor	4*14($inp),@x[14]
373	xor	4*15($inp),@x[15]
374	lea	4*16($inp),$inp		# inp+=64
375	pxor	%xmm1,%xmm0
376
377	movdqa	%xmm2,4*8(%rsp)
378	movd	%xmm3,4*12(%rsp)
379
380	mov	@x[0],4*0($out)		# write output
381	mov	@x[1],4*1($out)
382	mov	@x[2],4*2($out)
383	mov	@x[3],4*3($out)
384	mov	@x[4],4*4($out)
385	mov	@x[5],4*5($out)
386	mov	@x[6],4*6($out)
387	mov	@x[7],4*7($out)
388	movdqu	%xmm0,4*8($out)
389	mov	@x[12],4*12($out)
390	mov	@x[13],4*13($out)
391	mov	@x[14],4*14($out)
392	mov	@x[15],4*15($out)
393	lea	4*16($out),$out		# out+=64
394
395	sub	\$64,%rbp
396	jnz	.Loop_outer
397
398	jmp	.Ldone
399
400.align	16
401.Ltail:
402	mov	@x[0],4*0(%rsp)
403	mov	@x[1],4*1(%rsp)
404	xor	%rbx,%rbx
405	mov	@x[2],4*2(%rsp)
406	mov	@x[3],4*3(%rsp)
407	mov	@x[4],4*4(%rsp)
408	mov	@x[5],4*5(%rsp)
409	mov	@x[6],4*6(%rsp)
410	mov	@x[7],4*7(%rsp)
411	movdqa	%xmm1,4*8(%rsp)
412	mov	@x[12],4*12(%rsp)
413	mov	@x[13],4*13(%rsp)
414	mov	@x[14],4*14(%rsp)
415	mov	@x[15],4*15(%rsp)
416
417.Loop_tail:
418	movzb	($inp,%rbx),%eax
419	movzb	(%rsp,%rbx),%edx
420	lea	1(%rbx),%rbx
421	xor	%edx,%eax
422	mov	%al,-1($out,%rbx)
423	dec	%rbp
424	jnz	.Loop_tail
425
426.Ldone:
427	lea	64+24+48(%rsp),%rsi
428.cfi_def_cfa	%rsi,8
429	mov	-48(%rsi),%r15
430.cfi_restore	%r15
431	mov	-40(%rsi),%r14
432.cfi_restore	%r14
433	mov	-32(%rsi),%r13
434.cfi_restore	%r13
435	mov	-24(%rsi),%r12
436.cfi_restore	%r12
437	mov	-16(%rsi),%rbp
438.cfi_restore	%rbp
439	mov	-8(%rsi),%rbx
440.cfi_restore	%rbx
441	lea	(%rsi),%rsp
442.cfi_def_cfa_register	%rsp
443.Lno_data:
444	ret
445.cfi_endproc
446.size	ChaCha20_ctr32,.-ChaCha20_ctr32
447___
448
449########################################################################
450# SSSE3 code path that handles shorter lengths
451{
452my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
453
454sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
455	&paddd	($a,$b);
456	&pxor	($d,$a);
457	&pshufb	($d,$rot16);
458
459	&paddd	($c,$d);
460	&pxor	($b,$c);
461	&movdqa	($t,$b);
462	&psrld	($b,20);
463	&pslld	($t,12);
464	&por	($b,$t);
465
466	&paddd	($a,$b);
467	&pxor	($d,$a);
468	&pshufb	($d,$rot24);
469
470	&paddd	($c,$d);
471	&pxor	($b,$c);
472	&movdqa	($t,$b);
473	&psrld	($b,25);
474	&pslld	($t,7);
475	&por	($b,$t);
476}
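
# In the SIMD round above the rotates by 16 and by 8 bits are whole-byte
# moves, so a single pshufb against .Lrot16 or .Lrot24 does the job,
# while the 12- and 7-bit rotates fall back to the shift/shift/or idiom.
# For instance .Lrot16,
#
#	.byte	0x2,0x3,0x0,0x1, ...	# repeated for each 32-bit lane
#
# tells pshufb to take destination bytes (0,1,2,3) of every lane from
# source bytes (2,3,0,1), which is exactly a 16-bit rotate.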
477
478my $xframe = $win64 ? 160+8 : 8;
479
480$code.=<<___;
481.type	ChaCha20_ssse3,\@function,5
482.align	32
483ChaCha20_ssse3:
484.cfi_startproc
485.LChaCha20_ssse3:
486	mov	%rsp,%r9		# frame pointer
487.cfi_def_cfa_register	%r9
488___
489$code.=<<___	if ($avx);
490	test	\$`1<<(43-32)`,%r10d
491	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
492___
493$code.=<<___;
494	cmp	\$128,$len		# we might throw away some data,
495	je	.LChaCha20_128
496	ja	.LChaCha20_4x		# but overall it won't be slower
497
498.Ldo_sse3_after_all:
499	sub	\$64+$xframe,%rsp
500___
501$code.=<<___	if ($win64);
502	movaps	%xmm6,-0x28(%r9)
503	movaps	%xmm7,-0x18(%r9)
504.Lssse3_body:
505___
506$code.=<<___;
507	movdqa	.Lsigma(%rip),$a
508	movdqu	($key),$b
509	movdqu	16($key),$c
510	movdqu	($counter),$d
511	movdqa	.Lrot16(%rip),$rot16
512	movdqa	.Lrot24(%rip),$rot24
513
514	movdqa	$a,0x00(%rsp)
515	movdqa	$b,0x10(%rsp)
516	movdqa	$c,0x20(%rsp)
517	movdqa	$d,0x30(%rsp)
518	mov	\$10,$counter		# reuse $counter
519	jmp	.Loop_ssse3
520
521.align	32
522.Loop_outer_ssse3:
523	movdqa	.Lone(%rip),$d
524	movdqa	0x00(%rsp),$a
525	movdqa	0x10(%rsp),$b
526	movdqa	0x20(%rsp),$c
527	paddd	0x30(%rsp),$d
528	mov	\$10,$counter
529	movdqa	$d,0x30(%rsp)
530	jmp	.Loop_ssse3
531
532.align	32
533.Loop_ssse3:
534___
535	&SSSE3ROUND();
536	&pshufd	($c,$c,0b01001110);
537	&pshufd	($b,$b,0b00111001);
538	&pshufd	($d,$d,0b10010011);
539	&nop	();
540
541	&SSSE3ROUND();
542	&pshufd	($c,$c,0b01001110);
543	&pshufd	($b,$b,0b10010011);
544	&pshufd	($d,$d,0b00111001);
545
546	&dec	($counter);
547	&jnz	(".Loop_ssse3");
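
# The pshufd triplets above rotate the $b, $c and $d rows within their
# registers so that the very same column quarter-round also serves as
# the diagonal round: 0b00111001 picks source lanes (1,2,3,0), i.e.
# rotates the dwords by one position, 0b01001110 by two and 0b10010011
# by three. The second triplet applies the inverse permutation to
# restore column order before the next iteration.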
548
549$code.=<<___;
550	paddd	0x00(%rsp),$a
551	paddd	0x10(%rsp),$b
552	paddd	0x20(%rsp),$c
553	paddd	0x30(%rsp),$d
554
555	cmp	\$64,$len
556	jb	.Ltail_ssse3
557
558	movdqu	0x00($inp),$t
559	movdqu	0x10($inp),$t1
560	pxor	$t,$a			# xor with input
561	movdqu	0x20($inp),$t
562	pxor	$t1,$b
563	movdqu	0x30($inp),$t1
564	lea	0x40($inp),$inp		# inp+=64
565	pxor	$t,$c
566	pxor	$t1,$d
567
568	movdqu	$a,0x00($out)		# write output
569	movdqu	$b,0x10($out)
570	movdqu	$c,0x20($out)
571	movdqu	$d,0x30($out)
572	lea	0x40($out),$out		# out+=64
573
574	sub	\$64,$len
575	jnz	.Loop_outer_ssse3
576
577	jmp	.Ldone_ssse3
578
579.align	16
580.Ltail_ssse3:
581	movdqa	$a,0x00(%rsp)
582	movdqa	$b,0x10(%rsp)
583	movdqa	$c,0x20(%rsp)
584	movdqa	$d,0x30(%rsp)
585	xor	$counter,$counter
586
587.Loop_tail_ssse3:
588	movzb	($inp,$counter),%eax
589	movzb	(%rsp,$counter),%ecx
590	lea	1($counter),$counter
591	xor	%ecx,%eax
592	mov	%al,-1($out,$counter)
593	dec	$len
594	jnz	.Loop_tail_ssse3
595
596.Ldone_ssse3:
597___
598$code.=<<___	if ($win64);
599	movaps	-0x28(%r9),%xmm6
600	movaps	-0x18(%r9),%xmm7
601___
602$code.=<<___;
603	lea	(%r9),%rsp
604.cfi_def_cfa_register	%rsp
605.Lssse3_epilogue:
606	ret
607.cfi_endproc
608.size	ChaCha20_ssse3,.-ChaCha20_ssse3
609___
610}
611
612########################################################################
613# SSSE3 code path that handles 128-byte inputs
614{
615my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
616my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
617
618sub SSSE3ROUND_2x {
619	&paddd	($a,$b);
620	&pxor	($d,$a);
621	 &paddd	($a1,$b1);
622	 &pxor	($d1,$a1);
623	&pshufb	($d,$rot16);
624	 &pshufb($d1,$rot16);
625
626	&paddd	($c,$d);
627	 &paddd	($c1,$d1);
628	&pxor	($b,$c);
629	 &pxor	($b1,$c1);
630	&movdqa	($t,$b);
631	&psrld	($b,20);
632	 &movdqa($t1,$b1);
633	&pslld	($t,12);
634	 &psrld	($b1,20);
635	&por	($b,$t);
636	 &pslld	($t1,12);
637	 &por	($b1,$t1);
638
639	&paddd	($a,$b);
640	&pxor	($d,$a);
641	 &paddd	($a1,$b1);
642	 &pxor	($d1,$a1);
643	&pshufb	($d,$rot24);
644	 &pshufb($d1,$rot24);
645
646	&paddd	($c,$d);
647	 &paddd	($c1,$d1);
648	&pxor	($b,$c);
649	 &pxor	($b1,$c1);
650	&movdqa	($t,$b);
651	&psrld	($b,25);
652	 &movdqa($t1,$b1);
653	&pslld	($t,7);
654	 &psrld	($b1,25);
655	&por	($b,$t);
656	 &pslld	($t1,7);
657	 &por	($b1,$t1);
658}
659
660my $xframe = $win64 ? 0x68 : 8;
661
662$code.=<<___;
663.type	ChaCha20_128,\@function,5
664.align	32
665ChaCha20_128:
666.cfi_startproc
667.LChaCha20_128:
668	mov	%rsp,%r9		# frame pointer
669.cfi_def_cfa_register	%r9
670	sub	\$64+$xframe,%rsp
671___
672$code.=<<___	if ($win64);
673	movaps	%xmm6,-0x68(%r9)
674	movaps	%xmm7,-0x58(%r9)
675	movaps	%xmm8,-0x48(%r9)
676	movaps	%xmm9,-0x38(%r9)
677	movaps	%xmm10,-0x28(%r9)
678	movaps	%xmm11,-0x18(%r9)
679.L128_body:
680___
681$code.=<<___;
682	movdqa	.Lsigma(%rip),$a
683	movdqu	($key),$b
684	movdqu	16($key),$c
685	movdqu	($counter),$d
686	movdqa	.Lone(%rip),$d1
687	movdqa	.Lrot16(%rip),$rot16
688	movdqa	.Lrot24(%rip),$rot24
689
690	movdqa	$a,$a1
691	movdqa	$a,0x00(%rsp)
692	movdqa	$b,$b1
693	movdqa	$b,0x10(%rsp)
694	movdqa	$c,$c1
695	movdqa	$c,0x20(%rsp)
696	paddd	$d,$d1
697	movdqa	$d,0x30(%rsp)
698	mov	\$10,$counter		# reuse $counter
699	jmp	.Loop_128
700
701.align	32
702.Loop_128:
703___
704	&SSSE3ROUND_2x();
705	&pshufd	($c,$c,0b01001110);
706	&pshufd	($b,$b,0b00111001);
707	&pshufd	($d,$d,0b10010011);
708	&pshufd	($c1,$c1,0b01001110);
709	&pshufd	($b1,$b1,0b00111001);
710	&pshufd	($d1,$d1,0b10010011);
711
712	&SSSE3ROUND_2x();
713	&pshufd	($c,$c,0b01001110);
714	&pshufd	($b,$b,0b10010011);
715	&pshufd	($d,$d,0b00111001);
716	&pshufd	($c1,$c1,0b01001110);
717	&pshufd	($b1,$b1,0b10010011);
718	&pshufd	($d1,$d1,0b00111001);
719
720	&dec	($counter);
721	&jnz	(".Loop_128");
722
723$code.=<<___;
724	paddd	0x00(%rsp),$a
725	paddd	0x10(%rsp),$b
726	paddd	0x20(%rsp),$c
727	paddd	0x30(%rsp),$d
728	paddd	.Lone(%rip),$d1
729	paddd	0x00(%rsp),$a1
730	paddd	0x10(%rsp),$b1
731	paddd	0x20(%rsp),$c1
732	paddd	0x30(%rsp),$d1
733
734	movdqu	0x00($inp),$t
735	movdqu	0x10($inp),$t1
736	pxor	$t,$a			# xor with input
737	movdqu	0x20($inp),$t
738	pxor	$t1,$b
739	movdqu	0x30($inp),$t1
740	pxor	$t,$c
741	movdqu	0x40($inp),$t
742	pxor	$t1,$d
743	movdqu	0x50($inp),$t1
744	pxor	$t,$a1
745	movdqu	0x60($inp),$t
746	pxor	$t1,$b1
747	movdqu	0x70($inp),$t1
748	pxor	$t,$c1
749	pxor	$t1,$d1
750
751	movdqu	$a,0x00($out)		# write output
752	movdqu	$b,0x10($out)
753	movdqu	$c,0x20($out)
754	movdqu	$d,0x30($out)
755	movdqu	$a1,0x40($out)
756	movdqu	$b1,0x50($out)
757	movdqu	$c1,0x60($out)
758	movdqu	$d1,0x70($out)
759___
760$code.=<<___	if ($win64);
761	movaps	-0x68(%r9),%xmm6
762	movaps	-0x58(%r9),%xmm7
763	movaps	-0x48(%r9),%xmm8
764	movaps	-0x38(%r9),%xmm9
765	movaps	-0x28(%r9),%xmm10
766	movaps	-0x18(%r9),%xmm11
767___
768$code.=<<___;
769	lea	(%r9),%rsp
770.cfi_def_cfa_register	%rsp
771.L128_epilogue:
772	ret
773.cfi_endproc
774.size	ChaCha20_128,.-ChaCha20_128
775___
776}
777
778########################################################################
779# SSSE3 code path that handles longer messages.
780{
781# assign variables to favor Atom front-end
782my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
783    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
784my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
785	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
786
787sub SSSE3_lane_ROUND {
788my ($a0,$b0,$c0,$d0)=@_;
789my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
790my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
791my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
792my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
793my @x=map("\"$_\"",@xx);
794
795	# Consider order in which variables are addressed by their
796	# index:
797	#
798	#	a   b   c   d
799	#
800	#	0   4   8  12 < even round
801	#	1   5   9  13
802	#	2   6  10  14
803	#	3   7  11  15
804	#	0   5  10  15 < odd round
805	#	1   6  11  12
806	#	2   7   8  13
807	#	3   4   9  14
808	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that each pair of
	# 'c's is invariant between rounds. This means that they have
	# to be reloaded only once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
816
817	(
818	"&paddd		(@x[$a0],@x[$b0])",	# Q1
819	 "&paddd	(@x[$a1],@x[$b1])",	# Q2
820	"&pxor		(@x[$d0],@x[$a0])",
821	 "&pxor		(@x[$d1],@x[$a1])",
822	"&pshufb	(@x[$d0],$t1)",
823	 "&pshufb	(@x[$d1],$t1)",
824
825	"&paddd		($xc,@x[$d0])",
826	 "&paddd	($xc_,@x[$d1])",
827	"&pxor		(@x[$b0],$xc)",
828	 "&pxor		(@x[$b1],$xc_)",
829	"&movdqa	($t0,@x[$b0])",
830	"&pslld		(@x[$b0],12)",
831	"&psrld		($t0,20)",
832	 "&movdqa	($t1,@x[$b1])",
833	 "&pslld	(@x[$b1],12)",
834	"&por		(@x[$b0],$t0)",
835	 "&psrld	($t1,20)",
836	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
837	 "&por		(@x[$b1],$t1)",
838
839	"&paddd		(@x[$a0],@x[$b0])",
840	 "&paddd	(@x[$a1],@x[$b1])",
841	"&pxor		(@x[$d0],@x[$a0])",
842	 "&pxor		(@x[$d1],@x[$a1])",
843	"&pshufb	(@x[$d0],$t0)",
844	 "&pshufb	(@x[$d1],$t0)",
845
846	"&paddd		($xc,@x[$d0])",
847	 "&paddd	($xc_,@x[$d1])",
848	"&pxor		(@x[$b0],$xc)",
849	 "&pxor		(@x[$b1],$xc_)",
850	"&movdqa	($t1,@x[$b0])",
851	"&pslld		(@x[$b0],7)",
852	"&psrld		($t1,25)",
853	 "&movdqa	($t0,@x[$b1])",
854	 "&pslld	(@x[$b1],7)",
855	"&por		(@x[$b0],$t1)",
856	 "&psrld	($t0,25)",
857	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
858	 "&por		(@x[$b1],$t0)",
859
860	"&movdqa	(\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
861	 "&movdqa	(\"`16*($c1-8)`(%rsp)\",$xc_)",
862	"&movdqa	($xc,\"`16*($c2-8)`(%rsp)\")",
863	 "&movdqa	($xc_,\"`16*($c3-8)`(%rsp)\")",
864
865	"&paddd		(@x[$a2],@x[$b2])",	# Q3
866	 "&paddd	(@x[$a3],@x[$b3])",	# Q4
867	"&pxor		(@x[$d2],@x[$a2])",
868	 "&pxor		(@x[$d3],@x[$a3])",
869	"&pshufb	(@x[$d2],$t1)",
870	 "&pshufb	(@x[$d3],$t1)",
871
872	"&paddd		($xc,@x[$d2])",
873	 "&paddd	($xc_,@x[$d3])",
874	"&pxor		(@x[$b2],$xc)",
875	 "&pxor		(@x[$b3],$xc_)",
876	"&movdqa	($t0,@x[$b2])",
877	"&pslld		(@x[$b2],12)",
878	"&psrld		($t0,20)",
879	 "&movdqa	($t1,@x[$b3])",
880	 "&pslld	(@x[$b3],12)",
881	"&por		(@x[$b2],$t0)",
882	 "&psrld	($t1,20)",
883	"&movdqa	($t0,'(%r11)')",	# .Lrot24(%rip)
884	 "&por		(@x[$b3],$t1)",
885
886	"&paddd		(@x[$a2],@x[$b2])",
887	 "&paddd	(@x[$a3],@x[$b3])",
888	"&pxor		(@x[$d2],@x[$a2])",
889	 "&pxor		(@x[$d3],@x[$a3])",
890	"&pshufb	(@x[$d2],$t0)",
891	 "&pshufb	(@x[$d3],$t0)",
892
893	"&paddd		($xc,@x[$d2])",
894	 "&paddd	($xc_,@x[$d3])",
895	"&pxor		(@x[$b2],$xc)",
896	 "&pxor		(@x[$b3],$xc_)",
897	"&movdqa	($t1,@x[$b2])",
898	"&pslld		(@x[$b2],7)",
899	"&psrld		($t1,25)",
900	 "&movdqa	($t0,@x[$b3])",
901	 "&pslld	(@x[$b3],7)",
902	"&por		(@x[$b2],$t1)",
903	 "&psrld	($t0,25)",
904	"&movdqa	($t1,'(%r10)')",	# .Lrot16(%rip)
905	 "&por		(@x[$b3],$t0)"
906	);
907}
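
# Note that, unlike the 1x path, this 4x lane round has no XMM registers
# to spare for the pshufb masks, so $t0/$t1 double as rotate masks and
# are reloaded from (%r10)/(%r11) on the fly; ChaCha20_4x below points
# %r10 at .Lrot16 and %r11 at .Lrot24 before entering the loop.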
908
909my $xframe = $win64 ? 0xa8 : 8;
910
911$code.=<<___;
912.type	ChaCha20_4x,\@function,5
913.align	32
914ChaCha20_4x:
915.cfi_startproc
916.LChaCha20_4x:
917	mov		%rsp,%r9		# frame pointer
918.cfi_def_cfa_register	%r9
919	mov		%r10,%r11
920___
921$code.=<<___	if ($avx>1);
922	shr		\$32,%r10		# OPENSSL_ia32cap_P+8
923	test		\$`1<<5`,%r10		# test AVX2
924	jnz		.LChaCha20_8x
925___
926$code.=<<___;
927	cmp		\$192,$len
928	ja		.Lproceed4x
929
930	and		\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
931	cmp		\$`1<<22`,%r11		# check for MOVBE without XSAVE
932	je		.Ldo_sse3_after_all	# to detect Atom
933
934.Lproceed4x:
935	sub		\$0x140+$xframe,%rsp
936___
937	################ stack layout
938	# +0x00		SIMD equivalent of @x[8-12]
939	# ...
940	# +0x40		constant copy of key[0-2] smashed by lanes
941	# ...
942	# +0x100	SIMD counters (with nonce smashed by lanes)
943	# ...
944	# +0x140
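	#
	# In this 4x path every one of the 16 state words occupies its
	# own XMM register or stack slot, one 32-bit lane per block.
	# "Smashing by lanes" is therefore just a broadcast, conceptually
	#
	#	$xb0 = (key[0],key[0],key[0],key[0])	# pshufd $0x00
	#	$xb1 = (key[1],key[1],key[1],key[1])	# pshufd $0x55
	#	...
	#
	# while the counter row additionally gets .Linc = (0,1,2,3)
	# added, so the four lanes produce four consecutive 64-byte
	# blocks.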
945$code.=<<___	if ($win64);
946	movaps		%xmm6,-0xa8(%r9)
947	movaps		%xmm7,-0x98(%r9)
948	movaps		%xmm8,-0x88(%r9)
949	movaps		%xmm9,-0x78(%r9)
950	movaps		%xmm10,-0x68(%r9)
951	movaps		%xmm11,-0x58(%r9)
952	movaps		%xmm12,-0x48(%r9)
953	movaps		%xmm13,-0x38(%r9)
954	movaps		%xmm14,-0x28(%r9)
955	movaps		%xmm15,-0x18(%r9)
956.L4x_body:
957___
958$code.=<<___;
959	movdqa		.Lsigma(%rip),$xa3	# key[0]
960	movdqu		($key),$xb3		# key[1]
961	movdqu		16($key),$xt3		# key[2]
962	movdqu		($counter),$xd3		# key[3]
963	lea		0x100(%rsp),%rcx	# size optimization
964	lea		.Lrot16(%rip),%r10
965	lea		.Lrot24(%rip),%r11
966
967	pshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
968	pshufd		\$0x55,$xa3,$xa1
969	movdqa		$xa0,0x40(%rsp)		# ... and offload
970	pshufd		\$0xaa,$xa3,$xa2
971	movdqa		$xa1,0x50(%rsp)
972	pshufd		\$0xff,$xa3,$xa3
973	movdqa		$xa2,0x60(%rsp)
974	movdqa		$xa3,0x70(%rsp)
975
976	pshufd		\$0x00,$xb3,$xb0
977	pshufd		\$0x55,$xb3,$xb1
978	movdqa		$xb0,0x80-0x100(%rcx)
979	pshufd		\$0xaa,$xb3,$xb2
980	movdqa		$xb1,0x90-0x100(%rcx)
981	pshufd		\$0xff,$xb3,$xb3
982	movdqa		$xb2,0xa0-0x100(%rcx)
983	movdqa		$xb3,0xb0-0x100(%rcx)
984
985	pshufd		\$0x00,$xt3,$xt0	# "$xc0"
986	pshufd		\$0x55,$xt3,$xt1	# "$xc1"
987	movdqa		$xt0,0xc0-0x100(%rcx)
988	pshufd		\$0xaa,$xt3,$xt2	# "$xc2"
989	movdqa		$xt1,0xd0-0x100(%rcx)
990	pshufd		\$0xff,$xt3,$xt3	# "$xc3"
991	movdqa		$xt2,0xe0-0x100(%rcx)
992	movdqa		$xt3,0xf0-0x100(%rcx)
993
994	pshufd		\$0x00,$xd3,$xd0
995	pshufd		\$0x55,$xd3,$xd1
996	paddd		.Linc(%rip),$xd0	# don't save counters yet
997	pshufd		\$0xaa,$xd3,$xd2
998	movdqa		$xd1,0x110-0x100(%rcx)
999	pshufd		\$0xff,$xd3,$xd3
1000	movdqa		$xd2,0x120-0x100(%rcx)
1001	movdqa		$xd3,0x130-0x100(%rcx)
1002
1003	jmp		.Loop_enter4x
1004
1005.align	32
1006.Loop_outer4x:
1007	movdqa		0x40(%rsp),$xa0		# re-load smashed key
1008	movdqa		0x50(%rsp),$xa1
1009	movdqa		0x60(%rsp),$xa2
1010	movdqa		0x70(%rsp),$xa3
1011	movdqa		0x80-0x100(%rcx),$xb0
1012	movdqa		0x90-0x100(%rcx),$xb1
1013	movdqa		0xa0-0x100(%rcx),$xb2
1014	movdqa		0xb0-0x100(%rcx),$xb3
1015	movdqa		0xc0-0x100(%rcx),$xt0	# "$xc0"
1016	movdqa		0xd0-0x100(%rcx),$xt1	# "$xc1"
1017	movdqa		0xe0-0x100(%rcx),$xt2	# "$xc2"
1018	movdqa		0xf0-0x100(%rcx),$xt3	# "$xc3"
1019	movdqa		0x100-0x100(%rcx),$xd0
1020	movdqa		0x110-0x100(%rcx),$xd1
1021	movdqa		0x120-0x100(%rcx),$xd2
1022	movdqa		0x130-0x100(%rcx),$xd3
1023	paddd		.Lfour(%rip),$xd0	# next SIMD counters
1024
1025.Loop_enter4x:
1026	movdqa		$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
1027	movdqa		$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
1028	movdqa		(%r10),$xt3		# .Lrot16(%rip)
1029	mov		\$10,%eax
1030	movdqa		$xd0,0x100-0x100(%rcx)	# save SIMD counters
1031	jmp		.Loop4x
1032
1033.align	32
1034.Loop4x:
1035___
1036	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
1037	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
1038$code.=<<___;
1039	dec		%eax
1040	jnz		.Loop4x
1041
1042	paddd		0x40(%rsp),$xa0		# accumulate key material
1043	paddd		0x50(%rsp),$xa1
1044	paddd		0x60(%rsp),$xa2
1045	paddd		0x70(%rsp),$xa3
1046
1047	movdqa		$xa0,$xt2		# "de-interlace" data
1048	punpckldq	$xa1,$xa0
1049	movdqa		$xa2,$xt3
1050	punpckldq	$xa3,$xa2
1051	punpckhdq	$xa1,$xt2
1052	punpckhdq	$xa3,$xt3
1053	movdqa		$xa0,$xa1
1054	punpcklqdq	$xa2,$xa0		# "a0"
1055	movdqa		$xt2,$xa3
1056	punpcklqdq	$xt3,$xt2		# "a2"
1057	punpckhqdq	$xa2,$xa1		# "a1"
1058	punpckhqdq	$xt3,$xa3		# "a3"
1059___
1060	($xa2,$xt2)=($xt2,$xa2);
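
# The punpck{l,h}dq/punpck{l,h}qdq sequence above is a 4x4 dword
# transpose: on input each register holds one state word across the
# four lanes (blocks), on output each register holds four consecutive
# state words of a single block, ready to be xored with 16 bytes of
# input. Schematically, with capital letters denoting the four source
# registers and digits the lanes:
#
#	( A0 A1 A2 A3 )       ( A0 B0 C0 D0 )
#	( B0 B1 B2 B3 )  -->  ( A1 B1 C1 D1 )
#	( C0 C1 C2 C3 )       ( A2 B2 C2 D2 )
#	( D0 D1 D2 D3 )       ( A3 B3 C3 D3 )
#
# The Perl-level swap above merely renames the temporary that ended up
# holding "a2".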
1061$code.=<<___;
1062	paddd		0x80-0x100(%rcx),$xb0
1063	paddd		0x90-0x100(%rcx),$xb1
1064	paddd		0xa0-0x100(%rcx),$xb2
1065	paddd		0xb0-0x100(%rcx),$xb3
1066
1067	movdqa		$xa0,0x00(%rsp)		# offload $xaN
1068	movdqa		$xa1,0x10(%rsp)
1069	movdqa		0x20(%rsp),$xa0		# "xc2"
1070	movdqa		0x30(%rsp),$xa1		# "xc3"
1071
1072	movdqa		$xb0,$xt2
1073	punpckldq	$xb1,$xb0
1074	movdqa		$xb2,$xt3
1075	punpckldq	$xb3,$xb2
1076	punpckhdq	$xb1,$xt2
1077	punpckhdq	$xb3,$xt3
1078	movdqa		$xb0,$xb1
1079	punpcklqdq	$xb2,$xb0		# "b0"
1080	movdqa		$xt2,$xb3
1081	punpcklqdq	$xt3,$xt2		# "b2"
1082	punpckhqdq	$xb2,$xb1		# "b1"
1083	punpckhqdq	$xt3,$xb3		# "b3"
1084___
1085	($xb2,$xt2)=($xt2,$xb2);
1086	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1087$code.=<<___;
1088	paddd		0xc0-0x100(%rcx),$xc0
1089	paddd		0xd0-0x100(%rcx),$xc1
1090	paddd		0xe0-0x100(%rcx),$xc2
1091	paddd		0xf0-0x100(%rcx),$xc3
1092
1093	movdqa		$xa2,0x20(%rsp)		# keep offloading $xaN
1094	movdqa		$xa3,0x30(%rsp)
1095
1096	movdqa		$xc0,$xt2
1097	punpckldq	$xc1,$xc0
1098	movdqa		$xc2,$xt3
1099	punpckldq	$xc3,$xc2
1100	punpckhdq	$xc1,$xt2
1101	punpckhdq	$xc3,$xt3
1102	movdqa		$xc0,$xc1
1103	punpcklqdq	$xc2,$xc0		# "c0"
1104	movdqa		$xt2,$xc3
1105	punpcklqdq	$xt3,$xt2		# "c2"
1106	punpckhqdq	$xc2,$xc1		# "c1"
1107	punpckhqdq	$xt3,$xc3		# "c3"
1108___
1109	($xc2,$xt2)=($xt2,$xc2);
1110	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
1111$code.=<<___;
1112	paddd		0x100-0x100(%rcx),$xd0
1113	paddd		0x110-0x100(%rcx),$xd1
1114	paddd		0x120-0x100(%rcx),$xd2
1115	paddd		0x130-0x100(%rcx),$xd3
1116
1117	movdqa		$xd0,$xt2
1118	punpckldq	$xd1,$xd0
1119	movdqa		$xd2,$xt3
1120	punpckldq	$xd3,$xd2
1121	punpckhdq	$xd1,$xt2
1122	punpckhdq	$xd3,$xt3
1123	movdqa		$xd0,$xd1
1124	punpcklqdq	$xd2,$xd0		# "d0"
1125	movdqa		$xt2,$xd3
1126	punpcklqdq	$xt3,$xt2		# "d2"
1127	punpckhqdq	$xd2,$xd1		# "d1"
1128	punpckhqdq	$xt3,$xd3		# "d3"
1129___
1130	($xd2,$xt2)=($xt2,$xd2);
1131$code.=<<___;
1132	cmp		\$64*4,$len
1133	jb		.Ltail4x
1134
1135	movdqu		0x00($inp),$xt0		# xor with input
1136	movdqu		0x10($inp),$xt1
1137	movdqu		0x20($inp),$xt2
1138	movdqu		0x30($inp),$xt3
1139	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1140	pxor		$xb0,$xt1
1141	pxor		$xc0,$xt2
1142	pxor		$xd0,$xt3
1143
1144	 movdqu		$xt0,0x00($out)
1145	movdqu		0x40($inp),$xt0
1146	 movdqu		$xt1,0x10($out)
1147	movdqu		0x50($inp),$xt1
1148	 movdqu		$xt2,0x20($out)
1149	movdqu		0x60($inp),$xt2
1150	 movdqu		$xt3,0x30($out)
1151	movdqu		0x70($inp),$xt3
1152	lea		0x80($inp),$inp		# size optimization
1153	pxor		0x10(%rsp),$xt0
1154	pxor		$xb1,$xt1
1155	pxor		$xc1,$xt2
1156	pxor		$xd1,$xt3
1157
1158	 movdqu		$xt0,0x40($out)
1159	movdqu		0x00($inp),$xt0
1160	 movdqu		$xt1,0x50($out)
1161	movdqu		0x10($inp),$xt1
1162	 movdqu		$xt2,0x60($out)
1163	movdqu		0x20($inp),$xt2
1164	 movdqu		$xt3,0x70($out)
1165	 lea		0x80($out),$out		# size optimization
1166	movdqu		0x30($inp),$xt3
1167	pxor		0x20(%rsp),$xt0
1168	pxor		$xb2,$xt1
1169	pxor		$xc2,$xt2
1170	pxor		$xd2,$xt3
1171
1172	 movdqu		$xt0,0x00($out)
1173	movdqu		0x40($inp),$xt0
1174	 movdqu		$xt1,0x10($out)
1175	movdqu		0x50($inp),$xt1
1176	 movdqu		$xt2,0x20($out)
1177	movdqu		0x60($inp),$xt2
1178	 movdqu		$xt3,0x30($out)
1179	movdqu		0x70($inp),$xt3
1180	lea		0x80($inp),$inp		# inp+=64*4
1181	pxor		0x30(%rsp),$xt0
1182	pxor		$xb3,$xt1
1183	pxor		$xc3,$xt2
1184	pxor		$xd3,$xt3
1185	movdqu		$xt0,0x40($out)
1186	movdqu		$xt1,0x50($out)
1187	movdqu		$xt2,0x60($out)
1188	movdqu		$xt3,0x70($out)
1189	lea		0x80($out),$out		# out+=64*4
1190
1191	sub		\$64*4,$len
1192	jnz		.Loop_outer4x
1193
1194	jmp		.Ldone4x
1195
1196.Ltail4x:
1197	cmp		\$192,$len
1198	jae		.L192_or_more4x
1199	cmp		\$128,$len
1200	jae		.L128_or_more4x
1201	cmp		\$64,$len
1202	jae		.L64_or_more4x
1203
1204	#movdqa		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1205	xor		%r10,%r10
1206	#movdqa		$xt0,0x00(%rsp)
1207	movdqa		$xb0,0x10(%rsp)
1208	movdqa		$xc0,0x20(%rsp)
1209	movdqa		$xd0,0x30(%rsp)
1210	jmp		.Loop_tail4x
1211
1212.align	32
1213.L64_or_more4x:
1214	movdqu		0x00($inp),$xt0		# xor with input
1215	movdqu		0x10($inp),$xt1
1216	movdqu		0x20($inp),$xt2
1217	movdqu		0x30($inp),$xt3
	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1219	pxor		$xb0,$xt1
1220	pxor		$xc0,$xt2
1221	pxor		$xd0,$xt3
1222	movdqu		$xt0,0x00($out)
1223	movdqu		$xt1,0x10($out)
1224	movdqu		$xt2,0x20($out)
1225	movdqu		$xt3,0x30($out)
1226	je		.Ldone4x
1227
1228	movdqa		0x10(%rsp),$xt0		# $xaN is offloaded, remember?
1229	lea		0x40($inp),$inp		# inp+=64*1
1230	xor		%r10,%r10
1231	movdqa		$xt0,0x00(%rsp)
1232	movdqa		$xb1,0x10(%rsp)
1233	lea		0x40($out),$out		# out+=64*1
1234	movdqa		$xc1,0x20(%rsp)
1235	sub		\$64,$len		# len-=64*1
1236	movdqa		$xd1,0x30(%rsp)
1237	jmp		.Loop_tail4x
1238
1239.align	32
1240.L128_or_more4x:
1241	movdqu		0x00($inp),$xt0		# xor with input
1242	movdqu		0x10($inp),$xt1
1243	movdqu		0x20($inp),$xt2
1244	movdqu		0x30($inp),$xt3
1245	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1246	pxor		$xb0,$xt1
1247	pxor		$xc0,$xt2
1248	pxor		$xd0,$xt3
1249
1250	 movdqu		$xt0,0x00($out)
1251	movdqu		0x40($inp),$xt0
1252	 movdqu		$xt1,0x10($out)
1253	movdqu		0x50($inp),$xt1
1254	 movdqu		$xt2,0x20($out)
1255	movdqu		0x60($inp),$xt2
1256	 movdqu		$xt3,0x30($out)
1257	movdqu		0x70($inp),$xt3
1258	pxor		0x10(%rsp),$xt0
1259	pxor		$xb1,$xt1
1260	pxor		$xc1,$xt2
1261	pxor		$xd1,$xt3
1262	movdqu		$xt0,0x40($out)
1263	movdqu		$xt1,0x50($out)
1264	movdqu		$xt2,0x60($out)
1265	movdqu		$xt3,0x70($out)
1266	je		.Ldone4x
1267
1268	movdqa		0x20(%rsp),$xt0		# $xaN is offloaded, remember?
1269	lea		0x80($inp),$inp		# inp+=64*2
1270	xor		%r10,%r10
1271	movdqa		$xt0,0x00(%rsp)
1272	movdqa		$xb2,0x10(%rsp)
1273	lea		0x80($out),$out		# out+=64*2
1274	movdqa		$xc2,0x20(%rsp)
1275	sub		\$128,$len		# len-=64*2
1276	movdqa		$xd2,0x30(%rsp)
1277	jmp		.Loop_tail4x
1278
1279.align	32
1280.L192_or_more4x:
1281	movdqu		0x00($inp),$xt0		# xor with input
1282	movdqu		0x10($inp),$xt1
1283	movdqu		0x20($inp),$xt2
1284	movdqu		0x30($inp),$xt3
1285	pxor		0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1286	pxor		$xb0,$xt1
1287	pxor		$xc0,$xt2
1288	pxor		$xd0,$xt3
1289
1290	 movdqu		$xt0,0x00($out)
1291	movdqu		0x40($inp),$xt0
1292	 movdqu		$xt1,0x10($out)
1293	movdqu		0x50($inp),$xt1
1294	 movdqu		$xt2,0x20($out)
1295	movdqu		0x60($inp),$xt2
1296	 movdqu		$xt3,0x30($out)
1297	movdqu		0x70($inp),$xt3
1298	lea		0x80($inp),$inp		# size optimization
1299	pxor		0x10(%rsp),$xt0
1300	pxor		$xb1,$xt1
1301	pxor		$xc1,$xt2
1302	pxor		$xd1,$xt3
1303
1304	 movdqu		$xt0,0x40($out)
1305	movdqu		0x00($inp),$xt0
1306	 movdqu		$xt1,0x50($out)
1307	movdqu		0x10($inp),$xt1
1308	 movdqu		$xt2,0x60($out)
1309	movdqu		0x20($inp),$xt2
1310	 movdqu		$xt3,0x70($out)
1311	 lea		0x80($out),$out		# size optimization
1312	movdqu		0x30($inp),$xt3
1313	pxor		0x20(%rsp),$xt0
1314	pxor		$xb2,$xt1
1315	pxor		$xc2,$xt2
1316	pxor		$xd2,$xt3
1317	movdqu		$xt0,0x00($out)
1318	movdqu		$xt1,0x10($out)
1319	movdqu		$xt2,0x20($out)
1320	movdqu		$xt3,0x30($out)
1321	je		.Ldone4x
1322
1323	movdqa		0x30(%rsp),$xt0		# $xaN is offloaded, remember?
1324	lea		0x40($inp),$inp		# inp+=64*3
1325	xor		%r10,%r10
1326	movdqa		$xt0,0x00(%rsp)
1327	movdqa		$xb3,0x10(%rsp)
1328	lea		0x40($out),$out		# out+=64*3
1329	movdqa		$xc3,0x20(%rsp)
1330	sub		\$192,$len		# len-=64*3
1331	movdqa		$xd3,0x30(%rsp)
1332
1333.Loop_tail4x:
1334	movzb		($inp,%r10),%eax
1335	movzb		(%rsp,%r10),%ecx
1336	lea		1(%r10),%r10
1337	xor		%ecx,%eax
1338	mov		%al,-1($out,%r10)
1339	dec		$len
1340	jnz		.Loop_tail4x
1341
1342.Ldone4x:
1343___
1344$code.=<<___	if ($win64);
1345	movaps		-0xa8(%r9),%xmm6
1346	movaps		-0x98(%r9),%xmm7
1347	movaps		-0x88(%r9),%xmm8
1348	movaps		-0x78(%r9),%xmm9
1349	movaps		-0x68(%r9),%xmm10
1350	movaps		-0x58(%r9),%xmm11
1351	movaps		-0x48(%r9),%xmm12
1352	movaps		-0x38(%r9),%xmm13
1353	movaps		-0x28(%r9),%xmm14
1354	movaps		-0x18(%r9),%xmm15
1355___
1356$code.=<<___;
1357	lea		(%r9),%rsp
1358.cfi_def_cfa_register	%rsp
1359.L4x_epilogue:
1360	ret
1361.cfi_endproc
1362.size	ChaCha20_4x,.-ChaCha20_4x
1363___
1364}
1365
1366########################################################################
1367# XOP code path that handles all lengths.
1368if ($avx) {
# There is some "anomaly" observed depending on instruction size or
# alignment. If you look closely at the code below you'll notice that
# the argument order sometimes varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5%
# performance improvement. This is on FX-4100...
1374
1375my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1376    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1377my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1378	 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1379
1380sub XOP_lane_ROUND {
1381my ($a0,$b0,$c0,$d0)=@_;
1382my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1383my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1384my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1385my @x=map("\"$_\"",@xx);
1386
1387	(
1388	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
1389	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
1390	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
1391	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
1392	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
1393	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
1394	  "&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
1395	   "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
1396	"&vprotd	(@x[$d0],@x[$d0],16)",
1397	 "&vprotd	(@x[$d1],@x[$d1],16)",
1398	  "&vprotd	(@x[$d2],@x[$d2],16)",
1399	   "&vprotd	(@x[$d3],@x[$d3],16)",
1400
1401	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
1402	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
1403	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
1404	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
1405	"&vpxor		(@x[$b0],@x[$c0],@x[$b0])",
1406	 "&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
1407	  "&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
1408	   "&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
1409	"&vprotd	(@x[$b0],@x[$b0],12)",
1410	 "&vprotd	(@x[$b1],@x[$b1],12)",
1411	  "&vprotd	(@x[$b2],@x[$b2],12)",
1412	   "&vprotd	(@x[$b3],@x[$b3],12)",
1413
1414	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
1415	 "&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
1416	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
1417	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
1418	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
1419	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
1420	  "&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
1421	   "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
1422	"&vprotd	(@x[$d0],@x[$d0],8)",
1423	 "&vprotd	(@x[$d1],@x[$d1],8)",
1424	  "&vprotd	(@x[$d2],@x[$d2],8)",
1425	   "&vprotd	(@x[$d3],@x[$d3],8)",
1426
1427	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
1428	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
1429	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
1430	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
1431	"&vpxor		(@x[$b0],@x[$c0],@x[$b0])",
1432	 "&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
1433	  "&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
1434	   "&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
1435	"&vprotd	(@x[$b0],@x[$b0],7)",
1436	 "&vprotd	(@x[$b1],@x[$b1],7)",
1437	  "&vprotd	(@x[$b2],@x[$b2],7)",
1438	   "&vprotd	(@x[$b3],@x[$b3],7)"
1439	);
1440}
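
# XOP's vprotd does a full rotate in one instruction, so this lane round
# needs neither the pshufb masks nor the shift/shift/or idiom, and all
# sixteen state words, the 'c' rows included, stay in %xmm registers.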
1441
1442my $xframe = $win64 ? 0xa8 : 8;
1443
1444$code.=<<___;
1445.type	ChaCha20_4xop,\@function,5
1446.align	32
1447ChaCha20_4xop:
1448.cfi_startproc
1449.LChaCha20_4xop:
1450	mov		%rsp,%r9		# frame pointer
1451.cfi_def_cfa_register	%r9
1452	sub		\$0x140+$xframe,%rsp
1453___
1454	################ stack layout
1455	# +0x00		SIMD equivalent of @x[8-12]
1456	# ...
1457	# +0x40		constant copy of key[0-2] smashed by lanes
1458	# ...
1459	# +0x100	SIMD counters (with nonce smashed by lanes)
1460	# ...
1461	# +0x140
1462$code.=<<___	if ($win64);
1463	movaps		%xmm6,-0xa8(%r9)
1464	movaps		%xmm7,-0x98(%r9)
1465	movaps		%xmm8,-0x88(%r9)
1466	movaps		%xmm9,-0x78(%r9)
1467	movaps		%xmm10,-0x68(%r9)
1468	movaps		%xmm11,-0x58(%r9)
1469	movaps		%xmm12,-0x48(%r9)
1470	movaps		%xmm13,-0x38(%r9)
1471	movaps		%xmm14,-0x28(%r9)
1472	movaps		%xmm15,-0x18(%r9)
1473.L4xop_body:
1474___
1475$code.=<<___;
1476	vzeroupper
1477
1478	vmovdqa		.Lsigma(%rip),$xa3	# key[0]
1479	vmovdqu		($key),$xb3		# key[1]
1480	vmovdqu		16($key),$xt3		# key[2]
1481	vmovdqu		($counter),$xd3		# key[3]
1482	lea		0x100(%rsp),%rcx	# size optimization
1483
1484	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
1485	vpshufd		\$0x55,$xa3,$xa1
1486	vmovdqa		$xa0,0x40(%rsp)		# ... and offload
1487	vpshufd		\$0xaa,$xa3,$xa2
1488	vmovdqa		$xa1,0x50(%rsp)
1489	vpshufd		\$0xff,$xa3,$xa3
1490	vmovdqa		$xa2,0x60(%rsp)
1491	vmovdqa		$xa3,0x70(%rsp)
1492
1493	vpshufd		\$0x00,$xb3,$xb0
1494	vpshufd		\$0x55,$xb3,$xb1
1495	vmovdqa		$xb0,0x80-0x100(%rcx)
1496	vpshufd		\$0xaa,$xb3,$xb2
1497	vmovdqa		$xb1,0x90-0x100(%rcx)
1498	vpshufd		\$0xff,$xb3,$xb3
1499	vmovdqa		$xb2,0xa0-0x100(%rcx)
1500	vmovdqa		$xb3,0xb0-0x100(%rcx)
1501
1502	vpshufd		\$0x00,$xt3,$xt0	# "$xc0"
1503	vpshufd		\$0x55,$xt3,$xt1	# "$xc1"
1504	vmovdqa		$xt0,0xc0-0x100(%rcx)
1505	vpshufd		\$0xaa,$xt3,$xt2	# "$xc2"
1506	vmovdqa		$xt1,0xd0-0x100(%rcx)
1507	vpshufd		\$0xff,$xt3,$xt3	# "$xc3"
1508	vmovdqa		$xt2,0xe0-0x100(%rcx)
1509	vmovdqa		$xt3,0xf0-0x100(%rcx)
1510
1511	vpshufd		\$0x00,$xd3,$xd0
1512	vpshufd		\$0x55,$xd3,$xd1
1513	vpaddd		.Linc(%rip),$xd0,$xd0	# don't save counters yet
1514	vpshufd		\$0xaa,$xd3,$xd2
1515	vmovdqa		$xd1,0x110-0x100(%rcx)
1516	vpshufd		\$0xff,$xd3,$xd3
1517	vmovdqa		$xd2,0x120-0x100(%rcx)
1518	vmovdqa		$xd3,0x130-0x100(%rcx)
1519
1520	jmp		.Loop_enter4xop
1521
1522.align	32
1523.Loop_outer4xop:
1524	vmovdqa		0x40(%rsp),$xa0		# re-load smashed key
1525	vmovdqa		0x50(%rsp),$xa1
1526	vmovdqa		0x60(%rsp),$xa2
1527	vmovdqa		0x70(%rsp),$xa3
1528	vmovdqa		0x80-0x100(%rcx),$xb0
1529	vmovdqa		0x90-0x100(%rcx),$xb1
1530	vmovdqa		0xa0-0x100(%rcx),$xb2
1531	vmovdqa		0xb0-0x100(%rcx),$xb3
1532	vmovdqa		0xc0-0x100(%rcx),$xt0	# "$xc0"
1533	vmovdqa		0xd0-0x100(%rcx),$xt1	# "$xc1"
1534	vmovdqa		0xe0-0x100(%rcx),$xt2	# "$xc2"
1535	vmovdqa		0xf0-0x100(%rcx),$xt3	# "$xc3"
1536	vmovdqa		0x100-0x100(%rcx),$xd0
1537	vmovdqa		0x110-0x100(%rcx),$xd1
1538	vmovdqa		0x120-0x100(%rcx),$xd2
1539	vmovdqa		0x130-0x100(%rcx),$xd3
1540	vpaddd		.Lfour(%rip),$xd0,$xd0	# next SIMD counters
1541
1542.Loop_enter4xop:
1543	mov		\$10,%eax
1544	vmovdqa		$xd0,0x100-0x100(%rcx)	# save SIMD counters
1545	jmp		.Loop4xop
1546
1547.align	32
1548.Loop4xop:
1549___
1550	foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1551	foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1552$code.=<<___;
1553	dec		%eax
1554	jnz		.Loop4xop
1555
1556	vpaddd		0x40(%rsp),$xa0,$xa0	# accumulate key material
1557	vpaddd		0x50(%rsp),$xa1,$xa1
1558	vpaddd		0x60(%rsp),$xa2,$xa2
1559	vpaddd		0x70(%rsp),$xa3,$xa3
1560
1561	vmovdqa		$xt2,0x20(%rsp)		# offload $xc2,3
1562	vmovdqa		$xt3,0x30(%rsp)
1563
1564	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
1565	vpunpckldq	$xa3,$xa2,$xt3
1566	vpunpckhdq	$xa1,$xa0,$xa0
1567	vpunpckhdq	$xa3,$xa2,$xa2
1568	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
1569	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
1570	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
1571	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
1572___
1573        ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1574$code.=<<___;
1575	vpaddd		0x80-0x100(%rcx),$xb0,$xb0
1576	vpaddd		0x90-0x100(%rcx),$xb1,$xb1
1577	vpaddd		0xa0-0x100(%rcx),$xb2,$xb2
1578	vpaddd		0xb0-0x100(%rcx),$xb3,$xb3
1579
1580	vmovdqa		$xa0,0x00(%rsp)		# offload $xa0,1
1581	vmovdqa		$xa1,0x10(%rsp)
1582	vmovdqa		0x20(%rsp),$xa0		# "xc2"
1583	vmovdqa		0x30(%rsp),$xa1		# "xc3"
1584
1585	vpunpckldq	$xb1,$xb0,$xt2
1586	vpunpckldq	$xb3,$xb2,$xt3
1587	vpunpckhdq	$xb1,$xb0,$xb0
1588	vpunpckhdq	$xb3,$xb2,$xb2
1589	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
1590	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
1591	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
1592	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
1593___
1594	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1595	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1596$code.=<<___;
1597	vpaddd		0xc0-0x100(%rcx),$xc0,$xc0
1598	vpaddd		0xd0-0x100(%rcx),$xc1,$xc1
1599	vpaddd		0xe0-0x100(%rcx),$xc2,$xc2
1600	vpaddd		0xf0-0x100(%rcx),$xc3,$xc3
1601
1602	vpunpckldq	$xc1,$xc0,$xt2
1603	vpunpckldq	$xc3,$xc2,$xt3
1604	vpunpckhdq	$xc1,$xc0,$xc0
1605	vpunpckhdq	$xc3,$xc2,$xc2
1606	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
1607	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
1608	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
1609	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
1610___
1611	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1612$code.=<<___;
1613	vpaddd		0x100-0x100(%rcx),$xd0,$xd0
1614	vpaddd		0x110-0x100(%rcx),$xd1,$xd1
1615	vpaddd		0x120-0x100(%rcx),$xd2,$xd2
1616	vpaddd		0x130-0x100(%rcx),$xd3,$xd3
1617
1618	vpunpckldq	$xd1,$xd0,$xt2
1619	vpunpckldq	$xd3,$xd2,$xt3
1620	vpunpckhdq	$xd1,$xd0,$xd0
1621	vpunpckhdq	$xd3,$xd2,$xd2
1622	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
1623	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
1624	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
1625	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
1626___
1627	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1628	($xa0,$xa1)=($xt2,$xt3);
1629$code.=<<___;
1630	vmovdqa		0x00(%rsp),$xa0		# restore $xa0,1
1631	vmovdqa		0x10(%rsp),$xa1
1632
1633	cmp		\$64*4,$len
1634	jb		.Ltail4xop
1635
1636	vpxor		0x00($inp),$xa0,$xa0	# xor with input
1637	vpxor		0x10($inp),$xb0,$xb0
1638	vpxor		0x20($inp),$xc0,$xc0
1639	vpxor		0x30($inp),$xd0,$xd0
1640	vpxor		0x40($inp),$xa1,$xa1
1641	vpxor		0x50($inp),$xb1,$xb1
1642	vpxor		0x60($inp),$xc1,$xc1
1643	vpxor		0x70($inp),$xd1,$xd1
1644	lea		0x80($inp),$inp		# size optimization
1645	vpxor		0x00($inp),$xa2,$xa2
1646	vpxor		0x10($inp),$xb2,$xb2
1647	vpxor		0x20($inp),$xc2,$xc2
1648	vpxor		0x30($inp),$xd2,$xd2
1649	vpxor		0x40($inp),$xa3,$xa3
1650	vpxor		0x50($inp),$xb3,$xb3
1651	vpxor		0x60($inp),$xc3,$xc3
1652	vpxor		0x70($inp),$xd3,$xd3
1653	lea		0x80($inp),$inp		# inp+=64*4
1654
1655	vmovdqu		$xa0,0x00($out)
1656	vmovdqu		$xb0,0x10($out)
1657	vmovdqu		$xc0,0x20($out)
1658	vmovdqu		$xd0,0x30($out)
1659	vmovdqu		$xa1,0x40($out)
1660	vmovdqu		$xb1,0x50($out)
1661	vmovdqu		$xc1,0x60($out)
1662	vmovdqu		$xd1,0x70($out)
1663	lea		0x80($out),$out		# size optimization
1664	vmovdqu		$xa2,0x00($out)
1665	vmovdqu		$xb2,0x10($out)
1666	vmovdqu		$xc2,0x20($out)
1667	vmovdqu		$xd2,0x30($out)
1668	vmovdqu		$xa3,0x40($out)
1669	vmovdqu		$xb3,0x50($out)
1670	vmovdqu		$xc3,0x60($out)
1671	vmovdqu		$xd3,0x70($out)
1672	lea		0x80($out),$out		# out+=64*4
1673
1674	sub		\$64*4,$len
1675	jnz		.Loop_outer4xop
1676
1677	jmp		.Ldone4xop
1678
1679.align	32
1680.Ltail4xop:
1681	cmp		\$192,$len
1682	jae		.L192_or_more4xop
1683	cmp		\$128,$len
1684	jae		.L128_or_more4xop
1685	cmp		\$64,$len
1686	jae		.L64_or_more4xop
1687
1688	xor		%r10,%r10
1689	vmovdqa		$xa0,0x00(%rsp)
1690	vmovdqa		$xb0,0x10(%rsp)
1691	vmovdqa		$xc0,0x20(%rsp)
1692	vmovdqa		$xd0,0x30(%rsp)
1693	jmp		.Loop_tail4xop
1694
1695.align	32
1696.L64_or_more4xop:
1697	vpxor		0x00($inp),$xa0,$xa0	# xor with input
1698	vpxor		0x10($inp),$xb0,$xb0
1699	vpxor		0x20($inp),$xc0,$xc0
1700	vpxor		0x30($inp),$xd0,$xd0
1701	vmovdqu		$xa0,0x00($out)
1702	vmovdqu		$xb0,0x10($out)
1703	vmovdqu		$xc0,0x20($out)
1704	vmovdqu		$xd0,0x30($out)
1705	je		.Ldone4xop
1706
1707	lea		0x40($inp),$inp		# inp+=64*1
1708	vmovdqa		$xa1,0x00(%rsp)
1709	xor		%r10,%r10
1710	vmovdqa		$xb1,0x10(%rsp)
1711	lea		0x40($out),$out		# out+=64*1
1712	vmovdqa		$xc1,0x20(%rsp)
1713	sub		\$64,$len		# len-=64*1
1714	vmovdqa		$xd1,0x30(%rsp)
1715	jmp		.Loop_tail4xop
1716
1717.align	32
1718.L128_or_more4xop:
1719	vpxor		0x00($inp),$xa0,$xa0	# xor with input
1720	vpxor		0x10($inp),$xb0,$xb0
1721	vpxor		0x20($inp),$xc0,$xc0
1722	vpxor		0x30($inp),$xd0,$xd0
1723	vpxor		0x40($inp),$xa1,$xa1
1724	vpxor		0x50($inp),$xb1,$xb1
1725	vpxor		0x60($inp),$xc1,$xc1
1726	vpxor		0x70($inp),$xd1,$xd1
1727
1728	vmovdqu		$xa0,0x00($out)
1729	vmovdqu		$xb0,0x10($out)
1730	vmovdqu		$xc0,0x20($out)
1731	vmovdqu		$xd0,0x30($out)
1732	vmovdqu		$xa1,0x40($out)
1733	vmovdqu		$xb1,0x50($out)
1734	vmovdqu		$xc1,0x60($out)
1735	vmovdqu		$xd1,0x70($out)
1736	je		.Ldone4xop
1737
1738	lea		0x80($inp),$inp		# inp+=64*2
1739	vmovdqa		$xa2,0x00(%rsp)
1740	xor		%r10,%r10
1741	vmovdqa		$xb2,0x10(%rsp)
1742	lea		0x80($out),$out		# out+=64*2
1743	vmovdqa		$xc2,0x20(%rsp)
1744	sub		\$128,$len		# len-=64*2
1745	vmovdqa		$xd2,0x30(%rsp)
1746	jmp		.Loop_tail4xop
1747
1748.align	32
1749.L192_or_more4xop:
1750	vpxor		0x00($inp),$xa0,$xa0	# xor with input
1751	vpxor		0x10($inp),$xb0,$xb0
1752	vpxor		0x20($inp),$xc0,$xc0
1753	vpxor		0x30($inp),$xd0,$xd0
1754	vpxor		0x40($inp),$xa1,$xa1
1755	vpxor		0x50($inp),$xb1,$xb1
1756	vpxor		0x60($inp),$xc1,$xc1
1757	vpxor		0x70($inp),$xd1,$xd1
1758	lea		0x80($inp),$inp		# size optimization
1759	vpxor		0x00($inp),$xa2,$xa2
1760	vpxor		0x10($inp),$xb2,$xb2
1761	vpxor		0x20($inp),$xc2,$xc2
1762	vpxor		0x30($inp),$xd2,$xd2
1763
1764	vmovdqu		$xa0,0x00($out)
1765	vmovdqu		$xb0,0x10($out)
1766	vmovdqu		$xc0,0x20($out)
1767	vmovdqu		$xd0,0x30($out)
1768	vmovdqu		$xa1,0x40($out)
1769	vmovdqu		$xb1,0x50($out)
1770	vmovdqu		$xc1,0x60($out)
1771	vmovdqu		$xd1,0x70($out)
1772	lea		0x80($out),$out		# size optimization
1773	vmovdqu		$xa2,0x00($out)
1774	vmovdqu		$xb2,0x10($out)
1775	vmovdqu		$xc2,0x20($out)
1776	vmovdqu		$xd2,0x30($out)
1777	je		.Ldone4xop
1778
1779	lea		0x40($inp),$inp		# inp+=64*3
1780	vmovdqa		$xa3,0x00(%rsp)
1781	xor		%r10,%r10
1782	vmovdqa		$xb3,0x10(%rsp)
1783	lea		0x40($out),$out		# out+=64*3
1784	vmovdqa		$xc3,0x20(%rsp)
1785	sub		\$192,$len		# len-=64*3
1786	vmovdqa		$xd3,0x30(%rsp)
1787
1788.Loop_tail4xop:
1789	movzb		($inp,%r10),%eax
1790	movzb		(%rsp,%r10),%ecx
1791	lea		1(%r10),%r10
1792	xor		%ecx,%eax
1793	mov		%al,-1($out,%r10)
1794	dec		$len
1795	jnz		.Loop_tail4xop
1796
1797.Ldone4xop:
1798	vzeroupper
1799___
1800$code.=<<___	if ($win64);
1801	movaps		-0xa8(%r9),%xmm6
1802	movaps		-0x98(%r9),%xmm7
1803	movaps		-0x88(%r9),%xmm8
1804	movaps		-0x78(%r9),%xmm9
1805	movaps		-0x68(%r9),%xmm10
1806	movaps		-0x58(%r9),%xmm11
1807	movaps		-0x48(%r9),%xmm12
1808	movaps		-0x38(%r9),%xmm13
1809	movaps		-0x28(%r9),%xmm14
1810	movaps		-0x18(%r9),%xmm15
1811___
1812$code.=<<___;
1813	lea		(%r9),%rsp
1814.cfi_def_cfa_register	%rsp
1815.L4xop_epilogue:
1816	ret
1817.cfi_endproc
1818.size	ChaCha20_4xop,.-ChaCha20_4xop
1819___
1820}
1821
1822########################################################################
1823# AVX2 code path
1824if ($avx>1) {
1825my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1826    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1827my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1828	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1829
1830sub AVX2_lane_ROUND {
1831my ($a0,$b0,$c0,$d0)=@_;
1832my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1833my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1834my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1835my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1836my @x=map("\"$_\"",@xx);
1837
1838	# Consider order in which variables are addressed by their
1839	# index:
1840	#
1841	#	a   b   c   d
1842	#
1843	#	0   4   8  12 < even round
1844	#	1   5   9  13
1845	#	2   6  10  14
1846	#	3   7  11  15
1847	#	0   5  10  15 < odd round
1848	#	1   6  11  12
1849	#	2   7   8  13
1850	#	3   4   9  14
1851	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that each pair of
	# 'c's is invariant between rounds. This means that they have
	# to be reloaded only once per round, in the middle. This is
	# why you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
1859
1860	(
1861	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
1862	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
1863	"&vpshufb	(@x[$d0],@x[$d0],$t1)",
1864	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
1865	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
1866	 "&vpshufb	(@x[$d1],@x[$d1],$t1)",
1867
1868	"&vpaddd	($xc,$xc,@x[$d0])",
1869	"&vpxor		(@x[$b0],$xc,@x[$b0])",
1870	"&vpslld	($t0,@x[$b0],12)",
1871	"&vpsrld	(@x[$b0],@x[$b0],20)",
1872	"&vpor		(@x[$b0],$t0,@x[$b0])",
1873	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
1874	 "&vpaddd	($xc_,$xc_,@x[$d1])",
1875	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
1876	 "&vpslld	($t1,@x[$b1],12)",
1877	 "&vpsrld	(@x[$b1],@x[$b1],20)",
1878	 "&vpor		(@x[$b1],$t1,@x[$b1])",
1879
1880	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
1881	"&vpxor		(@x[$d0],@x[$a0],@x[$d0])",
1882	"&vpshufb	(@x[$d0],@x[$d0],$t0)",
1883	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
1884	 "&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
1885	 "&vpshufb	(@x[$d1],@x[$d1],$t0)",
1886
1887	"&vpaddd	($xc,$xc,@x[$d0])",
1888	"&vpxor		(@x[$b0],$xc,@x[$b0])",
1889	"&vpslld	($t1,@x[$b0],7)",
1890	"&vpsrld	(@x[$b0],@x[$b0],25)",
1891	"&vpor		(@x[$b0],$t1,@x[$b0])",
1892	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
1893	 "&vpaddd	($xc_,$xc_,@x[$d1])",
1894	 "&vpxor	(@x[$b1],$xc_,@x[$b1])",
1895	 "&vpslld	($t0,@x[$b1],7)",
1896	 "&vpsrld	(@x[$b1],@x[$b1],25)",
1897	 "&vpor		(@x[$b1],$t0,@x[$b1])",
1898
1899	"&vmovdqa	(\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
1900	 "&vmovdqa	(\"`32*($c1-8)`(%rsp)\",$xc_)",
1901	"&vmovdqa	($xc,\"`32*($c2-8)`(%rsp)\")",
1902	 "&vmovdqa	($xc_,\"`32*($c3-8)`(%rsp)\")",
1903
1904	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
1905	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
1906	"&vpshufb	(@x[$d2],@x[$d2],$t1)",
1907	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
1908	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
1909	 "&vpshufb	(@x[$d3],@x[$d3],$t1)",
1910
1911	"&vpaddd	($xc,$xc,@x[$d2])",
1912	"&vpxor		(@x[$b2],$xc,@x[$b2])",
1913	"&vpslld	($t0,@x[$b2],12)",
1914	"&vpsrld	(@x[$b2],@x[$b2],20)",
1915	"&vpor		(@x[$b2],$t0,@x[$b2])",
1916	"&vbroadcasti128($t0,'(%r11)')",		# .Lrot24(%rip)
1917	 "&vpaddd	($xc_,$xc_,@x[$d3])",
1918	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
1919	 "&vpslld	($t1,@x[$b3],12)",
1920	 "&vpsrld	(@x[$b3],@x[$b3],20)",
1921	 "&vpor		(@x[$b3],$t1,@x[$b3])",
1922
1923	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
1924	"&vpxor		(@x[$d2],@x[$a2],@x[$d2])",
1925	"&vpshufb	(@x[$d2],@x[$d2],$t0)",
1926	 "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
1927	 "&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
1928	 "&vpshufb	(@x[$d3],@x[$d3],$t0)",
1929
1930	"&vpaddd	($xc,$xc,@x[$d2])",
1931	"&vpxor		(@x[$b2],$xc,@x[$b2])",
1932	"&vpslld	($t1,@x[$b2],7)",
1933	"&vpsrld	(@x[$b2],@x[$b2],25)",
1934	"&vpor		(@x[$b2],$t1,@x[$b2])",
1935	"&vbroadcasti128($t1,'(%r10)')",		# .Lrot16(%rip)
1936	 "&vpaddd	($xc_,$xc_,@x[$d3])",
1937	 "&vpxor	(@x[$b3],$xc_,@x[$b3])",
1938	 "&vpslld	($t0,@x[$b3],7)",
1939	 "&vpsrld	(@x[$b3],@x[$b3],25)",
1940	 "&vpor		(@x[$b3],$t0,@x[$b3])"
1941	);
1942}
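
# Same dance as SSSE3_lane_ROUND, only with 256-bit registers, i.e.
# eight blocks at a time: one dword lane per block, the rotate masks
# broadcast to both 128-bit halves with vbroadcasti128 from (%r10) and
# (%r11), and the 'c' rows spilled to the stack just like in the 4x
# path. The counter row is seeded with .Lincy so that the eight lanes
# cover eight consecutive counter values.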
1943
1944my $xframe = $win64 ? 0xa8 : 8;
1945
1946$code.=<<___;
1947.type	ChaCha20_8x,\@function,5
1948.align	32
1949ChaCha20_8x:
1950.cfi_startproc
1951.LChaCha20_8x:
1952	mov		%rsp,%r9		# frame register
1953.cfi_def_cfa_register	%r9
1954	sub		\$0x280+$xframe,%rsp
1955	and		\$-32,%rsp
1956___
1957$code.=<<___	if ($win64);
1958	movaps		%xmm6,-0xa8(%r9)
1959	movaps		%xmm7,-0x98(%r9)
1960	movaps		%xmm8,-0x88(%r9)
1961	movaps		%xmm9,-0x78(%r9)
1962	movaps		%xmm10,-0x68(%r9)
1963	movaps		%xmm11,-0x58(%r9)
1964	movaps		%xmm12,-0x48(%r9)
1965	movaps		%xmm13,-0x38(%r9)
1966	movaps		%xmm14,-0x28(%r9)
1967	movaps		%xmm15,-0x18(%r9)
1968.L8x_body:
1969___
1970$code.=<<___;
1971	vzeroupper
1972
1973	################ stack layout
1974	# +0x00		SIMD equivalent of @x[8-12]
1975	# ...
1976	# +0x80		constant copy of key[0-2] smashed by lanes
1977	# ...
1978	# +0x200	SIMD counters (with nonce smashed by lanes)
1979	# ...
1980	# +0x280
1981
1982	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
1983	vbroadcasti128	($key),$xb3		# key[1]
1984	vbroadcasti128	16($key),$xt3		# key[2]
1985	vbroadcasti128	($counter),$xd3		# key[3]
1986	lea		0x100(%rsp),%rcx	# size optimization
1987	lea		0x200(%rsp),%rax	# size optimization
1988	lea		.Lrot16(%rip),%r10
1989	lea		.Lrot24(%rip),%r11
1990
1991	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
1992	vpshufd		\$0x55,$xa3,$xa1
1993	vmovdqa		$xa0,0x80-0x100(%rcx)	# ... and offload
1994	vpshufd		\$0xaa,$xa3,$xa2
1995	vmovdqa		$xa1,0xa0-0x100(%rcx)
1996	vpshufd		\$0xff,$xa3,$xa3
1997	vmovdqa		$xa2,0xc0-0x100(%rcx)
1998	vmovdqa		$xa3,0xe0-0x100(%rcx)
1999
2000	vpshufd		\$0x00,$xb3,$xb0
2001	vpshufd		\$0x55,$xb3,$xb1
2002	vmovdqa		$xb0,0x100-0x100(%rcx)
2003	vpshufd		\$0xaa,$xb3,$xb2
2004	vmovdqa		$xb1,0x120-0x100(%rcx)
2005	vpshufd		\$0xff,$xb3,$xb3
2006	vmovdqa		$xb2,0x140-0x100(%rcx)
2007	vmovdqa		$xb3,0x160-0x100(%rcx)
2008
2009	vpshufd		\$0x00,$xt3,$xt0	# "xc0"
2010	vpshufd		\$0x55,$xt3,$xt1	# "xc1"
2011	vmovdqa		$xt0,0x180-0x200(%rax)
2012	vpshufd		\$0xaa,$xt3,$xt2	# "xc2"
2013	vmovdqa		$xt1,0x1a0-0x200(%rax)
2014	vpshufd		\$0xff,$xt3,$xt3	# "xc3"
2015	vmovdqa		$xt2,0x1c0-0x200(%rax)
2016	vmovdqa		$xt3,0x1e0-0x200(%rax)
2017
2018	vpshufd		\$0x00,$xd3,$xd0
2019	vpshufd		\$0x55,$xd3,$xd1
2020	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
2021	vpshufd		\$0xaa,$xd3,$xd2
2022	vmovdqa		$xd1,0x220-0x200(%rax)
2023	vpshufd		\$0xff,$xd3,$xd3
2024	vmovdqa		$xd2,0x240-0x200(%rax)
2025	vmovdqa		$xd3,0x260-0x200(%rax)
2026
2027	jmp		.Loop_enter8x
2028
2029.align	32
2030.Loop_outer8x:
2031	vmovdqa		0x80-0x100(%rcx),$xa0	# re-load smashed key
2032	vmovdqa		0xa0-0x100(%rcx),$xa1
2033	vmovdqa		0xc0-0x100(%rcx),$xa2
2034	vmovdqa		0xe0-0x100(%rcx),$xa3
2035	vmovdqa		0x100-0x100(%rcx),$xb0
2036	vmovdqa		0x120-0x100(%rcx),$xb1
2037	vmovdqa		0x140-0x100(%rcx),$xb2
2038	vmovdqa		0x160-0x100(%rcx),$xb3
2039	vmovdqa		0x180-0x200(%rax),$xt0	# "xc0"
2040	vmovdqa		0x1a0-0x200(%rax),$xt1	# "xc1"
2041	vmovdqa		0x1c0-0x200(%rax),$xt2	# "xc2"
2042	vmovdqa		0x1e0-0x200(%rax),$xt3	# "xc3"
2043	vmovdqa		0x200-0x200(%rax),$xd0
2044	vmovdqa		0x220-0x200(%rax),$xd1
2045	vmovdqa		0x240-0x200(%rax),$xd2
2046	vmovdqa		0x260-0x200(%rax),$xd3
2047	vpaddd		.Leight(%rip),$xd0,$xd0	# next SIMD counters
2048
2049.Loop_enter8x:
2050	vmovdqa		$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
2051	vmovdqa		$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
2052	vbroadcasti128	(%r10),$xt3
2053	vmovdqa		$xd0,0x200-0x200(%rax)	# save SIMD counters
2054	mov		\$10,%eax
2055	jmp		.Loop8x
2056
2057.align	32
2058.Loop8x:
2059___
2060	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
2061	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
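# The (0,4,8,12) call runs the quarter-round down the columns of the 4x4
# state, the (0,5,10,15) call down its diagonals; ten such double rounds
# make ChaCha20.  A scalar reference of the same schedule (illustrative
# only, never called; assumes quarter_round_ref above and takes the 16
# state words):
sub double_round_ref {
	my @s = @_;
	@s[$_,$_+4,$_+8,$_+12] = quarter_round_ref(@s[$_,$_+4,$_+8,$_+12]) for (0..3);
	@s[0,5,10,15] = quarter_round_ref(@s[0,5,10,15]);
	@s[1,6,11,12] = quarter_round_ref(@s[1,6,11,12]);
	@s[2,7,8,13]  = quarter_round_ref(@s[2,7,8,13]);
	@s[3,4,9,14]  = quarter_round_ref(@s[3,4,9,14]);
	return @s;
}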
2062$code.=<<___;
2063	dec		%eax
2064	jnz		.Loop8x
2065
2066	lea		0x200(%rsp),%rax	# size optimization
2067	vpaddd		0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
2068	vpaddd		0xa0-0x100(%rcx),$xa1,$xa1
2069	vpaddd		0xc0-0x100(%rcx),$xa2,$xa2
2070	vpaddd		0xe0-0x100(%rcx),$xa3,$xa3
2071
2072	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
2073	vpunpckldq	$xa3,$xa2,$xt3
2074	vpunpckhdq	$xa1,$xa0,$xa0
2075	vpunpckhdq	$xa3,$xa2,$xa2
2076	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
2077	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
2078	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
2079	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
2080___
2081	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2082$code.=<<___;
2083	vpaddd		0x100-0x100(%rcx),$xb0,$xb0
2084	vpaddd		0x120-0x100(%rcx),$xb1,$xb1
2085	vpaddd		0x140-0x100(%rcx),$xb2,$xb2
2086	vpaddd		0x160-0x100(%rcx),$xb3,$xb3
2087
2088	vpunpckldq	$xb1,$xb0,$xt2
2089	vpunpckldq	$xb3,$xb2,$xt3
2090	vpunpckhdq	$xb1,$xb0,$xb0
2091	vpunpckhdq	$xb3,$xb2,$xb2
2092	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
2093	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
2094	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
2095	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
2096___
2097	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2098$code.=<<___;
2099	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
2100	vperm2i128	\$0x31,$xb0,$xa0,$xb0
2101	vperm2i128	\$0x20,$xb1,$xa1,$xa0
2102	vperm2i128	\$0x31,$xb1,$xa1,$xb1
2103	vperm2i128	\$0x20,$xb2,$xa2,$xa1
2104	vperm2i128	\$0x31,$xb2,$xa2,$xb2
2105	vperm2i128	\$0x20,$xb3,$xa3,$xa2
2106	vperm2i128	\$0x31,$xb3,$xa3,$xb3
2107___
2108	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2109	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
2110$code.=<<___;
2111	vmovdqa		$xa0,0x00(%rsp)		# offload $xaN
2112	vmovdqa		$xa1,0x20(%rsp)
2113	vmovdqa		0x40(%rsp),$xc2		# $xa0
2114	vmovdqa		0x60(%rsp),$xc3		# $xa1
2115
2116	vpaddd		0x180-0x200(%rax),$xc0,$xc0
2117	vpaddd		0x1a0-0x200(%rax),$xc1,$xc1
2118	vpaddd		0x1c0-0x200(%rax),$xc2,$xc2
2119	vpaddd		0x1e0-0x200(%rax),$xc3,$xc3
2120
2121	vpunpckldq	$xc1,$xc0,$xt2
2122	vpunpckldq	$xc3,$xc2,$xt3
2123	vpunpckhdq	$xc1,$xc0,$xc0
2124	vpunpckhdq	$xc3,$xc2,$xc2
2125	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
2126	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
2127	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
2128	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
2129___
2130	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2131$code.=<<___;
2132	vpaddd		0x200-0x200(%rax),$xd0,$xd0
2133	vpaddd		0x220-0x200(%rax),$xd1,$xd1
2134	vpaddd		0x240-0x200(%rax),$xd2,$xd2
2135	vpaddd		0x260-0x200(%rax),$xd3,$xd3
2136
2137	vpunpckldq	$xd1,$xd0,$xt2
2138	vpunpckldq	$xd3,$xd2,$xt3
2139	vpunpckhdq	$xd1,$xd0,$xd0
2140	vpunpckhdq	$xd3,$xd2,$xd2
2141	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
2142	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
2143	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
2144	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
2145___
2146	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2147$code.=<<___;
2148	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
2149	vperm2i128	\$0x31,$xd0,$xc0,$xd0
2150	vperm2i128	\$0x20,$xd1,$xc1,$xc0
2151	vperm2i128	\$0x31,$xd1,$xc1,$xd1
2152	vperm2i128	\$0x20,$xd2,$xc2,$xc1
2153	vperm2i128	\$0x31,$xd2,$xc2,$xd2
2154	vperm2i128	\$0x20,$xd3,$xc3,$xc2
2155	vperm2i128	\$0x31,$xd3,$xc3,$xd3
2156___
2157	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2158	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
2159	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
2160	($xa0,$xa1)=($xt2,$xt3);
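	# Up to here each %ymm held one state word across 8 blocks (one
	# block per 32-bit lane).  The unpack/vperm2i128 "de-interlace"
	# above transposes that, so ($xa0,$xb0,$xc0,$xd0),($xa1,...),...
	# now hold the 512-byte keystream batch as sixteen contiguous
	# 32-byte chunks, ready to be XORed against the input in order.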
2161$code.=<<___;
2162	vmovdqa		0x00(%rsp),$xa0		# $xaN was offloaded, remember?
2163	vmovdqa		0x20(%rsp),$xa1
2164
2165	cmp		\$64*8,$len
2166	jb		.Ltail8x
2167
2168	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2169	vpxor		0x20($inp),$xb0,$xb0
2170	vpxor		0x40($inp),$xc0,$xc0
2171	vpxor		0x60($inp),$xd0,$xd0
2172	lea		0x80($inp),$inp		# size optimization
2173	vmovdqu		$xa0,0x00($out)
2174	vmovdqu		$xb0,0x20($out)
2175	vmovdqu		$xc0,0x40($out)
2176	vmovdqu		$xd0,0x60($out)
2177	lea		0x80($out),$out		# size optimization
2178
2179	vpxor		0x00($inp),$xa1,$xa1
2180	vpxor		0x20($inp),$xb1,$xb1
2181	vpxor		0x40($inp),$xc1,$xc1
2182	vpxor		0x60($inp),$xd1,$xd1
2183	lea		0x80($inp),$inp		# size optimization
2184	vmovdqu		$xa1,0x00($out)
2185	vmovdqu		$xb1,0x20($out)
2186	vmovdqu		$xc1,0x40($out)
2187	vmovdqu		$xd1,0x60($out)
2188	lea		0x80($out),$out		# size optimization
2189
2190	vpxor		0x00($inp),$xa2,$xa2
2191	vpxor		0x20($inp),$xb2,$xb2
2192	vpxor		0x40($inp),$xc2,$xc2
2193	vpxor		0x60($inp),$xd2,$xd2
2194	lea		0x80($inp),$inp		# size optimization
2195	vmovdqu		$xa2,0x00($out)
2196	vmovdqu		$xb2,0x20($out)
2197	vmovdqu		$xc2,0x40($out)
2198	vmovdqu		$xd2,0x60($out)
2199	lea		0x80($out),$out		# size optimization
2200
2201	vpxor		0x00($inp),$xa3,$xa3
2202	vpxor		0x20($inp),$xb3,$xb3
2203	vpxor		0x40($inp),$xc3,$xc3
2204	vpxor		0x60($inp),$xd3,$xd3
2205	lea		0x80($inp),$inp		# size optimization
2206	vmovdqu		$xa3,0x00($out)
2207	vmovdqu		$xb3,0x20($out)
2208	vmovdqu		$xc3,0x40($out)
2209	vmovdqu		$xd3,0x60($out)
2210	lea		0x80($out),$out		# size optimization
2211
2212	sub		\$64*8,$len
2213	jnz		.Loop_outer8x
2214
2215	jmp		.Ldone8x
2216
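	# Tail handling: branch to the largest whole number of 64-byte
	# blocks still available, stream those, then park one remaining
	# keystream block on the stack and XOR the leftover bytes one at
	# a time in .Loop_tail8x below.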
2217.Ltail8x:
2218	cmp		\$448,$len
2219	jae		.L448_or_more8x
2220	cmp		\$384,$len
2221	jae		.L384_or_more8x
2222	cmp		\$320,$len
2223	jae		.L320_or_more8x
2224	cmp		\$256,$len
2225	jae		.L256_or_more8x
2226	cmp		\$192,$len
2227	jae		.L192_or_more8x
2228	cmp		\$128,$len
2229	jae		.L128_or_more8x
2230	cmp		\$64,$len
2231	jae		.L64_or_more8x
2232
2233	xor		%r10,%r10
2234	vmovdqa		$xa0,0x00(%rsp)
2235	vmovdqa		$xb0,0x20(%rsp)
2236	jmp		.Loop_tail8x
2237
2238.align	32
2239.L64_or_more8x:
2240	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2241	vpxor		0x20($inp),$xb0,$xb0
2242	vmovdqu		$xa0,0x00($out)
2243	vmovdqu		$xb0,0x20($out)
2244	je		.Ldone8x
2245
2246	lea		0x40($inp),$inp		# inp+=64*1
2247	xor		%r10,%r10
2248	vmovdqa		$xc0,0x00(%rsp)
2249	lea		0x40($out),$out		# out+=64*1
2250	sub		\$64,$len		# len-=64*1
2251	vmovdqa		$xd0,0x20(%rsp)
2252	jmp		.Loop_tail8x
2253
2254.align	32
2255.L128_or_more8x:
2256	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2257	vpxor		0x20($inp),$xb0,$xb0
2258	vpxor		0x40($inp),$xc0,$xc0
2259	vpxor		0x60($inp),$xd0,$xd0
2260	vmovdqu		$xa0,0x00($out)
2261	vmovdqu		$xb0,0x20($out)
2262	vmovdqu		$xc0,0x40($out)
2263	vmovdqu		$xd0,0x60($out)
2264	je		.Ldone8x
2265
2266	lea		0x80($inp),$inp		# inp+=64*2
2267	xor		%r10,%r10
2268	vmovdqa		$xa1,0x00(%rsp)
2269	lea		0x80($out),$out		# out+=64*2
2270	sub		\$128,$len		# len-=64*2
2271	vmovdqa		$xb1,0x20(%rsp)
2272	jmp		.Loop_tail8x
2273
2274.align	32
2275.L192_or_more8x:
2276	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2277	vpxor		0x20($inp),$xb0,$xb0
2278	vpxor		0x40($inp),$xc0,$xc0
2279	vpxor		0x60($inp),$xd0,$xd0
2280	vpxor		0x80($inp),$xa1,$xa1
2281	vpxor		0xa0($inp),$xb1,$xb1
2282	vmovdqu		$xa0,0x00($out)
2283	vmovdqu		$xb0,0x20($out)
2284	vmovdqu		$xc0,0x40($out)
2285	vmovdqu		$xd0,0x60($out)
2286	vmovdqu		$xa1,0x80($out)
2287	vmovdqu		$xb1,0xa0($out)
2288	je		.Ldone8x
2289
2290	lea		0xc0($inp),$inp		# inp+=64*3
2291	xor		%r10,%r10
2292	vmovdqa		$xc1,0x00(%rsp)
2293	lea		0xc0($out),$out		# out+=64*3
2294	sub		\$192,$len		# len-=64*3
2295	vmovdqa		$xd1,0x20(%rsp)
2296	jmp		.Loop_tail8x
2297
2298.align	32
2299.L256_or_more8x:
2300	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2301	vpxor		0x20($inp),$xb0,$xb0
2302	vpxor		0x40($inp),$xc0,$xc0
2303	vpxor		0x60($inp),$xd0,$xd0
2304	vpxor		0x80($inp),$xa1,$xa1
2305	vpxor		0xa0($inp),$xb1,$xb1
2306	vpxor		0xc0($inp),$xc1,$xc1
2307	vpxor		0xe0($inp),$xd1,$xd1
2308	vmovdqu		$xa0,0x00($out)
2309	vmovdqu		$xb0,0x20($out)
2310	vmovdqu		$xc0,0x40($out)
2311	vmovdqu		$xd0,0x60($out)
2312	vmovdqu		$xa1,0x80($out)
2313	vmovdqu		$xb1,0xa0($out)
2314	vmovdqu		$xc1,0xc0($out)
2315	vmovdqu		$xd1,0xe0($out)
2316	je		.Ldone8x
2317
2318	lea		0x100($inp),$inp	# inp+=64*4
2319	xor		%r10,%r10
2320	vmovdqa		$xa2,0x00(%rsp)
2321	lea		0x100($out),$out	# out+=64*4
2322	sub		\$256,$len		# len-=64*4
2323	vmovdqa		$xb2,0x20(%rsp)
2324	jmp		.Loop_tail8x
2325
2326.align	32
2327.L320_or_more8x:
2328	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2329	vpxor		0x20($inp),$xb0,$xb0
2330	vpxor		0x40($inp),$xc0,$xc0
2331	vpxor		0x60($inp),$xd0,$xd0
2332	vpxor		0x80($inp),$xa1,$xa1
2333	vpxor		0xa0($inp),$xb1,$xb1
2334	vpxor		0xc0($inp),$xc1,$xc1
2335	vpxor		0xe0($inp),$xd1,$xd1
2336	vpxor		0x100($inp),$xa2,$xa2
2337	vpxor		0x120($inp),$xb2,$xb2
2338	vmovdqu		$xa0,0x00($out)
2339	vmovdqu		$xb0,0x20($out)
2340	vmovdqu		$xc0,0x40($out)
2341	vmovdqu		$xd0,0x60($out)
2342	vmovdqu		$xa1,0x80($out)
2343	vmovdqu		$xb1,0xa0($out)
2344	vmovdqu		$xc1,0xc0($out)
2345	vmovdqu		$xd1,0xe0($out)
2346	vmovdqu		$xa2,0x100($out)
2347	vmovdqu		$xb2,0x120($out)
2348	je		.Ldone8x
2349
2350	lea		0x140($inp),$inp	# inp+=64*5
2351	xor		%r10,%r10
2352	vmovdqa		$xc2,0x00(%rsp)
2353	lea		0x140($out),$out	# out+=64*5
2354	sub		\$320,$len		# len-=64*5
2355	vmovdqa		$xd2,0x20(%rsp)
2356	jmp		.Loop_tail8x
2357
2358.align	32
2359.L384_or_more8x:
2360	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2361	vpxor		0x20($inp),$xb0,$xb0
2362	vpxor		0x40($inp),$xc0,$xc0
2363	vpxor		0x60($inp),$xd0,$xd0
2364	vpxor		0x80($inp),$xa1,$xa1
2365	vpxor		0xa0($inp),$xb1,$xb1
2366	vpxor		0xc0($inp),$xc1,$xc1
2367	vpxor		0xe0($inp),$xd1,$xd1
2368	vpxor		0x100($inp),$xa2,$xa2
2369	vpxor		0x120($inp),$xb2,$xb2
2370	vpxor		0x140($inp),$xc2,$xc2
2371	vpxor		0x160($inp),$xd2,$xd2
2372	vmovdqu		$xa0,0x00($out)
2373	vmovdqu		$xb0,0x20($out)
2374	vmovdqu		$xc0,0x40($out)
2375	vmovdqu		$xd0,0x60($out)
2376	vmovdqu		$xa1,0x80($out)
2377	vmovdqu		$xb1,0xa0($out)
2378	vmovdqu		$xc1,0xc0($out)
2379	vmovdqu		$xd1,0xe0($out)
2380	vmovdqu		$xa2,0x100($out)
2381	vmovdqu		$xb2,0x120($out)
2382	vmovdqu		$xc2,0x140($out)
2383	vmovdqu		$xd2,0x160($out)
2384	je		.Ldone8x
2385
2386	lea		0x180($inp),$inp	# inp+=64*6
2387	xor		%r10,%r10
2388	vmovdqa		$xa3,0x00(%rsp)
2389	lea		0x180($out),$out	# out+=64*6
2390	sub		\$384,$len		# len-=64*6
2391	vmovdqa		$xb3,0x20(%rsp)
2392	jmp		.Loop_tail8x
2393
2394.align	32
2395.L448_or_more8x:
2396	vpxor		0x00($inp),$xa0,$xa0	# xor with input
2397	vpxor		0x20($inp),$xb0,$xb0
2398	vpxor		0x40($inp),$xc0,$xc0
2399	vpxor		0x60($inp),$xd0,$xd0
2400	vpxor		0x80($inp),$xa1,$xa1
2401	vpxor		0xa0($inp),$xb1,$xb1
2402	vpxor		0xc0($inp),$xc1,$xc1
2403	vpxor		0xe0($inp),$xd1,$xd1
2404	vpxor		0x100($inp),$xa2,$xa2
2405	vpxor		0x120($inp),$xb2,$xb2
2406	vpxor		0x140($inp),$xc2,$xc2
2407	vpxor		0x160($inp),$xd2,$xd2
2408	vpxor		0x180($inp),$xa3,$xa3
2409	vpxor		0x1a0($inp),$xb3,$xb3
2410	vmovdqu		$xa0,0x00($out)
2411	vmovdqu		$xb0,0x20($out)
2412	vmovdqu		$xc0,0x40($out)
2413	vmovdqu		$xd0,0x60($out)
2414	vmovdqu		$xa1,0x80($out)
2415	vmovdqu		$xb1,0xa0($out)
2416	vmovdqu		$xc1,0xc0($out)
2417	vmovdqu		$xd1,0xe0($out)
2418	vmovdqu		$xa2,0x100($out)
2419	vmovdqu		$xb2,0x120($out)
2420	vmovdqu		$xc2,0x140($out)
2421	vmovdqu		$xd2,0x160($out)
2422	vmovdqu		$xa3,0x180($out)
2423	vmovdqu		$xb3,0x1a0($out)
2424	je		.Ldone8x
2425
2426	lea		0x1c0($inp),$inp	# inp+=64*7
2427	xor		%r10,%r10
2428	vmovdqa		$xc3,0x00(%rsp)
2429	lea		0x1c0($out),$out	# out+=64*7
2430	sub		\$448,$len		# len-=64*7
2431	vmovdqa		$xd3,0x20(%rsp)
2432
2433.Loop_tail8x:
2434	movzb		($inp,%r10),%eax
2435	movzb		(%rsp,%r10),%ecx
2436	lea		1(%r10),%r10
2437	xor		%ecx,%eax
2438	mov		%al,-1($out,%r10)
2439	dec		$len
2440	jnz		.Loop_tail8x
2441
2442.Ldone8x:
2443	vzeroall
2444___
2445$code.=<<___	if ($win64);
2446	movaps		-0xa8(%r9),%xmm6
2447	movaps		-0x98(%r9),%xmm7
2448	movaps		-0x88(%r9),%xmm8
2449	movaps		-0x78(%r9),%xmm9
2450	movaps		-0x68(%r9),%xmm10
2451	movaps		-0x58(%r9),%xmm11
2452	movaps		-0x48(%r9),%xmm12
2453	movaps		-0x38(%r9),%xmm13
2454	movaps		-0x28(%r9),%xmm14
2455	movaps		-0x18(%r9),%xmm15
2456___
2457$code.=<<___;
2458	lea		(%r9),%rsp
2459.cfi_def_cfa_register	%rsp
2460.L8x_epilogue:
2461	ret
2462.cfi_endproc
2463.size	ChaCha20_8x,.-ChaCha20_8x
2464___
2465}
2466
2467########################################################################
2468# AVX512 code paths
2469if ($avx>2) {
2470# This one handles shorter inputs...
2471
2472my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2473my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
2474
2475sub vpxord()		# size optimization
2476{ my $opcode = "vpxor";	# adhere to vpxor when possible
2477
2478    foreach (@_) {
2479	if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
2480	    $opcode = "vpxord";
2481	    last;
2482	}
2483    }
2484
2485    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
2486}
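# Examples: &vpxord("%xmm4","%xmm4","%xmm5") emits "vpxor" (shorter VEX
# encoding), while any %zmm operand or a register number >=16, e.g.
# &vpxord("%ymm20","%ymm20","%ymm21"), forces the EVEX-only "vpxord".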
2487
2488sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
2489	&vpaddd	($a,$a,$b);
2490	&vpxord	($d,$d,$a);
2491	&vprold	($d,$d,16);
2492
2493	&vpaddd	($c,$c,$d);
2494	&vpxord	($b,$b,$c);
2495	&vprold	($b,$b,12);
2496
2497	&vpaddd	($a,$a,$b);
2498	&vpxord	($d,$d,$a);
2499	&vprold	($d,$d,8);
2500
2501	&vpaddd	($c,$c,$d);
2502	&vpxord	($b,$b,$c);
2503	&vprold	($b,$b,7);
2504}
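# vprold does in one instruction what the AVX2 paths above need either a
# vpshufb byte-shuffle (rotates by 16 and 8) or a vpslld+vpsrld+vpor
# triplet (rotates by 12 and 7) for, e.g.:
#	&vprold	($b,$b,12);	# vs. &vpslld($t,$b,12); &vpsrld($b,$b,20); &vpor($b,$t,$b);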
2505
2506my $xframe = $win64 ? 160+8 : 8;
2507
2508$code.=<<___;
2509.type	ChaCha20_avx512,\@function,5
2510.align	32
2511ChaCha20_avx512:
2512.cfi_startproc
2513.LChaCha20_avx512:
2514	mov	%rsp,%r9		# frame pointer
2515.cfi_def_cfa_register	%r9
2516	cmp	\$512,$len
2517	ja	.LChaCha20_16x
2518
2519	sub	\$64+$xframe,%rsp
2520___
2521$code.=<<___	if ($win64);
2522	movaps	%xmm6,-0xa8(%r9)
2523	movaps	%xmm7,-0x98(%r9)
2524	movaps	%xmm8,-0x88(%r9)
2525	movaps	%xmm9,-0x78(%r9)
2526	movaps	%xmm10,-0x68(%r9)
2527	movaps	%xmm11,-0x58(%r9)
2528	movaps	%xmm12,-0x48(%r9)
2529	movaps	%xmm13,-0x38(%r9)
2530	movaps	%xmm14,-0x28(%r9)
2531	movaps	%xmm15,-0x18(%r9)
2532.Lavx512_body:
2533___
2534$code.=<<___;
2535	vbroadcasti32x4	.Lsigma(%rip),$a
2536	vbroadcasti32x4	($key),$b
2537	vbroadcasti32x4	16($key),$c
2538	vbroadcasti32x4	($counter),$d
2539
2540	vmovdqa32	$a,$a_
2541	vmovdqa32	$b,$b_
2542	vmovdqa32	$c,$c_
2543	vpaddd		.Lzeroz(%rip),$d,$d
2544	vmovdqa32	.Lfourz(%rip),$fourz
2545	mov		\$10,$counter	# reuse $counter
2546	vmovdqa32	$d,$d_
2547	jmp		.Loop_avx512
2548
2549.align	16
2550.Loop_outer_avx512:
2551	vmovdqa32	$a_,$a
2552	vmovdqa32	$b_,$b
2553	vmovdqa32	$c_,$c
2554	vpaddd		$fourz,$d_,$d
2555	mov		\$10,$counter
2556	vmovdqa32	$d,$d_
2557	jmp		.Loop_avx512
2558
2559.align	32
2560.Loop_avx512:
2561___
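	# One AVX512ROUND processes whole columns element-wise; the vpshufd
	# shuffles below rotate b, c and d within each 128-bit lane so the
	# second AVX512ROUND hits the diagonals, and the mirrored shuffles
	# afterwards restore column order.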
2562	&AVX512ROUND();
2563	&vpshufd	($c,$c,0b01001110);
2564	&vpshufd	($b,$b,0b00111001);
2565	&vpshufd	($d,$d,0b10010011);
2566
2567	&AVX512ROUND();
2568	&vpshufd	($c,$c,0b01001110);
2569	&vpshufd	($b,$b,0b10010011);
2570	&vpshufd	($d,$d,0b00111001);
2571
2572	&dec		($counter);
2573	&jnz		(".Loop_avx512");
2574
2575$code.=<<___;
2576	vpaddd		$a_,$a,$a
2577	vpaddd		$b_,$b,$b
2578	vpaddd		$c_,$c,$c
2579	vpaddd		$d_,$d,$d
2580
2581	sub		\$64,$len
2582	jb		.Ltail64_avx512
2583
2584	vpxor		0x00($inp),%x#$a,$t0	# xor with input
2585	vpxor		0x10($inp),%x#$b,$t1
2586	vpxor		0x20($inp),%x#$c,$t2
2587	vpxor		0x30($inp),%x#$d,$t3
2588	lea		0x40($inp),$inp		# inp+=64
2589
2590	vmovdqu		$t0,0x00($out)		# write output
2591	vmovdqu		$t1,0x10($out)
2592	vmovdqu		$t2,0x20($out)
2593	vmovdqu		$t3,0x30($out)
2594	lea		0x40($out),$out		# out+=64
2595
2596	jz		.Ldone_avx512
2597
2598	vextracti32x4	\$1,$a,$t0
2599	vextracti32x4	\$1,$b,$t1
2600	vextracti32x4	\$1,$c,$t2
2601	vextracti32x4	\$1,$d,$t3
2602
2603	sub		\$64,$len
2604	jb		.Ltail_avx512
2605
2606	vpxor		0x00($inp),$t0,$t0	# xor with input
2607	vpxor		0x10($inp),$t1,$t1
2608	vpxor		0x20($inp),$t2,$t2
2609	vpxor		0x30($inp),$t3,$t3
2610	lea		0x40($inp),$inp		# inp+=64
2611
2612	vmovdqu		$t0,0x00($out)		# write output
2613	vmovdqu		$t1,0x10($out)
2614	vmovdqu		$t2,0x20($out)
2615	vmovdqu		$t3,0x30($out)
2616	lea		0x40($out),$out		# out+=64
2617
2618	jz		.Ldone_avx512
2619
2620	vextracti32x4	\$2,$a,$t0
2621	vextracti32x4	\$2,$b,$t1
2622	vextracti32x4	\$2,$c,$t2
2623	vextracti32x4	\$2,$d,$t3
2624
2625	sub		\$64,$len
2626	jb		.Ltail_avx512
2627
2628	vpxor		0x00($inp),$t0,$t0	# xor with input
2629	vpxor		0x10($inp),$t1,$t1
2630	vpxor		0x20($inp),$t2,$t2
2631	vpxor		0x30($inp),$t3,$t3
2632	lea		0x40($inp),$inp		# inp+=64
2633
2634	vmovdqu		$t0,0x00($out)		# write output
2635	vmovdqu		$t1,0x10($out)
2636	vmovdqu		$t2,0x20($out)
2637	vmovdqu		$t3,0x30($out)
2638	lea		0x40($out),$out		# out+=64
2639
2640	jz		.Ldone_avx512
2641
2642	vextracti32x4	\$3,$a,$t0
2643	vextracti32x4	\$3,$b,$t1
2644	vextracti32x4	\$3,$c,$t2
2645	vextracti32x4	\$3,$d,$t3
2646
2647	sub		\$64,$len
2648	jb		.Ltail_avx512
2649
2650	vpxor		0x00($inp),$t0,$t0	# xor with input
2651	vpxor		0x10($inp),$t1,$t1
2652	vpxor		0x20($inp),$t2,$t2
2653	vpxor		0x30($inp),$t3,$t3
2654	lea		0x40($inp),$inp		# inp+=64
2655
2656	vmovdqu		$t0,0x00($out)		# write output
2657	vmovdqu		$t1,0x10($out)
2658	vmovdqu		$t2,0x20($out)
2659	vmovdqu		$t3,0x30($out)
2660	lea		0x40($out),$out		# out+=64
2661
2662	jnz		.Loop_outer_avx512
2663
2664	jmp		.Ldone_avx512
2665
2666.align	16
2667.Ltail64_avx512:
2668	vmovdqa		%x#$a,0x00(%rsp)
2669	vmovdqa		%x#$b,0x10(%rsp)
2670	vmovdqa		%x#$c,0x20(%rsp)
2671	vmovdqa		%x#$d,0x30(%rsp)
2672	add		\$64,$len
2673	jmp		.Loop_tail_avx512
2674
2675.align	16
2676.Ltail_avx512:
2677	vmovdqa		$t0,0x00(%rsp)
2678	vmovdqa		$t1,0x10(%rsp)
2679	vmovdqa		$t2,0x20(%rsp)
2680	vmovdqa		$t3,0x30(%rsp)
2681	add		\$64,$len
2682
2683.Loop_tail_avx512:
2684	movzb		($inp,$counter),%eax
2685	movzb		(%rsp,$counter),%ecx
2686	lea		1($counter),$counter
2687	xor		%ecx,%eax
2688	mov		%al,-1($out,$counter)
2689	dec		$len
2690	jnz		.Loop_tail_avx512
2691
2692	vmovdqu32	$a_,0x00(%rsp)		# wipe keystream copy off the stack
2693
2694.Ldone_avx512:
2695	vzeroall
2696___
2697$code.=<<___	if ($win64);
2698	movaps	-0xa8(%r9),%xmm6
2699	movaps	-0x98(%r9),%xmm7
2700	movaps	-0x88(%r9),%xmm8
2701	movaps	-0x78(%r9),%xmm9
2702	movaps	-0x68(%r9),%xmm10
2703	movaps	-0x58(%r9),%xmm11
2704	movaps	-0x48(%r9),%xmm12
2705	movaps	-0x38(%r9),%xmm13
2706	movaps	-0x28(%r9),%xmm14
2707	movaps	-0x18(%r9),%xmm15
2708___
2709$code.=<<___;
2710	lea	(%r9),%rsp
2711.cfi_def_cfa_register	%rsp
2712.Lavx512_epilogue:
2713	ret
2714.cfi_endproc
2715.size	ChaCha20_avx512,.-ChaCha20_avx512
2716___
2717
2718map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
2719
2720$code.=<<___;
2721.type	ChaCha20_avx512vl,\@function,5
2722.align	32
2723ChaCha20_avx512vl:
2724.cfi_startproc
2725.LChaCha20_avx512vl:
2726	mov	%rsp,%r9		# frame pointer
2727.cfi_def_cfa_register	%r9
2728	cmp	\$128,$len
2729	ja	.LChaCha20_8xvl
2730
2731	sub	\$64+$xframe,%rsp
2732___
2733$code.=<<___	if ($win64);
2734	movaps	%xmm6,-0xa8(%r9)
2735	movaps	%xmm7,-0x98(%r9)
2736	movaps	%xmm8,-0x88(%r9)
2737	movaps	%xmm9,-0x78(%r9)
2738	movaps	%xmm10,-0x68(%r9)
2739	movaps	%xmm11,-0x58(%r9)
2740	movaps	%xmm12,-0x48(%r9)
2741	movaps	%xmm13,-0x38(%r9)
2742	movaps	%xmm14,-0x28(%r9)
2743	movaps	%xmm15,-0x18(%r9)
2744.Lavx512vl_body:
2745___
2746$code.=<<___;
2747	vbroadcasti128	.Lsigma(%rip),$a
2748	vbroadcasti128	($key),$b
2749	vbroadcasti128	16($key),$c
2750	vbroadcasti128	($counter),$d
2751
2752	vmovdqa32	$a,$a_
2753	vmovdqa32	$b,$b_
2754	vmovdqa32	$c,$c_
2755	vpaddd		.Lzeroz(%rip),$d,$d
2756	vmovdqa32	.Ltwoy(%rip),$fourz
2757	mov		\$10,$counter	# reuse $counter
2758	vmovdqa32	$d,$d_
2759	jmp		.Loop_avx512vl
2760
2761.align	16
2762.Loop_outer_avx512vl:
2763	vmovdqa32	$c_,$c
2764	vpaddd		$fourz,$d_,$d
2765	mov		\$10,$counter
2766	vmovdqa32	$d,$d_
2767	jmp		.Loop_avx512vl
2768
2769.align	32
2770.Loop_avx512vl:
2771___
2772	&AVX512ROUND();
2773	&vpshufd	($c,$c,0b01001110);
2774	&vpshufd	($b,$b,0b00111001);
2775	&vpshufd	($d,$d,0b10010011);
2776
2777	&AVX512ROUND();
2778	&vpshufd	($c,$c,0b01001110);
2779	&vpshufd	($b,$b,0b10010011);
2780	&vpshufd	($d,$d,0b00111001);
2781
2782	&dec		($counter);
2783	&jnz		(".Loop_avx512vl");
2784
2785$code.=<<___;
2786	vpaddd		$a_,$a,$a
2787	vpaddd		$b_,$b,$b
2788	vpaddd		$c_,$c,$c
2789	vpaddd		$d_,$d,$d
2790
2791	sub		\$64,$len
2792	jb		.Ltail64_avx512vl
2793
2794	vpxor		0x00($inp),%x#$a,$t0	# xor with input
2795	vpxor		0x10($inp),%x#$b,$t1
2796	vpxor		0x20($inp),%x#$c,$t2
2797	vpxor		0x30($inp),%x#$d,$t3
2798	lea		0x40($inp),$inp		# inp+=64
2799
2800	vmovdqu		$t0,0x00($out)		# write output
2801	vmovdqu		$t1,0x10($out)
2802	vmovdqu		$t2,0x20($out)
2803	vmovdqu		$t3,0x30($out)
2804	lea		0x40($out),$out		# out+=64
2805
2806	jz		.Ldone_avx512vl
2807
2808	vextracti128	\$1,$a,$t0
2809	vextracti128	\$1,$b,$t1
2810	vextracti128	\$1,$c,$t2
2811	vextracti128	\$1,$d,$t3
2812
2813	sub		\$64,$len
2814	jb		.Ltail_avx512vl
2815
2816	vpxor		0x00($inp),$t0,$t0	# xor with input
2817	vpxor		0x10($inp),$t1,$t1
2818	vpxor		0x20($inp),$t2,$t2
2819	vpxor		0x30($inp),$t3,$t3
2820	lea		0x40($inp),$inp		# inp+=64
2821
2822	vmovdqu		$t0,0x00($out)		# write output
2823	vmovdqu		$t1,0x10($out)
2824	vmovdqu		$t2,0x20($out)
2825	vmovdqu		$t3,0x30($out)
2826	lea		0x40($out),$out		# out+=64
2827
2828	vmovdqa32	$a_,$a
2829	vmovdqa32	$b_,$b
2830	jnz		.Loop_outer_avx512vl
2831
2832	jmp		.Ldone_avx512vl
2833
2834.align	16
2835.Ltail64_avx512vl:
2836	vmovdqa		%x#$a,0x00(%rsp)
2837	vmovdqa		%x#$b,0x10(%rsp)
2838	vmovdqa		%x#$c,0x20(%rsp)
2839	vmovdqa		%x#$d,0x30(%rsp)
2840	add		\$64,$len
2841	jmp		.Loop_tail_avx512vl
2842
2843.align	16
2844.Ltail_avx512vl:
2845	vmovdqa		$t0,0x00(%rsp)
2846	vmovdqa		$t1,0x10(%rsp)
2847	vmovdqa		$t2,0x20(%rsp)
2848	vmovdqa		$t3,0x30(%rsp)
2849	add		\$64,$len
2850
2851.Loop_tail_avx512vl:
2852	movzb		($inp,$counter),%eax
2853	movzb		(%rsp,$counter),%ecx
2854	lea		1($counter),$counter
2855	xor		%ecx,%eax
2856	mov		%al,-1($out,$counter)
2857	dec		$len
2858	jnz		.Loop_tail_avx512vl
2859
2860	vmovdqu32	$a_,0x00(%rsp)		# wipe keystream copy off the stack
2861	vmovdqu32	$a_,0x20(%rsp)
2862
2863.Ldone_avx512vl:
2864	vzeroall
2865___
2866$code.=<<___	if ($win64);
2867	movaps	-0xa8(%r9),%xmm6
2868	movaps	-0x98(%r9),%xmm7
2869	movaps	-0x88(%r9),%xmm8
2870	movaps	-0x78(%r9),%xmm9
2871	movaps	-0x68(%r9),%xmm10
2872	movaps	-0x58(%r9),%xmm11
2873	movaps	-0x48(%r9),%xmm12
2874	movaps	-0x38(%r9),%xmm13
2875	movaps	-0x28(%r9),%xmm14
2876	movaps	-0x18(%r9),%xmm15
2877___
2878$code.=<<___;
2879	lea	(%r9),%rsp
2880.cfi_def_cfa_register	%rsp
2881.Lavx512vl_epilogue:
2882	ret
2883.cfi_endproc
2884.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl
2885___
2886}
2887if ($avx>2) {
2888# This one handles longer inputs...
2889
2890my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2891    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2892my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2893	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2894my @key=map("%zmm$_",(16..31));
2895my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
2896
2897sub AVX512_lane_ROUND {
2898my ($a0,$b0,$c0,$d0)=@_;
2899my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2900my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2901my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
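# The map() above keeps each index in its row of four and advances its
# position by one (mod 4), e.g. (0,4,8,12) -> (1,5,9,13) -> (2,6,10,14)
# -> (3,7,11,15) for the column round, and (0,5,10,15) -> (1,6,11,12) ->
# (2,7,8,13) -> (3,4,9,14) for the diagonal round.  Equivalent helper,
# shown for clarity only, not used:
sub lane_rotate_ref { map { ($_ & ~3) + (($_+1) & 3) } @_ }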
2902my @x=map("\"$_\"",@xx);
2903
2904	(
2905	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
2906	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
2907	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
2908	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
2909	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
2910	 "&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
2911	  "&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
2912	   "&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
2913	"&vprold	(@x[$d0],@x[$d0],16)",
2914	 "&vprold	(@x[$d1],@x[$d1],16)",
2915	  "&vprold	(@x[$d2],@x[$d2],16)",
2916	   "&vprold	(@x[$d3],@x[$d3],16)",
2917
2918	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
2919	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
2920	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
2921	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
2922	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
2923	 "&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
2924	  "&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
2925	   "&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
2926	"&vprold	(@x[$b0],@x[$b0],12)",
2927	 "&vprold	(@x[$b1],@x[$b1],12)",
2928	  "&vprold	(@x[$b2],@x[$b2],12)",
2929	   "&vprold	(@x[$b3],@x[$b3],12)",
2930
2931	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
2932	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
2933	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
2934	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
2935	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
2936	 "&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
2937	  "&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
2938	   "&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
2939	"&vprold	(@x[$d0],@x[$d0],8)",
2940	 "&vprold	(@x[$d1],@x[$d1],8)",
2941	  "&vprold	(@x[$d2],@x[$d2],8)",
2942	   "&vprold	(@x[$d3],@x[$d3],8)",
2943
2944	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
2945	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
2946	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
2947	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
2948	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
2949	 "&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
2950	  "&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
2951	   "&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
2952	"&vprold	(@x[$b0],@x[$b0],7)",
2953	 "&vprold	(@x[$b1],@x[$b1],7)",
2954	  "&vprold	(@x[$b2],@x[$b2],7)",
2955	   "&vprold	(@x[$b3],@x[$b3],7)"
2956	);
2957}
2958
2959my $xframe = $win64 ? 0xa8 : 8;
2960
2961$code.=<<___;
2962.type	ChaCha20_16x,\@function,5
2963.align	32
2964ChaCha20_16x:
2965.cfi_startproc
2966.LChaCha20_16x:
2967	mov		%rsp,%r9		# frame register
2968.cfi_def_cfa_register	%r9
2969	sub		\$64+$xframe,%rsp
2970	and		\$-64,%rsp
2971___
2972$code.=<<___	if ($win64);
2973	movaps		%xmm6,-0xa8(%r9)
2974	movaps		%xmm7,-0x98(%r9)
2975	movaps		%xmm8,-0x88(%r9)
2976	movaps		%xmm9,-0x78(%r9)
2977	movaps		%xmm10,-0x68(%r9)
2978	movaps		%xmm11,-0x58(%r9)
2979	movaps		%xmm12,-0x48(%r9)
2980	movaps		%xmm13,-0x38(%r9)
2981	movaps		%xmm14,-0x28(%r9)
2982	movaps		%xmm15,-0x18(%r9)
2983.L16x_body:
2984___
2985$code.=<<___;
2986	vzeroupper
2987
2988	lea		.Lsigma(%rip),%r10
2989	vbroadcasti32x4	(%r10),$xa3		# key[0]
2990	vbroadcasti32x4	($key),$xb3		# key[1]
2991	vbroadcasti32x4	16($key),$xc3		# key[2]
2992	vbroadcasti32x4	($counter),$xd3		# key[3]
2993
2994	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
2995	vpshufd		\$0x55,$xa3,$xa1
2996	vpshufd		\$0xaa,$xa3,$xa2
2997	vpshufd		\$0xff,$xa3,$xa3
2998	vmovdqa64	$xa0,@key[0]
2999	vmovdqa64	$xa1,@key[1]
3000	vmovdqa64	$xa2,@key[2]
3001	vmovdqa64	$xa3,@key[3]
3002
3003	vpshufd		\$0x00,$xb3,$xb0
3004	vpshufd		\$0x55,$xb3,$xb1
3005	vpshufd		\$0xaa,$xb3,$xb2
3006	vpshufd		\$0xff,$xb3,$xb3
3007	vmovdqa64	$xb0,@key[4]
3008	vmovdqa64	$xb1,@key[5]
3009	vmovdqa64	$xb2,@key[6]
3010	vmovdqa64	$xb3,@key[7]
3011
3012	vpshufd		\$0x00,$xc3,$xc0
3013	vpshufd		\$0x55,$xc3,$xc1
3014	vpshufd		\$0xaa,$xc3,$xc2
3015	vpshufd		\$0xff,$xc3,$xc3
3016	vmovdqa64	$xc0,@key[8]
3017	vmovdqa64	$xc1,@key[9]
3018	vmovdqa64	$xc2,@key[10]
3019	vmovdqa64	$xc3,@key[11]
3020
3021	vpshufd		\$0x00,$xd3,$xd0
3022	vpshufd		\$0x55,$xd3,$xd1
3023	vpshufd		\$0xaa,$xd3,$xd2
3024	vpshufd		\$0xff,$xd3,$xd3
3025	vpaddd		.Lincz(%rip),$xd0,$xd0	# don't save counters yet
3026	vmovdqa64	$xd0,@key[12]
3027	vmovdqa64	$xd1,@key[13]
3028	vmovdqa64	$xd2,@key[14]
3029	vmovdqa64	$xd3,@key[15]
3030
3031	mov		\$10,%eax
3032	jmp		.Loop16x
3033
3034.align	32
3035.Loop_outer16x:
3036	vpbroadcastd	0(%r10),$xa0		# reload key
3037	vpbroadcastd	4(%r10),$xa1
3038	vpbroadcastd	8(%r10),$xa2
3039	vpbroadcastd	12(%r10),$xa3
3040	vpaddd		.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
3041	vmovdqa64	@key[4],$xb0
3042	vmovdqa64	@key[5],$xb1
3043	vmovdqa64	@key[6],$xb2
3044	vmovdqa64	@key[7],$xb3
3045	vmovdqa64	@key[8],$xc0
3046	vmovdqa64	@key[9],$xc1
3047	vmovdqa64	@key[10],$xc2
3048	vmovdqa64	@key[11],$xc3
3049	vmovdqa64	@key[12],$xd0
3050	vmovdqa64	@key[13],$xd1
3051	vmovdqa64	@key[14],$xd2
3052	vmovdqa64	@key[15],$xd3
3053
3054	vmovdqa64	$xa0,@key[0]
3055	vmovdqa64	$xa1,@key[1]
3056	vmovdqa64	$xa2,@key[2]
3057	vmovdqa64	$xa3,@key[3]
3058
3059	mov		\$10,%eax
3060	jmp		.Loop16x
3061
3062.align	32
3063.Loop16x:
3064___
3065	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3066	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
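# Same column/diagonal schedule as the 8x AVX2 loop above, but on %zmm
# registers, i.e. 16 independent 64-byte blocks per iteration.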
3067$code.=<<___;
3068	dec		%eax
3069	jnz		.Loop16x
3070
3071	vpaddd		@key[0],$xa0,$xa0	# accumulate key
3072	vpaddd		@key[1],$xa1,$xa1
3073	vpaddd		@key[2],$xa2,$xa2
3074	vpaddd		@key[3],$xa3,$xa3
3075
3076	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
3077	vpunpckldq	$xa3,$xa2,$xt3
3078	vpunpckhdq	$xa1,$xa0,$xa0
3079	vpunpckhdq	$xa3,$xa2,$xa2
3080	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
3081	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
3082	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
3083	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
3084___
3085	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3086$code.=<<___;
3087	vpaddd		@key[4],$xb0,$xb0
3088	vpaddd		@key[5],$xb1,$xb1
3089	vpaddd		@key[6],$xb2,$xb2
3090	vpaddd		@key[7],$xb3,$xb3
3091
3092	vpunpckldq	$xb1,$xb0,$xt2
3093	vpunpckldq	$xb3,$xb2,$xt3
3094	vpunpckhdq	$xb1,$xb0,$xb0
3095	vpunpckhdq	$xb3,$xb2,$xb2
3096	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
3097	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
3098	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
3099	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
3100___
3101	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3102$code.=<<___;
3103	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
3104	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
3105	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
3106	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
3107	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
3108	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
3109	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
3110	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
3111___
3112	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3113$code.=<<___;
3114	vpaddd		@key[8],$xc0,$xc0
3115	vpaddd		@key[9],$xc1,$xc1
3116	vpaddd		@key[10],$xc2,$xc2
3117	vpaddd		@key[11],$xc3,$xc3
3118
3119	vpunpckldq	$xc1,$xc0,$xt2
3120	vpunpckldq	$xc3,$xc2,$xt3
3121	vpunpckhdq	$xc1,$xc0,$xc0
3122	vpunpckhdq	$xc3,$xc2,$xc2
3123	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
3124	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
3125	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
3126	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
3127___
3128	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3129$code.=<<___;
3130	vpaddd		@key[12],$xd0,$xd0
3131	vpaddd		@key[13],$xd1,$xd1
3132	vpaddd		@key[14],$xd2,$xd2
3133	vpaddd		@key[15],$xd3,$xd3
3134
3135	vpunpckldq	$xd1,$xd0,$xt2
3136	vpunpckldq	$xd3,$xd2,$xt3
3137	vpunpckhdq	$xd1,$xd0,$xd0
3138	vpunpckhdq	$xd3,$xd2,$xd2
3139	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
3140	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
3141	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
3142	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
3143___
3144	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3145$code.=<<___;
3146	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
3147	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
3148	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
3149	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
3150	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
3151	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
3152	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
3153	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
3154___
3155	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3156$code.=<<___;
3157	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
3158	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
3159	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
3160	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
3161	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
3162	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
3163	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
3164	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
3165	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
3166	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
3167	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
3168	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
3169	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
3170	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
3171	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
3172	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
3173___
3174	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
3175	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
3176
3177	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
3178	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
3179	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3180	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3181$code.=<<___;
3182	cmp		\$64*16,$len
3183	jb		.Ltail16x
3184
3185	vpxord		0x00($inp),$xa0,$xa0	# xor with input
3186	vpxord		0x40($inp),$xb0,$xb0
3187	vpxord		0x80($inp),$xc0,$xc0
3188	vpxord		0xc0($inp),$xd0,$xd0
3189	vmovdqu32	$xa0,0x00($out)
3190	vmovdqu32	$xb0,0x40($out)
3191	vmovdqu32	$xc0,0x80($out)
3192	vmovdqu32	$xd0,0xc0($out)
3193
3194	vpxord		0x100($inp),$xa1,$xa1
3195	vpxord		0x140($inp),$xb1,$xb1
3196	vpxord		0x180($inp),$xc1,$xc1
3197	vpxord		0x1c0($inp),$xd1,$xd1
3198	vmovdqu32	$xa1,0x100($out)
3199	vmovdqu32	$xb1,0x140($out)
3200	vmovdqu32	$xc1,0x180($out)
3201	vmovdqu32	$xd1,0x1c0($out)
3202
3203	vpxord		0x200($inp),$xa2,$xa2
3204	vpxord		0x240($inp),$xb2,$xb2
3205	vpxord		0x280($inp),$xc2,$xc2
3206	vpxord		0x2c0($inp),$xd2,$xd2
3207	vmovdqu32	$xa2,0x200($out)
3208	vmovdqu32	$xb2,0x240($out)
3209	vmovdqu32	$xc2,0x280($out)
3210	vmovdqu32	$xd2,0x2c0($out)
3211
3212	vpxord		0x300($inp),$xa3,$xa3
3213	vpxord		0x340($inp),$xb3,$xb3
3214	vpxord		0x380($inp),$xc3,$xc3
3215	vpxord		0x3c0($inp),$xd3,$xd3
3216	lea		0x400($inp),$inp
3217	vmovdqu32	$xa3,0x300($out)
3218	vmovdqu32	$xb3,0x340($out)
3219	vmovdqu32	$xc3,0x380($out)
3220	vmovdqu32	$xd3,0x3c0($out)
3221	lea		0x400($out),$out
3222
3223	sub		\$64*16,$len
3224	jnz		.Loop_outer16x
3225
3226	jmp		.Ldone16x
3227
3228.align	32
3229.Ltail16x:
3230	xor		%r10,%r10
3231	sub		$inp,$out		# out -= inp, so out+inp addresses the output
3232	cmp		\$64*1,$len
3233	jb		.Less_than_64_16x
3234	vpxord		($inp),$xa0,$xa0	# xor with input
3235	vmovdqu32	$xa0,($out,$inp)
3236	je		.Ldone16x
3237	vmovdqa32	$xb0,$xa0
3238	lea		64($inp),$inp
3239
3240	cmp		\$64*2,$len
3241	jb		.Less_than_64_16x
3242	vpxord		($inp),$xb0,$xb0
3243	vmovdqu32	$xb0,($out,$inp)
3244	je		.Ldone16x
3245	vmovdqa32	$xc0,$xa0
3246	lea		64($inp),$inp
3247
3248	cmp		\$64*3,$len
3249	jb		.Less_than_64_16x
3250	vpxord		($inp),$xc0,$xc0
3251	vmovdqu32	$xc0,($out,$inp)
3252	je		.Ldone16x
3253	vmovdqa32	$xd0,$xa0
3254	lea		64($inp),$inp
3255
3256	cmp		\$64*4,$len
3257	jb		.Less_than_64_16x
3258	vpxord		($inp),$xd0,$xd0
3259	vmovdqu32	$xd0,($out,$inp)
3260	je		.Ldone16x
3261	vmovdqa32	$xa1,$xa0
3262	lea		64($inp),$inp
3263
3264	cmp		\$64*5,$len
3265	jb		.Less_than_64_16x
3266	vpxord		($inp),$xa1,$xa1
3267	vmovdqu32	$xa1,($out,$inp)
3268	je		.Ldone16x
3269	vmovdqa32	$xb1,$xa0
3270	lea		64($inp),$inp
3271
3272	cmp		\$64*6,$len
3273	jb		.Less_than_64_16x
3274	vpxord		($inp),$xb1,$xb1
3275	vmovdqu32	$xb1,($out,$inp)
3276	je		.Ldone16x
3277	vmovdqa32	$xc1,$xa0
3278	lea		64($inp),$inp
3279
3280	cmp		\$64*7,$len
3281	jb		.Less_than_64_16x
3282	vpxord		($inp),$xc1,$xc1
3283	vmovdqu32	$xc1,($out,$inp)
3284	je		.Ldone16x
3285	vmovdqa32	$xd1,$xa0
3286	lea		64($inp),$inp
3287
3288	cmp		\$64*8,$len
3289	jb		.Less_than_64_16x
3290	vpxord		($inp),$xd1,$xd1
3291	vmovdqu32	$xd1,($out,$inp)
3292	je		.Ldone16x
3293	vmovdqa32	$xa2,$xa0
3294	lea		64($inp),$inp
3295
3296	cmp		\$64*9,$len
3297	jb		.Less_than_64_16x
3298	vpxord		($inp),$xa2,$xa2
3299	vmovdqu32	$xa2,($out,$inp)
3300	je		.Ldone16x
3301	vmovdqa32	$xb2,$xa0
3302	lea		64($inp),$inp
3303
3304	cmp		\$64*10,$len
3305	jb		.Less_than_64_16x
3306	vpxord		($inp),$xb2,$xb2
3307	vmovdqu32	$xb2,($out,$inp)
3308	je		.Ldone16x
3309	vmovdqa32	$xc2,$xa0
3310	lea		64($inp),$inp
3311
3312	cmp		\$64*11,$len
3313	jb		.Less_than_64_16x
3314	vpxord		($inp),$xc2,$xc2
3315	vmovdqu32	$xc2,($out,$inp)
3316	je		.Ldone16x
3317	vmovdqa32	$xd2,$xa0
3318	lea		64($inp),$inp
3319
3320	cmp		\$64*12,$len
3321	jb		.Less_than_64_16x
3322	vpxord		($inp),$xd2,$xd2
3323	vmovdqu32	$xd2,($out,$inp)
3324	je		.Ldone16x
3325	vmovdqa32	$xa3,$xa0
3326	lea		64($inp),$inp
3327
3328	cmp		\$64*13,$len
3329	jb		.Less_than_64_16x
3330	vpxord		($inp),$xa3,$xa3
3331	vmovdqu32	$xa3,($out,$inp)
3332	je		.Ldone16x
3333	vmovdqa32	$xb3,$xa0
3334	lea		64($inp),$inp
3335
3336	cmp		\$64*14,$len
3337	jb		.Less_than_64_16x
3338	vpxord		($inp),$xb3,$xb3
3339	vmovdqu32	$xb3,($out,$inp)
3340	je		.Ldone16x
3341	vmovdqa32	$xc3,$xa0
3342	lea		64($inp),$inp
3343
3344	cmp		\$64*15,$len
3345	jb		.Less_than_64_16x
3346	vpxord		($inp),$xc3,$xc3
3347	vmovdqu32	$xc3,($out,$inp)
3348	je		.Ldone16x
3349	vmovdqa32	$xd3,$xa0
3350	lea		64($inp),$inp
3351
3352.Less_than_64_16x:
3353	vmovdqa32	$xa0,0x00(%rsp)
3354	lea		($out,$inp),$out
3355	and		\$63,$len
3356
3357.Loop_tail16x:
3358	movzb		($inp,%r10),%eax
3359	movzb		(%rsp,%r10),%ecx
3360	lea		1(%r10),%r10
3361	xor		%ecx,%eax
3362	mov		%al,-1($out,%r10)
3363	dec		$len
3364	jnz		.Loop_tail16x
3365
3366	vpxord		$xa0,$xa0,$xa0		# wipe keystream copy off the stack
3367	vmovdqa32	$xa0,0(%rsp)
3368
3369.Ldone16x:
3370	vzeroall
3371___
3372$code.=<<___	if ($win64);
3373	movaps		-0xa8(%r9),%xmm6
3374	movaps		-0x98(%r9),%xmm7
3375	movaps		-0x88(%r9),%xmm8
3376	movaps		-0x78(%r9),%xmm9
3377	movaps		-0x68(%r9),%xmm10
3378	movaps		-0x58(%r9),%xmm11
3379	movaps		-0x48(%r9),%xmm12
3380	movaps		-0x38(%r9),%xmm13
3381	movaps		-0x28(%r9),%xmm14
3382	movaps		-0x18(%r9),%xmm15
3383___
3384$code.=<<___;
3385	lea		(%r9),%rsp
3386.cfi_def_cfa_register	%rsp
3387.L16x_epilogue:
3388	ret
3389.cfi_endproc
3390.size	ChaCha20_16x,.-ChaCha20_16x
3391___
3392
3393# switch to %ymm domain
3394($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3395 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
3396@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3397     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3398@key=map("%ymm$_",(16..31));
3399($xt0,$xt1,$xt2,$xt3)=@key[0..3];
3400
3401$code.=<<___;
3402.type	ChaCha20_8xvl,\@function,5
3403.align	32
3404ChaCha20_8xvl:
3405.cfi_startproc
3406.LChaCha20_8xvl:
3407	mov		%rsp,%r9		# frame register
3408.cfi_def_cfa_register	%r9
3409	sub		\$64+$xframe,%rsp
3410	and		\$-64,%rsp
3411___
3412$code.=<<___	if ($win64);
3413	movaps		%xmm6,-0xa8(%r9)
3414	movaps		%xmm7,-0x98(%r9)
3415	movaps		%xmm8,-0x88(%r9)
3416	movaps		%xmm9,-0x78(%r9)
3417	movaps		%xmm10,-0x68(%r9)
3418	movaps		%xmm11,-0x58(%r9)
3419	movaps		%xmm12,-0x48(%r9)
3420	movaps		%xmm13,-0x38(%r9)
3421	movaps		%xmm14,-0x28(%r9)
3422	movaps		%xmm15,-0x18(%r9)
3423.L8xvl_body:
3424___
3425$code.=<<___;
3426	vzeroupper
3427
3428	lea		.Lsigma(%rip),%r10
3429	vbroadcasti128	(%r10),$xa3		# key[0]
3430	vbroadcasti128	($key),$xb3		# key[1]
3431	vbroadcasti128	16($key),$xc3		# key[2]
3432	vbroadcasti128	($counter),$xd3		# key[3]
3433
3434	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
3435	vpshufd		\$0x55,$xa3,$xa1
3436	vpshufd		\$0xaa,$xa3,$xa2
3437	vpshufd		\$0xff,$xa3,$xa3
3438	vmovdqa64	$xa0,@key[0]
3439	vmovdqa64	$xa1,@key[1]
3440	vmovdqa64	$xa2,@key[2]
3441	vmovdqa64	$xa3,@key[3]
3442
3443	vpshufd		\$0x00,$xb3,$xb0
3444	vpshufd		\$0x55,$xb3,$xb1
3445	vpshufd		\$0xaa,$xb3,$xb2
3446	vpshufd		\$0xff,$xb3,$xb3
3447	vmovdqa64	$xb0,@key[4]
3448	vmovdqa64	$xb1,@key[5]
3449	vmovdqa64	$xb2,@key[6]
3450	vmovdqa64	$xb3,@key[7]
3451
3452	vpshufd		\$0x00,$xc3,$xc0
3453	vpshufd		\$0x55,$xc3,$xc1
3454	vpshufd		\$0xaa,$xc3,$xc2
3455	vpshufd		\$0xff,$xc3,$xc3
3456	vmovdqa64	$xc0,@key[8]
3457	vmovdqa64	$xc1,@key[9]
3458	vmovdqa64	$xc2,@key[10]
3459	vmovdqa64	$xc3,@key[11]
3460
3461	vpshufd		\$0x00,$xd3,$xd0
3462	vpshufd		\$0x55,$xd3,$xd1
3463	vpshufd		\$0xaa,$xd3,$xd2
3464	vpshufd		\$0xff,$xd3,$xd3
3465	vpaddd		.Lincy(%rip),$xd0,$xd0	# don't save counters yet
3466	vmovdqa64	$xd0,@key[12]
3467	vmovdqa64	$xd1,@key[13]
3468	vmovdqa64	$xd2,@key[14]
3469	vmovdqa64	$xd3,@key[15]
3470
3471	mov		\$10,%eax
3472	jmp		.Loop8xvl
3473
3474.align	32
3475.Loop_outer8xvl:
3476	#vpbroadcastd	0(%r10),$xa0		# reload key; first two words are
3477	#vpbroadcastd	4(%r10),$xa1		# re-broadcast at the bottom of the loop
3478	vpbroadcastd	8(%r10),$xa2
3479	vpbroadcastd	12(%r10),$xa3
3480	vpaddd		.Leight(%rip),@key[12],@key[12]	# next SIMD counters
3481	vmovdqa64	@key[4],$xb0
3482	vmovdqa64	@key[5],$xb1
3483	vmovdqa64	@key[6],$xb2
3484	vmovdqa64	@key[7],$xb3
3485	vmovdqa64	@key[8],$xc0
3486	vmovdqa64	@key[9],$xc1
3487	vmovdqa64	@key[10],$xc2
3488	vmovdqa64	@key[11],$xc3
3489	vmovdqa64	@key[12],$xd0
3490	vmovdqa64	@key[13],$xd1
3491	vmovdqa64	@key[14],$xd2
3492	vmovdqa64	@key[15],$xd3
3493
3494	vmovdqa64	$xa0,@key[0]
3495	vmovdqa64	$xa1,@key[1]
3496	vmovdqa64	$xa2,@key[2]
3497	vmovdqa64	$xa3,@key[3]
3498
3499	mov		\$10,%eax
3500	jmp		.Loop8xvl
3501
3502.align	32
3503.Loop8xvl:
3504___
3505	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3506	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3507$code.=<<___;
3508	dec		%eax
3509	jnz		.Loop8xvl
3510
3511	vpaddd		@key[0],$xa0,$xa0	# accumulate key
3512	vpaddd		@key[1],$xa1,$xa1
3513	vpaddd		@key[2],$xa2,$xa2
3514	vpaddd		@key[3],$xa3,$xa3
3515
3516	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
3517	vpunpckldq	$xa3,$xa2,$xt3
3518	vpunpckhdq	$xa1,$xa0,$xa0
3519	vpunpckhdq	$xa3,$xa2,$xa2
3520	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
3521	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
3522	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
3523	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
3524___
3525	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3526$code.=<<___;
3527	vpaddd		@key[4],$xb0,$xb0
3528	vpaddd		@key[5],$xb1,$xb1
3529	vpaddd		@key[6],$xb2,$xb2
3530	vpaddd		@key[7],$xb3,$xb3
3531
3532	vpunpckldq	$xb1,$xb0,$xt2
3533	vpunpckldq	$xb3,$xb2,$xt3
3534	vpunpckhdq	$xb1,$xb0,$xb0
3535	vpunpckhdq	$xb3,$xb2,$xb2
3536	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
3537	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
3538	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
3539	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
3540___
3541	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3542$code.=<<___;
3543	vshufi32x4	\$0,$xb0,$xa0,$xt3	# "de-interlace" further
3544	vshufi32x4	\$3,$xb0,$xa0,$xb0
3545	vshufi32x4	\$0,$xb1,$xa1,$xa0
3546	vshufi32x4	\$3,$xb1,$xa1,$xb1
3547	vshufi32x4	\$0,$xb2,$xa2,$xa1
3548	vshufi32x4	\$3,$xb2,$xa2,$xb2
3549	vshufi32x4	\$0,$xb3,$xa3,$xa2
3550	vshufi32x4	\$3,$xb3,$xa3,$xb3
3551___
3552	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3553$code.=<<___;
3554	vpaddd		@key[8],$xc0,$xc0
3555	vpaddd		@key[9],$xc1,$xc1
3556	vpaddd		@key[10],$xc2,$xc2
3557	vpaddd		@key[11],$xc3,$xc3
3558
3559	vpunpckldq	$xc1,$xc0,$xt2
3560	vpunpckldq	$xc3,$xc2,$xt3
3561	vpunpckhdq	$xc1,$xc0,$xc0
3562	vpunpckhdq	$xc3,$xc2,$xc2
3563	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
3564	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
3565	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
3566	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
3567___
3568	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3569$code.=<<___;
3570	vpaddd		@key[12],$xd0,$xd0
3571	vpaddd		@key[13],$xd1,$xd1
3572	vpaddd		@key[14],$xd2,$xd2
3573	vpaddd		@key[15],$xd3,$xd3
3574
3575	vpunpckldq	$xd1,$xd0,$xt2
3576	vpunpckldq	$xd3,$xd2,$xt3
3577	vpunpckhdq	$xd1,$xd0,$xd0
3578	vpunpckhdq	$xd3,$xd2,$xd2
3579	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
3580	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
3581	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
3582	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
3583___
3584	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3585$code.=<<___;
3586	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
3587	vperm2i128	\$0x31,$xd0,$xc0,$xd0
3588	vperm2i128	\$0x20,$xd1,$xc1,$xc0
3589	vperm2i128	\$0x31,$xd1,$xc1,$xd1
3590	vperm2i128	\$0x20,$xd2,$xc2,$xc1
3591	vperm2i128	\$0x31,$xd2,$xc2,$xd2
3592	vperm2i128	\$0x20,$xd3,$xc3,$xc2
3593	vperm2i128	\$0x31,$xd3,$xc3,$xd3
3594___
3595	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3596	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
3597	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
3598$code.=<<___;
3599	cmp		\$64*8,$len
3600	jb		.Ltail8xvl
3601
3602	mov		\$0x80,%eax		# size optimization
3603	vpxord		0x00($inp),$xa0,$xa0	# xor with input
3604	vpxor		0x20($inp),$xb0,$xb0
3605	vpxor		0x40($inp),$xc0,$xc0
3606	vpxor		0x60($inp),$xd0,$xd0
3607	lea		($inp,%rax),$inp	# size optimization
3608	vmovdqu32	$xa0,0x00($out)
3609	vmovdqu		$xb0,0x20($out)
3610	vmovdqu		$xc0,0x40($out)
3611	vmovdqu		$xd0,0x60($out)
3612	lea		($out,%rax),$out	# size optimization
3613
3614	vpxor		0x00($inp),$xa1,$xa1
3615	vpxor		0x20($inp),$xb1,$xb1
3616	vpxor		0x40($inp),$xc1,$xc1
3617	vpxor		0x60($inp),$xd1,$xd1
3618	lea		($inp,%rax),$inp	# size optimization
3619	vmovdqu		$xa1,0x00($out)
3620	vmovdqu		$xb1,0x20($out)
3621	vmovdqu		$xc1,0x40($out)
3622	vmovdqu		$xd1,0x60($out)
3623	lea		($out,%rax),$out	# size optimization
3624
3625	vpxord		0x00($inp),$xa2,$xa2
3626	vpxor		0x20($inp),$xb2,$xb2
3627	vpxor		0x40($inp),$xc2,$xc2
3628	vpxor		0x60($inp),$xd2,$xd2
3629	lea		($inp,%rax),$inp	# size optimization
3630	vmovdqu32	$xa2,0x00($out)
3631	vmovdqu		$xb2,0x20($out)
3632	vmovdqu		$xc2,0x40($out)
3633	vmovdqu		$xd2,0x60($out)
3634	lea		($out,%rax),$out	# size optimization
3635
3636	vpxor		0x00($inp),$xa3,$xa3
3637	vpxor		0x20($inp),$xb3,$xb3
3638	vpxor		0x40($inp),$xc3,$xc3
3639	vpxor		0x60($inp),$xd3,$xd3
3640	lea		($inp,%rax),$inp	# size optimization
3641	vmovdqu		$xa3,0x00($out)
3642	vmovdqu		$xb3,0x20($out)
3643	vmovdqu		$xc3,0x40($out)
3644	vmovdqu		$xd3,0x60($out)
3645	lea		($out,%rax),$out	# size optimization
3646
3647	vpbroadcastd	0(%r10),%ymm0		# reload key
3648	vpbroadcastd	4(%r10),%ymm1
3649
3650	sub		\$64*8,$len
3651	jnz		.Loop_outer8xvl
3652
3653	jmp		.Ldone8xvl
3654
3655.align	32
3656.Ltail8xvl:
3657	vmovdqa64	$xa0,%ymm8		# size optimization
3658___
3659$xa0 = "%ymm8";
3660$code.=<<___;
3661	xor		%r10,%r10
3662	sub		$inp,$out		# out -= inp, so out+inp addresses the output
3663	cmp		\$64*1,$len
3664	jb		.Less_than_64_8xvl
3665	vpxor		0x00($inp),$xa0,$xa0	# xor with input
3666	vpxor		0x20($inp),$xb0,$xb0
3667	vmovdqu		$xa0,0x00($out,$inp)
3668	vmovdqu		$xb0,0x20($out,$inp)
3669	je		.Ldone8xvl
3670	vmovdqa		$xc0,$xa0
3671	vmovdqa		$xd0,$xb0
3672	lea		64($inp),$inp
3673
3674	cmp		\$64*2,$len
3675	jb		.Less_than_64_8xvl
3676	vpxor		0x00($inp),$xc0,$xc0
3677	vpxor		0x20($inp),$xd0,$xd0
3678	vmovdqu		$xc0,0x00($out,$inp)
3679	vmovdqu		$xd0,0x20($out,$inp)
3680	je		.Ldone8xvl
3681	vmovdqa		$xa1,$xa0
3682	vmovdqa		$xb1,$xb0
3683	lea		64($inp),$inp
3684
3685	cmp		\$64*3,$len
3686	jb		.Less_than_64_8xvl
3687	vpxor		0x00($inp),$xa1,$xa1
3688	vpxor		0x20($inp),$xb1,$xb1
3689	vmovdqu		$xa1,0x00($out,$inp)
3690	vmovdqu		$xb1,0x20($out,$inp)
3691	je		.Ldone8xvl
3692	vmovdqa		$xc1,$xa0
3693	vmovdqa		$xd1,$xb0
3694	lea		64($inp),$inp
3695
3696	cmp		\$64*4,$len
3697	jb		.Less_than_64_8xvl
3698	vpxor		0x00($inp),$xc1,$xc1
3699	vpxor		0x20($inp),$xd1,$xd1
3700	vmovdqu		$xc1,0x00($out,$inp)
3701	vmovdqu		$xd1,0x20($out,$inp)
3702	je		.Ldone8xvl
3703	vmovdqa32	$xa2,$xa0
3704	vmovdqa		$xb2,$xb0
3705	lea		64($inp),$inp
3706
3707	cmp		\$64*5,$len
3708	jb		.Less_than_64_8xvl
3709	vpxord		0x00($inp),$xa2,$xa2
3710	vpxor		0x20($inp),$xb2,$xb2
3711	vmovdqu32	$xa2,0x00($out,$inp)
3712	vmovdqu		$xb2,0x20($out,$inp)
3713	je		.Ldone8xvl
3714	vmovdqa		$xc2,$xa0
3715	vmovdqa		$xd2,$xb0
3716	lea		64($inp),$inp
3717
3718	cmp		\$64*6,$len
3719	jb		.Less_than_64_8xvl
3720	vpxor		0x00($inp),$xc2,$xc2
3721	vpxor		0x20($inp),$xd2,$xd2
3722	vmovdqu		$xc2,0x00($out,$inp)
3723	vmovdqu		$xd2,0x20($out,$inp)
3724	je		.Ldone8xvl
3725	vmovdqa		$xa3,$xa0
3726	vmovdqa		$xb3,$xb0
3727	lea		64($inp),$inp
3728
3729	cmp		\$64*7,$len
3730	jb		.Less_than_64_8xvl
3731	vpxor		0x00($inp),$xa3,$xa3
3732	vpxor		0x20($inp),$xb3,$xb3
3733	vmovdqu		$xa3,0x00($out,$inp)
3734	vmovdqu		$xb3,0x20($out,$inp)
3735	je		.Ldone8xvl
3736	vmovdqa		$xc3,$xa0
3737	vmovdqa		$xd3,$xb0
3738	lea		64($inp),$inp
3739
3740.Less_than_64_8xvl:
3741	vmovdqa		$xa0,0x00(%rsp)
3742	vmovdqa		$xb0,0x20(%rsp)
3743	lea		($out,$inp),$out
3744	and		\$63,$len
3745
3746.Loop_tail8xvl:
3747	movzb		($inp,%r10),%eax
3748	movzb		(%rsp,%r10),%ecx
3749	lea		1(%r10),%r10
3750	xor		%ecx,%eax
3751	mov		%al,-1($out,%r10)
3752	dec		$len
3753	jnz		.Loop_tail8xvl
3754
3755	vpxor		$xa0,$xa0,$xa0		# wipe keystream copy off the stack
3756	vmovdqa		$xa0,0x00(%rsp)
3757	vmovdqa		$xa0,0x20(%rsp)
3758
3759.Ldone8xvl:
3760	vzeroall
3761___
3762$code.=<<___	if ($win64);
3763	movaps		-0xa8(%r9),%xmm6
3764	movaps		-0x98(%r9),%xmm7
3765	movaps		-0x88(%r9),%xmm8
3766	movaps		-0x78(%r9),%xmm9
3767	movaps		-0x68(%r9),%xmm10
3768	movaps		-0x58(%r9),%xmm11
3769	movaps		-0x48(%r9),%xmm12
3770	movaps		-0x38(%r9),%xmm13
3771	movaps		-0x28(%r9),%xmm14
3772	movaps		-0x18(%r9),%xmm15
3773___
3774$code.=<<___;
3775	lea		(%r9),%rsp
3776.cfi_def_cfa_register	%rsp
3777.L8xvl_epilogue:
3778	ret
3779.cfi_endproc
3780.size	ChaCha20_8xvl,.-ChaCha20_8xvl
3781___
3782}
3783
3784# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3785#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3786if ($win64) {
3787$rec="%rcx";
3788$frame="%rdx";
3789$context="%r8";
3790$disp="%r9";
3791
3792$code.=<<___;
3793.extern	__imp_RtlVirtualUnwind
3794.type	se_handler,\@abi-omnipotent
3795.align	16
3796se_handler:
3797	push	%rsi
3798	push	%rdi
3799	push	%rbx
3800	push	%rbp
3801	push	%r12
3802	push	%r13
3803	push	%r14
3804	push	%r15
3805	pushfq
3806	sub	\$64,%rsp
3807
3808	mov	120($context),%rax	# pull context->Rax
3809	mov	248($context),%rbx	# pull context->Rip
3810
3811	mov	8($disp),%rsi		# disp->ImageBase
3812	mov	56($disp),%r11		# disp->HandlerData
3813
3814	lea	.Lctr32_body(%rip),%r10
3815	cmp	%r10,%rbx		# context->Rip<.Lprologue
3816	jb	.Lcommon_seh_tail
3817
3818	mov	152($context),%rax	# pull context->Rsp
3819
3820	lea	.Lno_data(%rip),%r10	# epilogue label
3821	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
3822	jae	.Lcommon_seh_tail
3823
3824	lea	64+24+48(%rax),%rax
3825
3826	mov	-8(%rax),%rbx
3827	mov	-16(%rax),%rbp
3828	mov	-24(%rax),%r12
3829	mov	-32(%rax),%r13
3830	mov	-40(%rax),%r14
3831	mov	-48(%rax),%r15
3832	mov	%rbx,144($context)	# restore context->Rbx
3833	mov	%rbp,160($context)	# restore context->Rbp
3834	mov	%r12,216($context)	# restore context->R12
3835	mov	%r13,224($context)	# restore context->R13
3836	mov	%r14,232($context)	# restore context->R14
3837	mov	%r15,240($context)	# restore context->R15
3838
3839.Lcommon_seh_tail:
3840	mov	8(%rax),%rdi
3841	mov	16(%rax),%rsi
3842	mov	%rax,152($context)	# restore context->Rsp
3843	mov	%rsi,168($context)	# restore context->Rsi
3844	mov	%rdi,176($context)	# restore context->Rdi
3845
3846	mov	40($disp),%rdi		# disp->ContextRecord
3847	mov	$context,%rsi		# context
3848	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
3849	.long	0xa548f3fc		# cld; rep movsq
3850
3851	mov	$disp,%rsi
3852	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3853	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3854	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3855	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3856	mov	40(%rsi),%r10		# disp->ContextRecord
3857	lea	56(%rsi),%r11		# &disp->HandlerData
3858	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3859	mov	%r10,32(%rsp)		# arg5
3860	mov	%r11,40(%rsp)		# arg6
3861	mov	%r12,48(%rsp)		# arg7
3862	mov	%rcx,56(%rsp)		# arg8, (NULL)
3863	call	*__imp_RtlVirtualUnwind(%rip)
3864
3865	mov	\$1,%eax		# ExceptionContinueSearch
3866	add	\$64,%rsp
3867	popfq
3868	pop	%r15
3869	pop	%r14
3870	pop	%r13
3871	pop	%r12
3872	pop	%rbp
3873	pop	%rbx
3874	pop	%rdi
3875	pop	%rsi
3876	ret
3877.size	se_handler,.-se_handler
3878
3879.type	simd_handler,\@abi-omnipotent
3880.align	16
3881simd_handler:
3882	push	%rsi
3883	push	%rdi
3884	push	%rbx
3885	push	%rbp
3886	push	%r12
3887	push	%r13
3888	push	%r14
3889	push	%r15
3890	pushfq
3891	sub	\$64,%rsp
3892
3893	mov	120($context),%rax	# pull context->Rax
3894	mov	248($context),%rbx	# pull context->Rip
3895
3896	mov	8($disp),%rsi		# disp->ImageBase
3897	mov	56($disp),%r11		# disp->HandlerData
3898
3899	mov	0(%r11),%r10d		# HandlerData[0]
3900	lea	(%rsi,%r10),%r10	# prologue label
3901	cmp	%r10,%rbx		# context->Rip<prologue label
3902	jb	.Lcommon_seh_tail
3903
3904	mov	192($context),%rax	# pull context->R9
3905
3906	mov	4(%r11),%r10d		# HandlerData[1]
3907	mov	8(%r11),%ecx		# HandlerData[2]
3908	lea	(%rsi,%r10),%r10	# epilogue label
3909	cmp	%r10,%rbx		# context->Rip>=epilogue label
3910	jae	.Lcommon_seh_tail
3911
3912	neg	%rcx
3913	lea	-8(%rax,%rcx),%rsi
3914	lea	512($context),%rdi	# &context.Xmm6
3915	neg	%ecx
3916	shr	\$3,%ecx
3917	.long	0xa548f3fc		# cld; rep movsq
3918
3919	jmp	.Lcommon_seh_tail
3920.size	simd_handler,.-simd_handler
3921
3922.section	.pdata
3923.align	4
3924	.rva	.LSEH_begin_ChaCha20_ctr32
3925	.rva	.LSEH_end_ChaCha20_ctr32
3926	.rva	.LSEH_info_ChaCha20_ctr32
3927
3928	.rva	.LSEH_begin_ChaCha20_ssse3
3929	.rva	.LSEH_end_ChaCha20_ssse3
3930	.rva	.LSEH_info_ChaCha20_ssse3
3931
3932	.rva	.LSEH_begin_ChaCha20_128
3933	.rva	.LSEH_end_ChaCha20_128
3934	.rva	.LSEH_info_ChaCha20_128
3935
3936	.rva	.LSEH_begin_ChaCha20_4x
3937	.rva	.LSEH_end_ChaCha20_4x
3938	.rva	.LSEH_info_ChaCha20_4x
3939___
3940$code.=<<___ if ($avx);
3941	.rva	.LSEH_begin_ChaCha20_4xop
3942	.rva	.LSEH_end_ChaCha20_4xop
3943	.rva	.LSEH_info_ChaCha20_4xop
3944___
3945$code.=<<___ if ($avx>1);
3946	.rva	.LSEH_begin_ChaCha20_8x
3947	.rva	.LSEH_end_ChaCha20_8x
3948	.rva	.LSEH_info_ChaCha20_8x
3949___
3950$code.=<<___ if ($avx>2);
3951	.rva	.LSEH_begin_ChaCha20_avx512
3952	.rva	.LSEH_end_ChaCha20_avx512
3953	.rva	.LSEH_info_ChaCha20_avx512
3954
3955	.rva	.LSEH_begin_ChaCha20_avx512vl
3956	.rva	.LSEH_end_ChaCha20_avx512vl
3957	.rva	.LSEH_info_ChaCha20_avx512vl
3958
3959	.rva	.LSEH_begin_ChaCha20_16x
3960	.rva	.LSEH_end_ChaCha20_16x
3961	.rva	.LSEH_info_ChaCha20_16x
3962
3963	.rva	.LSEH_begin_ChaCha20_8xvl
3964	.rva	.LSEH_end_ChaCha20_8xvl
3965	.rva	.LSEH_info_ChaCha20_8xvl
3966___
3967$code.=<<___;
3968.section	.xdata
3969.align	8
3970.LSEH_info_ChaCha20_ctr32:
3971	.byte	9,0,0,0
3972	.rva	se_handler
3973
3974.LSEH_info_ChaCha20_ssse3:
3975	.byte	9,0,0,0
3976	.rva	simd_handler
3977	.rva	.Lssse3_body,.Lssse3_epilogue
3978	.long	0x20,0
3979
3980.LSEH_info_ChaCha20_128:
3981	.byte	9,0,0,0
3982	.rva	simd_handler
3983	.rva	.L128_body,.L128_epilogue
3984	.long	0x60,0
3985
3986.LSEH_info_ChaCha20_4x:
3987	.byte	9,0,0,0
3988	.rva	simd_handler
3989	.rva	.L4x_body,.L4x_epilogue
3990	.long	0xa0,0
3991___
3992$code.=<<___ if ($avx);
3993.LSEH_info_ChaCha20_4xop:
3994	.byte	9,0,0,0
3995	.rva	simd_handler
3996	.rva	.L4xop_body,.L4xop_epilogue		# HandlerData[]
3997	.long	0xa0,0
3998___
3999$code.=<<___ if ($avx>1);
4000.LSEH_info_ChaCha20_8x:
4001	.byte	9,0,0,0
4002	.rva	simd_handler
4003	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
4004	.long	0xa0,0
4005___
4006$code.=<<___ if ($avx>2);
4007.LSEH_info_ChaCha20_avx512:
4008	.byte	9,0,0,0
4009	.rva	simd_handler
4010	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
4011	.long	0x20,0
4012
4013.LSEH_info_ChaCha20_avx512vl:
4014	.byte	9,0,0,0
4015	.rva	simd_handler
4016	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]
4017	.long	0x20,0
4018
4019.LSEH_info_ChaCha20_16x:
4020	.byte	9,0,0,0
4021	.rva	simd_handler
4022	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
4023	.long	0xa0,0
4024
4025.LSEH_info_ChaCha20_8xvl:
4026	.byte	9,0,0,0
4027	.rva	simd_handler
4028	.rva	.L8xvl_body,.L8xvl_epilogue		# HandlerData[]
4029	.long	0xa0,0
4030___
4031}
4032
4033foreach (split("\n",$code)) {
4034	s/\`([^\`]*)\`/eval $1/ge;
4035
4036	s/%x#%[yz]/%x/g;	# "down-shift"
4037
4038	print $_,"\n";
4039}
4040
4041close STDOUT or die "error closing STDOUT: $!";
4042