#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#		1xIALU/gcc	4xSSSE3
# Pentium	17.5/+80%
# PIII		14.2/+60%
# P4		18.6/+84%
# Core2		9.56/+89%	4.83
# Westmere	9.50/+45%	3.35
# Sandy Bridge	10.5/+47%	3.20
# Haswell	8.15/+50%	2.83
# Skylake	7.53/+22%	2.75
# Silvermont	17.4/+36%	8.35
# Goldmont	13.4/+40%	4.36
# Sledgehammer	10.2/+54%
# Bulldozer	13.4/+50%	4.38(*)
#
# (*)	Bulldozer actually executes 4xXOP code path that delivers 3.55;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$output = pop and open STDOUT,">$output";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

$xmm=$ymm=0;
for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }

$ymm=1 if ($xmm &&
		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
		($gasver=$1)>=2.19);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
		$1>=2.03);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
		`ml 2>&1` =~ /Version ([0-9]+)\./ &&
		$1>=10);	# first version supporting AVX

$ymm=1 if ($xmm && !$ymm &&
		`$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ &&
		$2>=3.0);	# first version supporting AVX

$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

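# The QUARTERROUND subroutines below (scalar, SSSE3 and XOP) all implement
# the standard ChaCha quarter round; for reference (RFC 7539/8439):
#
#	a += b; d ^= a; d <<<= 16;
#	c += d; b ^= c; b <<<= 12;
#	a += b; d ^= a; d <<<=  8;
#	c += d; b ^= c; b <<<=  7;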
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
	&rol	($d,16);
	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
	&add	($c,$d);
	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
	&xor	($b,$c);
	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
	&rol	($b,12);
	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
	&mov	($d_,$d)			if ($di==$dn);
	&xor	($b,$c);
	 &add	($a,$b_)			if ($i<7);	# elsewhere
	&rol	($b,7);

	($b,$b_)=($b_,$b);
	($c,$c_)=($c_,$c);
	($d,$d_)=($d_,$d);
}
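# Scheduling note: the 16-word state lives on the stack and only four words
# are held in general-purpose registers at a time.  The conditional loads
# and stores above (and the lines marked "elsewhere") overlap one quarter
# round's arithmetic with spilling results of the previous one and fetching
# operands for the next one; the trailing swaps rotate the register
# assignment seen by the caller.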

&static_label("ssse3_shortcut");
&static_label("xop_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

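# The five stack arguments picked up via &wparam() below are expected to
# match the C prototype of the generic implementation in
# crypto/chacha/chacha_enc.c:
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#	                    size_t len, const unsigned int key[8],
#	                    const unsigned int counter[4]);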
&function_begin("ChaCha20_ctr32");
	&xor	("eax","eax");
	&cmp	("eax",&wparam(2));		# len==0?
	&je	(&label("no_data"));
if ($xmm) {
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
	&jz	(&label("x86"));
	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
	&jz	(&label("x86"));
	&jmp	(&label("ssse3_shortcut"));
&set_label("x86");
}
	&mov	("esi",&wparam(3));		# key
	&mov	("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov	("eax",&DWP(4*0,"esi"));	# copy key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&mov	(&DWP(64+4*4,"esp"),"eax");
	&mov	(&DWP(64+4*5,"esp"),"ebx");
	&mov	(&DWP(64+4*6,"esp"),"ecx");
	&mov	(&DWP(64+4*7,"esp"),"edx");
	&mov	("eax",&DWP(4*4,"esi"));
	&mov	("ebx",&DWP(4*5,"esi"));
	&mov	("ecx",&DWP(4*6,"esi"));
	&mov	("edx",&DWP(4*7,"esi"));
	&mov	(&DWP(64+4*8,"esp"),"eax");
	&mov	(&DWP(64+4*9,"esp"),"ebx");
	&mov	(&DWP(64+4*10,"esp"),"ecx");
	&mov	(&DWP(64+4*11,"esp"),"edx");
	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("edx",&DWP(4*3,"edi"));
	&sub	("eax",1);
	&mov	(&DWP(64+4*12,"esp"),"eax");
	&mov	(&DWP(64+4*13,"esp"),"ebx");
	&mov	(&DWP(64+4*14,"esp"),"ecx");
	&mov	(&DWP(64+4*15,"esp"),"edx");
	&jmp	(&label("entry"));

&set_label("outer_loop",16);
	&mov	(&wparam(1),$b);		# save input
	&mov	(&wparam(0),$a);		# save output
	&mov	(&wparam(2),$c);		# save len
&set_label("entry");
	&mov	($a,0x61707865);
	&mov	(&DWP(4*1,"esp"),0x3320646e);
	&mov	(&DWP(4*2,"esp"),0x79622d32);
	&mov	(&DWP(4*3,"esp"),0x6b206574);

	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
	&mov	($b_,&DWP(64+4*6,"esp"));
	&mov	($c, &DWP(64+4*10,"esp"));
	&mov	($c_,&DWP(64+4*11,"esp"));
	&mov	($d, &DWP(64+4*13,"esp"));
	&mov	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*5,"esp"),$b);
	&mov	(&DWP(4*6,"esp"),$b_);
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	(&DWP(4*11,"esp"),$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b, &DWP(64+4*7,"esp"));
	&mov	($d_,&DWP(64+4*15,"esp"));
	&mov	($d, &DWP(64+4*12,"esp"));
	&mov	($b_,&DWP(64+4*4,"esp"));
	&mov	($c, &DWP(64+4*8,"esp"));
	&mov	($c_,&DWP(64+4*9,"esp"));
	&add	($d,1);				# counter value
	&mov	(&DWP(4*7,"esp"),$b);
	&mov	(&DWP(4*15,"esp"),$d_);
	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value

	&mov	($b,10);			# loop counter
	&jmp	(&label("loop"));

&set_label("loop",16);
	&add	($a,$b_);			# elsewhere
	&mov	(&DWP(128,"esp"),$b);		# save loop counter
	&mov	($b,$b_);
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec	($b);
	&jnz	(&label("loop"));
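# Ten iterations of the eight quarter rounds above amount to the 20 ChaCha
# rounds (alternating column and diagonal rounds); the code below re-adds
# the initial state ("key material") and XORs the keystream with the input.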

	&mov	($b,&wparam(2));		# load len

	&add	($a,0x61707865);		# accumulate key material
	&add	($b_,&DWP(64+4*4,"esp"));
	&add	($c, &DWP(64+4*8,"esp"));
	&add	($c_,&DWP(64+4*9,"esp"));

	&cmp	($b,64);
	&jb	(&label("tail"));

	&mov	($b,&wparam(1));		# load input pointer
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));

	&xor	($a, &DWP(4*0,$b));		# xor with input
	&xor	($b_,&DWP(4*4,$b));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	($a,&wparam(0));		# load output pointer
	&xor	($c, &DWP(4*8,$b));
	&xor	($c_,&DWP(4*9,$b));
	&xor	($d, &DWP(4*12,$b));
	&xor	($d_,&DWP(4*14,$b));
	&mov	(&DWP(4*4,$a),$b_);		# write output
	&mov	(&DWP(4*8,$a),$c);
	&mov	(&DWP(4*9,$a),$c_);
	&mov	(&DWP(4*12,$a),$d);
	&mov	(&DWP(4*14,$a),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&xor	($b_,&DWP(4*1,$b));
	&xor	($c, &DWP(4*2,$b));
	&xor	($c_,&DWP(4*3,$b));
	&xor	($d, &DWP(4*5,$b));
	&xor	($d_,&DWP(4*6,$b));
	&mov	(&DWP(4*1,$a),$b_);
	&mov	(&DWP(4*2,$a),$c);
	&mov	(&DWP(4*3,$a),$c_);
	&mov	(&DWP(4*5,$a),$d);
	&mov	(&DWP(4*6,$a),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&xor	($b_,&DWP(4*7,$b));
	&xor	($c, &DWP(4*10,$b));
	&xor	($c_,&DWP(4*11,$b));
	&xor	($d, &DWP(4*13,$b));
	&xor	($d_,&DWP(4*15,$b));
	&lea	($b,&DWP(4*16,$b));
	&mov	(&DWP(4*7,$a),$b_);
	&mov	($b_,&DWP(4*0,"esp"));
	&mov	(&DWP(4*10,$a),$c);
	&mov	($c,&wparam(2));		# len
	&mov	(&DWP(4*11,$a),$c_);
	&mov	(&DWP(4*13,$a),$d);
	&mov	(&DWP(4*15,$a),$d_);
	&mov	(&DWP(4*0,$a),$b_);
	&lea	($a,&DWP(4*16,$a));
	&sub	($c,64);
	&jnz	(&label("outer_loop"));

	&jmp	(&label("done"));

&set_label("tail");
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	(&DWP(4*4,"esp"),$b_);
	&mov	(&DWP(4*8,"esp"),$c);
	&mov	(&DWP(4*9,"esp"),$c_);
	&mov	(&DWP(4*12,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&mov	(&DWP(4*1,"esp"),$b_);
	&mov	(&DWP(4*2,"esp"),$c);
	&mov	(&DWP(4*3,"esp"),$c_);
	&mov	(&DWP(4*5,"esp"),$d);
	&mov	(&DWP(4*6,"esp"),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&mov	(&DWP(4*7,"esp"),$b_);
	&mov	($b_,&wparam(1));		# load input
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	($c,&wparam(0));		# load output
	&mov	(&DWP(4*11,"esp"),$c_);
	&xor	($c_,$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*15,"esp"),$d_);

	&xor	("eax","eax");
	&xor	("edx","edx");
&set_label("tail_loop");
	&movb	("al",&BP(0,$c_,$b_));
	&movb	("dl",&BP(0,"esp",$c_));
	&lea	($c_,&DWP(1,$c_));
	&xor	("al","dl");
	&mov	(&BP(-1,$c,$c_),"al");
	&dec	($b);
	&jnz	(&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

if ($xmm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&paddd	($xa,$xb);			# see elsewhere
	#&pxor	($xd,$xa);			# see elsewhere
	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&pshufb	($xd,&QWP(0,"eax"));		# rot16
	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&paddd	($xc,$xd);
	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&pxor	($xb,$xc);
	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&movdqa	($xa_,$xb);			# borrow as temporary
	&pslld	($xb,12);
	&psrld	($xa_,20);
	&por	($xb,$xa_);
	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
	&paddd	($xa,$xb);
	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&pxor	($xd,$xa);
	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb	($xd,&QWP(16,"eax"));		# rot8
	&paddd	($xc,$xd);
	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&movdqa	($xd_,$xd)			if ($di==$dn);
	&pxor	($xb,$xc);
	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
	&movdqa	($xa,$xb);			# borrow as temporary
	&pslld	($xb,7);
	&psrld	($xa,25);
	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
	&por	($xb,$xa);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}
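# Layout of the 4x SSSE3 code path below: the state is transposed so that
# each 128-bit vector (or its spill slot addressed through "ebx") holds one
# of the sixteen state words for four consecutive blocks, lane i belonging
# to block i.  The per-block initial values are kept in a second copy
# addressed through "ebp", which is why the epilogue has to "de-interlace"
# the data before XORing it with the input.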

&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
if ($ymm) {
	&test		(&DWP(4,"ebp"),1<<11);		# test XOP bit
	&jnz		(&label("xop_shortcut"));
}

	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce

	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	&lea		("eax",&DWP(&label("ssse3_data")."-".
				    &label("pic_point"),"eax"));
	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce

if (defined($gasver) && $gasver>=2.17) {		# even though we encode
							# pshufb manually, we
							# handle only register
							# operands, while this
							# segment uses memory
							# operand...
	&cmp		($len,64*4);
	&jb		(&label("1x"));

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	&movdqu		("xmm7",&QWP(0,"edx"));		# key
	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	 &psubd		("xmm0",&QWP(16*4,"eax"));
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea		("ebx",&DWP(128,"esp"));	# size optimization

	&pshufd		("xmm0","xmm3",0x00);
	&pshufd		("xmm1","xmm3",0x55);
	&pshufd		("xmm2","xmm3",0xaa);
	&pshufd		("xmm3","xmm3",0xff);
	&pshufd		("xmm4","xmm7",0x00);
	&pshufd		("xmm5","xmm7",0x55);
	&pshufd		("xmm6","xmm7",0xaa);
	&pshufd		("xmm7","xmm7",0xff);
	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");

	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

&set_label("outer_loop",16);
	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&movdqa		($xa, &QWP(16*0-128,"ebp"));
	&movdqa		($xd, "xmm4");
	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
	&movdqa		($xc, &QWP(16*8-128,"ebp"));
	&movdqa		($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

&set_label("loop",16);
	&paddd		($xa,$xb_);			# elsewhere
	&movdqa		($xb,$xb_);
	&pxor		($xd,$xa);			# elsewhere
	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
	&movdqa		($xa3,&QWP(16*3-128,"ebx"));

    for($i=0;$i<256;$i+=64) {
	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));

	&movdqa		($xt2,$xa0);		# "de-interlace" data
	&punpckldq	($xa0,$xa1);
	&movdqa		($xt3,$xa2);
	&punpckldq	($xa2,$xa3);
	&punpckhdq	($xt2,$xa1);
	&punpckhdq	($xt3,$xa3);
	&movdqa		($xa1,$xa0);
	&punpcklqdq	($xa0,$xa2);		# "a0"
	&movdqa		($xa3,$xt2);
	&punpcklqdq	($xt2,$xt3);		# "a2"
	&punpckhqdq	($xa1,$xa2);		# "a1"
	&punpckhqdq	($xa3,$xt3);		# "a3"

	#($xa2,$xt2)=($xt2,$xa2);

	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
	&movdqu		($xt1,&QWP(64*1-128,$inp));
	&movdqu		($xa2,&QWP(64*2-128,$inp));
	&movdqu		($xt3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&pxor		($xt0,$xa0);
	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&pxor		($xt1,$xa1);
	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&pxor		($xt2,$xa2);
	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&pxor		($xt3,$xa3);
	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
	&movdqu		(&QWP(64*1-128,$out),$xt1);
	&movdqu		(&QWP(64*2-128,$out),$xt2);
	&movdqu		(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);
	&jz		(&label("done"));

	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&movdqu		("xmm3",&QWP(0,"ebx"));
	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
	&pand		("xmm3",&QWP(16*7,"eax"));
	&por		("xmm3","xmm2");		# counter value
}
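# Fewer than 256 bytes remain at this point: the counter for the next block
# is rebuilt in xmm3 (saved counter + 4 merged with the re-loaded, masked
# nonce) and execution falls through into the 1x code path below.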
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot16);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,20);
	&pslld		($t,12);
	&por		($b,$t);

	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot24);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,25);
	&pslld		($t,7);
	&por		($b,$t);
}
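# SSSE3ROUND works on a single block held entirely in registers, $a..$d
# being the four rows of the state; the pshufd shuffles issued between the
# two calls per loop iteration rotate rows 1-3 so that the same code serves
# both the column round and the diagonal round.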

&set_label("1x");
	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
	&movdqu		($b,&QWP(0,"edx"));
	&movdqu		($c,&QWP(16,"edx"));
	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
	&movdqa		($rot16,&QWP(0,"eax"));
	&movdqa		($rot24,&QWP(16,"eax"));
	&mov		(&DWP(16*3,"esp"),"ebp");

	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

&set_label("outer1x",16);
	&movdqa		($d,&QWP(16*5,"eax"));		# one
	&movdqa		($a,&QWP(16*0,"esp"));
	&movdqa		($b,&QWP(16*1,"esp"));
	&movdqa		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&movdqa		(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

&set_label("loop1x",16);
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	&paddd		($a,&QWP(16*0,"esp"));
	&paddd		($b,&QWP(16*1,"esp"));
	&paddd		($c,&QWP(16*2,"esp"));
	&paddd		($d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));

	&movdqu		($t,&QWP(16*0,$inp));
	&movdqu		($t1,&QWP(16*1,$inp));
	&pxor		($a,$t);		# xor with input
	&movdqu		($t,&QWP(16*2,$inp));
	&pxor		($b,$t1);
	&movdqu		($t1,&QWP(16*3,$inp));
	&pxor		($c,$t);
	&pxor		($d,$t1);
	&lea		($inp,&DWP(16*4,$inp));	# inp+=64

	&movdqu		(&QWP(16*0,$out),$a);	# write output
	&movdqu		(&QWP(16*1,$out),$b);
	&movdqu		(&QWP(16*2,$out),$c);
	&movdqu		(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));	# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

&set_label("tail");
	&movdqa		(&QWP(16*0,"esp"),$a);
	&movdqa		(&QWP(16*1,"esp"),$b);
	&movdqa		(&QWP(16*2,"esp"),$c);
	&movdqa		(&QWP(16*3,"esp"),$d);

	&xor		("eax","eax");
	&xor		("edx","edx");
	&xor		("ebp","ebp");

&set_label("tail_loop");
	&movb		("al",&BP(0,"esp","ebp"));
	&movb		("dl",&BP(0,$inp,"ebp"));
	&lea		("ebp",&DWP(1,"ebp"));
	&xor		("al","dl");
	&movb		(&BP(-1,$out,"ebp"),"al");
	&dec		($len);
	&jnz		(&label("tail_loop"));
}
&set_label("done");
	&mov		("esp",&DWP(512,"esp"));
&function_end("ChaCha20_ssse3");

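# The constants at "ssse3_data" below are, in order: pshufb masks rotating
# each dword left by 16 and by 8 bits, the "expand 32-byte k" sigma words,
# the 0..3 lane offsets used to fix up per-block counters, the {4,4,4,4}
# block-counter increment, the {1,0,0,0} single-block increment, the
# {4,0,0,0} scalar counter bump, and the {0,-1,-1,-1} mask that keeps the
# nonce while the counter lane is replaced.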
&align	(64);
&set_label("ssse3_data");
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align	(64);
}
&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

if ($ymm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

sub QUARTERROUND_XOP {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&vpaddd	($xa,$xa,$xb);			# see elsewhere
	#&vpxor		($xd,$xd,$xa);			# see elsewhere
	 &vmovdqa	(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&vprotd		($xd,$xd,16);
	 &vmovdqa	(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&vpaddd		($xc,$xc,$xd);
	 &vmovdqa	($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&vpxor		($xb,$i!=0?$xb:$xb_,$xc);
	 &vmovdqa	($xa_,&QWP(16*$an-128,"ebx"));
	&vprotd		($xb,$xb,12);
	 &vmovdqa	($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&vpaddd		($xa,$xa,$xb);
	 &vmovdqa	($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&vpxor		($xd,$xd,$xa);
	 &vpaddd	($xa_,$xa_,$xb_)		if ($i<7);	# elsewhere
	&vprotd		($xd,$xd,8);
	&vmovdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&vpaddd		($xc,$xc,$xd);
	&vmovdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	&vpxor		($xb,$xb,$xc);
	 &vpxor		($xd_,$di==$dn?$xd:$xd_,$xa_)	if ($i<7);	# elsewhere
	&vprotd		($xb,$xb,7);

	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}
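# The XOP variant is structurally the same as QUARTERROUND_SSSE3, except
# that vprotd performs each rotation in a single instruction, eliminating
# the shift/shift/or (or pshufb) sequences and the temporary register they
# required.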

&function_begin("ChaCha20_xop");
&set_label("xop_shortcut");
	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce
	&vzeroupper	();

	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	&lea		("eax",&DWP(&label("ssse3_data")."-".
				    &label("pic_point"),"eax"));
	&vmovdqu	("xmm3",&QWP(0,"ebx"));		# counter and nonce

	&cmp		($len,64*4);
	&jb		(&label("1x"));

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	&vmovdqu	("xmm7",&QWP(0,"edx"));		# key
	&vpshufd	("xmm0","xmm3",0x00);
	&vpshufd	("xmm1","xmm3",0x55);
	&vpshufd	("xmm2","xmm3",0xaa);
	&vpshufd	("xmm3","xmm3",0xff);
	 &vpaddd	("xmm0","xmm0",&QWP(16*3,"eax"));	# fix counters
	&vpshufd	("xmm4","xmm7",0x00);
	&vpshufd	("xmm5","xmm7",0x55);
	 &vpsubd	("xmm0","xmm0",&QWP(16*4,"eax"));
	&vpshufd	("xmm6","xmm7",0xaa);
	&vpshufd	("xmm7","xmm7",0xff);
	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm0");
	&vmovdqa	(&QWP(16*13-128,"ebp"),"xmm1");
	&vmovdqa	(&QWP(16*14-128,"ebp"),"xmm2");
	&vmovdqa	(&QWP(16*15-128,"ebp"),"xmm3");
	 &vmovdqu	("xmm3",&QWP(16,"edx"));	# key
	&vmovdqa	(&QWP(16*4-128,"ebp"),"xmm4");
	&vmovdqa	(&QWP(16*5-128,"ebp"),"xmm5");
	&vmovdqa	(&QWP(16*6-128,"ebp"),"xmm6");
	&vmovdqa	(&QWP(16*7-128,"ebp"),"xmm7");
	 &vmovdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea		("ebx",&DWP(128,"esp"));	# size optimization

	&vpshufd	("xmm0","xmm3",0x00);
	&vpshufd	("xmm1","xmm3",0x55);
	&vpshufd	("xmm2","xmm3",0xaa);
	&vpshufd	("xmm3","xmm3",0xff);
	&vpshufd	("xmm4","xmm7",0x00);
	&vpshufd	("xmm5","xmm7",0x55);
	&vpshufd	("xmm6","xmm7",0xaa);
	&vpshufd	("xmm7","xmm7",0xff);
	&vmovdqa	(&QWP(16*8-128,"ebp"),"xmm0");
	&vmovdqa	(&QWP(16*9-128,"ebp"),"xmm1");
	&vmovdqa	(&QWP(16*10-128,"ebp"),"xmm2");
	&vmovdqa	(&QWP(16*11-128,"ebp"),"xmm3");
	&vmovdqa	(&QWP(16*0-128,"ebp"),"xmm4");
	&vmovdqa	(&QWP(16*1-128,"ebp"),"xmm5");
	&vmovdqa	(&QWP(16*2-128,"ebp"),"xmm6");
	&vmovdqa	(&QWP(16*3-128,"ebp"),"xmm7");

	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

&set_label("outer_loop",32);
	#&vmovdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&vmovdqa	("xmm1",&QWP(16*1-128,"ebp"));
	&vmovdqa	("xmm2",&QWP(16*2-128,"ebp"));
	&vmovdqa	("xmm3",&QWP(16*3-128,"ebp"));
	#&vmovdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&vmovdqa	("xmm5",&QWP(16*5-128,"ebp"));
	&vmovdqa	("xmm6",&QWP(16*6-128,"ebp"));
	&vmovdqa	("xmm7",&QWP(16*7-128,"ebp"));
	#&vmovdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&vmovdqa	(&QWP(16*1-128,"ebx"),"xmm1");
	&vmovdqa	(&QWP(16*2-128,"ebx"),"xmm2");
	&vmovdqa	(&QWP(16*3-128,"ebx"),"xmm3");
	#&vmovdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&vmovdqa	(&QWP(16*5-128,"ebx"),"xmm5");
	&vmovdqa	(&QWP(16*6-128,"ebx"),"xmm6");
	&vmovdqa	(&QWP(16*7-128,"ebx"),"xmm7");
	#&vmovdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&vmovdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&vmovdqa	("xmm2",&QWP(16*10-128,"ebp"));
	&vmovdqa	("xmm3",&QWP(16*11-128,"ebp"));
	&vmovdqa	("xmm4",&QWP(16*12-128,"ebp"));
	&vmovdqa	("xmm5",&QWP(16*13-128,"ebp"));
	&vmovdqa	("xmm6",&QWP(16*14-128,"ebp"));
	&vmovdqa	("xmm7",&QWP(16*15-128,"ebp"));
	&vpaddd		("xmm4","xmm4",&QWP(16*4,"eax"));	# counter value
	#&vmovdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&vmovdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&vmovdqa	(&QWP(16*10-128,"ebx"),"xmm2");
	&vmovdqa	(&QWP(16*11-128,"ebx"),"xmm3");
	&vmovdqa	(&QWP(16*12-128,"ebx"),"xmm4");
	&vmovdqa	(&QWP(16*13-128,"ebx"),"xmm5");
	&vmovdqa	(&QWP(16*14-128,"ebx"),"xmm6");
	&vmovdqa	(&QWP(16*15-128,"ebx"),"xmm7");
	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&vmovdqa	($xa, &QWP(16*0-128,"ebp"));
	&vmovdqa	($xd, "xmm4");
	&vmovdqa	($xb_,&QWP(16*4-128,"ebp"));
	&vmovdqa	($xc, &QWP(16*8-128,"ebp"));
	&vmovdqa	($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

&set_label("loop",32);
	&vpaddd		($xa,$xa,$xb_);			# elsewhere
	&vpxor		($xd,$xd,$xa);			# elsewhere
	&QUARTERROUND_XOP(0, 4, 8, 12, 0);
	&QUARTERROUND_XOP(1, 5, 9, 13, 1);
	&QUARTERROUND_XOP(2, 6,10, 14, 2);
	&QUARTERROUND_XOP(3, 7,11, 15, 3);
	&QUARTERROUND_XOP(0, 5,10, 15, 4);
	&QUARTERROUND_XOP(1, 6,11, 12, 5);
	&QUARTERROUND_XOP(2, 7, 8, 13, 6);
	&QUARTERROUND_XOP(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	&vmovdqa	(&QWP(16*4-128,"ebx"),$xb_);
	&vmovdqa	(&QWP(16*8-128,"ebx"),$xc);
	&vmovdqa	(&QWP(16*9-128,"ebx"),$xc_);
	&vmovdqa	(&QWP(16*12-128,"ebx"),$xd);
	&vmovdqa	(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&vmovdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&vmovdqa	($xa1,&QWP(16*1-128,"ebx"));
	&vmovdqa	($xa2,&QWP(16*2-128,"ebx"));
	&vmovdqa	($xa3,&QWP(16*3-128,"ebx"));

    for($i=0;$i<256;$i+=64) {
	&vpaddd		($xa0,$xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&vpaddd		($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
	&vpaddd		($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
	&vpaddd		($xa3,$xa3,&QWP($i+16*3-128,"ebp"));

	&vpunpckldq	($xt2,$xa0,$xa1);	# "de-interlace" data
	&vpunpckldq	($xt3,$xa2,$xa3);
	&vpunpckhdq	($xa0,$xa0,$xa1);
	&vpunpckhdq	($xa2,$xa2,$xa3);
	&vpunpcklqdq	($xa1,$xt2,$xt3);	# "a0"
	&vpunpckhqdq	($xt2,$xt2,$xt3);	# "a1"
	&vpunpcklqdq	($xt3,$xa0,$xa2);	# "a2"
	&vpunpckhqdq	($xa3,$xa0,$xa2);	# "a3"

	&vpxor		($xt0,$xa1,&QWP(64*0-128,$inp));
	&vpxor		($xt1,$xt2,&QWP(64*1-128,$inp));
	&vpxor		($xt2,$xt3,&QWP(64*2-128,$inp));
	&vpxor		($xt3,$xa3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&vmovdqa	($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&vmovdqa	($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&vmovdqa	($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&vmovdqa	($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&vmovdqu	(&QWP(64*0-128,$out),$xt0);	# store output
	&vmovdqu	(&QWP(64*1-128,$out),$xt1);
	&vmovdqu	(&QWP(64*2-128,$out),$xt2);
	&vmovdqu	(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);
	&jz		(&label("done"));

	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	&vmovd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&vmovdqu	("xmm3",&QWP(0,"ebx"));
	&vpaddd		("xmm2","xmm2",&QWP(16*6,"eax"));# +four
	&vpand		("xmm3","xmm3",&QWP(16*7,"eax"));
	&vpor		("xmm3","xmm3","xmm2");		# counter value
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

sub XOPROUND {
	&vpaddd		($a,$a,$b);
	&vpxor		($d,$d,$a);
	&vprotd		($d,$d,16);

	&vpaddd		($c,$c,$d);
	&vpxor		($b,$b,$c);
	&vprotd		($b,$b,12);

	&vpaddd		($a,$a,$b);
	&vpxor		($d,$d,$a);
	&vprotd		($d,$d,8);

	&vpaddd		($c,$c,$d);
	&vpxor		($b,$b,$c);
	&vprotd		($b,$b,7);
}
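# XOPROUND is the single-block counterpart of SSSE3ROUND with vprotd doing
# the rotations; the 1x/outer1x/tail code below mirrors the SSSE3 version.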

&set_label("1x");
	&vmovdqa	($a,&QWP(16*2,"eax"));		# sigma
	&vmovdqu	($b,&QWP(0,"edx"));
	&vmovdqu	($c,&QWP(16,"edx"));
	#&vmovdqu	($d,&QWP(0,"ebx"));		# already loaded
	&vmovdqa	($rot16,&QWP(0,"eax"));
	&vmovdqa	($rot24,&QWP(16,"eax"));
	&mov		(&DWP(16*3,"esp"),"ebp");

	&vmovdqa	(&QWP(16*0,"esp"),$a);
	&vmovdqa	(&QWP(16*1,"esp"),$b);
	&vmovdqa	(&QWP(16*2,"esp"),$c);
	&vmovdqa	(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

&set_label("outer1x",16);
	&vmovdqa	($d,&QWP(16*5,"eax"));		# one
	&vmovdqa	($a,&QWP(16*0,"esp"));
	&vmovdqa	($b,&QWP(16*1,"esp"));
	&vmovdqa	($c,&QWP(16*2,"esp"));
	&vpaddd		($d,$d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&vmovdqa	(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

&set_label("loop1x",16);
	&XOPROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&XOPROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	&vpaddd		($a,$a,&QWP(16*0,"esp"));
	&vpaddd		($b,$b,&QWP(16*1,"esp"));
	&vpaddd		($c,$c,&QWP(16*2,"esp"));
	&vpaddd		($d,$d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));

	&vpxor		($a,$a,&QWP(16*0,$inp));	# xor with input
	&vpxor		($b,$b,&QWP(16*1,$inp));
	&vpxor		($c,$c,&QWP(16*2,$inp));
	&vpxor		($d,$d,&QWP(16*3,$inp));
	&lea		($inp,&DWP(16*4,$inp));		# inp+=64

	&vmovdqu	(&QWP(16*0,$out),$a);		# write output
	&vmovdqu	(&QWP(16*1,$out),$b);
	&vmovdqu	(&QWP(16*2,$out),$c);
	&vmovdqu	(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));	# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

&set_label("tail");
	&vmovdqa	(&QWP(16*0,"esp"),$a);
	&vmovdqa	(&QWP(16*1,"esp"),$b);
	&vmovdqa	(&QWP(16*2,"esp"),$c);
	&vmovdqa	(&QWP(16*3,"esp"),$d);

	&xor		("eax","eax");
	&xor		("edx","edx");
	&xor		("ebp","ebp");

&set_label("tail_loop");
	&movb		("al",&BP(0,"esp","ebp"));
	&movb		("dl",&BP(0,$inp,"ebp"));
	&lea		("ebp",&DWP(1,"ebp"));
	&xor		("al","dl");
	&movb		(&BP(-1,$out,"ebp"),"al");
	&dec		($len);
	&jnz		(&label("tail_loop"));
}
&set_label("done");
	&vzeroupper	();
	&mov		("esp",&DWP(512,"esp"));
&function_end("ChaCha20_xop");
}

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";