#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86.
#
# April 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-3.4(*)	SSE2(**)	AVX2
# Pentium	15.7/+80%	-
# PIII		6.21/+90%	-
# P4		19.8/+40%	3.24
# Core 2	4.85/+90%	1.80
# Westmere	4.58/+100%	1.43
# Sandy Bridge	3.90/+100%	1.36
# Haswell	3.88/+70%	1.18		0.72
# Skylake	3.10/+60%	1.14		0.62
# Silvermont	11.0/+40%	4.80
# Goldmont	4.10/+200%	2.10
# VIA Nano	6.71/+90%	2.47
# Sledgehammer	3.51/+180%	4.27
# Bulldozer	4.53/+140%	1.31
#
# (*)	gcc 4.8 for some reason generated worse code;
# (**)	besides SSE2 there are floating-point and AVX options; FP
#	is deemed unnecessary, because pre-SSE2 processors are too
#	old to care about, while it's not the fastest option on
#	SSE2-capable ones either; AVX is omitted, because it doesn't
#	give much improvement, 5-10% depending on processor;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

$output=pop and open STDOUT,">$output";

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

$sse2=$avx=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

if ($sse2) {
	&static_label("const_sse2");
	&static_label("enter_blocks");
	&static_label("enter_emit");
	&external_label("OPENSSL_ia32cap_P");

	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
			=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22);	# 1 for AVX-, 2 for AVX2-capable
	}

	if (!$avx && $ARGV[0] eq "win32n" &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.09) + ($1>=2.10);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
}

########################################################################
# Layout of the opaque area is as follows.
#
#	unsigned __int32 h[5];		# current hash value base 2^32
#	unsigned __int32 pad;		# is_base2_26 in vector context
#	unsigned __int32 r[4];		# key value base 2^32

&align(64);
&function_begin("poly1305_init");
	&mov	("edi",&wparam(0));		# context
	&mov	("esi",&wparam(1));		# key
	&mov	("ebp",&wparam(2));		# function table

	&xor	("eax","eax");
	&mov	(&DWP(4*0,"edi"),"eax");	# zero hash value
	&mov	(&DWP(4*1,"edi"),"eax");
	&mov	(&DWP(4*2,"edi"),"eax");
	&mov	(&DWP(4*3,"edi"),"eax");
	&mov	(&DWP(4*4,"edi"),"eax");
	&mov	(&DWP(4*5,"edi"),"eax");	# is_base2_26

	&cmp	("esi",0);
	&je	(&label("nokey"));

    if ($sse2) {
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");

	&lea	("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
	&lea	("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));

	&picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
	&mov	("ecx",&DWP(0,"edi"));
	&and	("ecx",1<<26|1<<24);
	&cmp	("ecx",1<<26|1<<24);		# SSE2 and XMM?
	&jne	(&label("no_sse2"));

	&lea	("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
	&lea	("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));

      if ($avx>1) {
	&mov	("ecx",&DWP(8,"edi"));
	&test	("ecx",1<<5);			# AVX2?
	&jz	(&label("no_sse2"));

	&lea	("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
      }
    &set_label("no_sse2");
	&mov	("edi",&wparam(0));		# reload context
	&mov	(&DWP(0,"ebp"),"eax");		# fill function table
	&mov	(&DWP(4,"ebp"),"edx");
    }

	&mov	("eax",&DWP(4*0,"esi"));	# load input key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&and	("eax",0x0fffffff);		# clamp the key: clear top
	&and	("ebx",0x0ffffffc);		# four bits of every word
	&and	("ecx",0x0ffffffc);		# and low two bits of all
	&and	("edx",0x0ffffffc);		# but the first
	&mov	(&DWP(4*6,"edi"),"eax");
	&mov	(&DWP(4*7,"edi"),"ebx");
	&mov	(&DWP(4*8,"edi"),"ecx");
	&mov	(&DWP(4*9,"edi"),"edx");

	&mov	("eax",$sse2);
&set_label("nokey");
&function_end("poly1305_init");
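
########################################################################
# For reference, a minimal sketch of what the clamping above and the
# blocks/emit pair below compute, in plain Perl over Math::BigInt.
# This is illustrative only and is not used by this module; the name
# poly1305_ref and its calling convention are made up:
#
#	use Math::BigInt;
#
#	sub poly1305_ref {
#	my ($key,@blocks) = @_;		# 32-byte key, 16-byte blocks
#	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
#	my $r = Math::BigInt->new("0x".unpack("H*",scalar reverse substr($key,0,16)));
#	my $s = Math::BigInt->new("0x".unpack("H*",scalar reverse substr($key,16,16)));
#	$r->band(Math::BigInt->new("0x0ffffffc0ffffffc0ffffffc0fffffff"));
#	my $h = Math::BigInt->bzero();
#	foreach my $m (@blocks) {	# prepend 2^128 padbit, then Horner step
#		$h->badd(Math::BigInt->new("0x01".unpack("H*",scalar reverse $m)));
#		$h->bmul($r)->bmod($p);
#	}
#	$h->badd($s)->bmod(Math::BigInt->new(2)->bpow(128));
#	return $h;			# serialized little-endian, this is the tag
#	}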

($h0,$h1,$h2,$h3,$h4,
 $d0,$d1,$d2,$d3,
 $r0,$r1,$r2,$r3,
     $s1,$s2,$s3)=map(4*$_,(0..15));

&function_begin("poly1305_blocks");
	&mov	("edi",&wparam(0));		# ctx
	&mov	("esi",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
&set_label("enter_blocks");
	&and	("ecx",-15);
	&jz	(&label("nodata"));

	&stack_push(16);
	&mov	("eax",&DWP(4*6,"edi"));	# r0
	&mov	("ebx",&DWP(4*7,"edi"));	# r1
	 &lea	("ebp",&DWP(0,"esi","ecx"));	# end of input
	&mov	("ecx",&DWP(4*8,"edi"));	# r2
	&mov	("edx",&DWP(4*9,"edi"));	# r3

	&mov	(&wparam(2),"ebp");
	&mov	("ebp","esi");

	&mov	(&DWP($r0,"esp"),"eax");	# r0
	&mov	("eax","ebx");
	&shr	("eax",2);
	&mov	(&DWP($r1,"esp"),"ebx");	# r1
	&add	("eax","ebx");			# s1
	&mov	("ebx","ecx");
	&shr	("ebx",2);
	&mov	(&DWP($r2,"esp"),"ecx");	# r2
	&add	("ebx","ecx");			# s2
	&mov	("ecx","edx");
	&shr	("ecx",2);
	&mov	(&DWP($r3,"esp"),"edx");	# r3
	&add	("ecx","edx");			# s3
	&mov	(&DWP($s1,"esp"),"eax");	# s1
	&mov	(&DWP($s2,"esp"),"ebx");	# s2
	&mov	(&DWP($s3,"esp"),"ecx");	# s3

	&mov	("eax",&DWP(4*0,"edi"));	# load hash value
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("esi",&DWP(4*3,"edi"));
	&mov	("edi",&DWP(4*4,"edi"));
	&jmp	(&label("loop"));

&set_label("loop",32);
	&add	("eax",&DWP(4*0,"ebp"));	# accumulate input
	&adc	("ebx",&DWP(4*1,"ebp"));
	&adc	("ecx",&DWP(4*2,"ebp"));
	&adc	("esi",&DWP(4*3,"ebp"));
	&lea	("ebp",&DWP(4*4,"ebp"));
	&adc	("edi",&wparam(3));		# padbit

	&mov	(&DWP($h0,"esp"),"eax");	# put aside hash[+inp]
	&mov	(&DWP($h3,"esp"),"esi");

	&mul	(&DWP($r0,"esp"));		# h0*r0
	 &mov	(&DWP($h4,"esp"),"edi");
	&mov	("edi","eax");
	&mov	("eax","ebx");			# h1
	&mov	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h1*s3
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($s2,"esp"));		# h2*s2
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s1,"esp"));		# h3*s1
	&add	("edi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("esi","edx");

	&mul	(&DWP($r1,"esp"));		# h0*r1
	 &mov	(&DWP($d0,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h1*r0
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($s3,"esp"));		# h2*s3
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($s2,"esp"));		# h3*s2
	&add	("esi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("edi","edx");
	&imul	("eax",&DWP($s1,"esp"));	# h4*s1
	&add	("esi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("edi",0);

	&mul	(&DWP($r2,"esp"));		# h0*r2
	 &mov	(&DWP($d1,"esp"),"esi");
	&xor	("esi","esi");
	&add	("edi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("esi","edx");
	&mul	(&DWP($r1,"esp"));		# h1*r1
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($r0,"esp"));		# h2*r0
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h3*s3
	&add	("edi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("esi","edx");
	&imul	("eax",&DWP($s2,"esp"));	# h4*s2
	&add	("edi","eax");
	 &mov	("eax",&DWP($h0,"esp"));
	&adc	("esi",0);

	&mul	(&DWP($r3,"esp"));		# h0*r3
	 &mov	(&DWP($d2,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r2,"esp"));		# h1*r2
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($r1,"esp"));		# h2*r1
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h3*r0
	&add	("esi","eax");
	 &mov	("ecx",&DWP($h4,"esp"));
	&adc	("edi","edx");

	&mov	("edx","ecx");
	&imul	("ecx",&DWP($s3,"esp"));	# h4*s3
	&add	("esi","ecx");
	 &mov	("eax",&DWP($d0,"esp"));
	&adc	("edi",0);

	&imul	("edx",&DWP($r0,"esp"));	# h4*r0
	&add	("edx","edi");

	&mov	("ebx",&DWP($d1,"esp"));
	&mov	("ecx",&DWP($d2,"esp"));

	&mov	("edi","edx");			# last reduction step
	&shr	("edx",2);
	&and	("edi",3);
	&lea	("edx",&DWP(0,"edx","edx",4));	# *5
	&add	("eax","edx");
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("esi",0);
	&adc	("edi",0);
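
	################################################################
	# worked example of the step above: with h = h0 + h1*2^32 +
	# h2*2^64 + h3*2^96 + h4*2^128 and h4 = 4*c + (h4 mod 4), the
	# excess c*2^130 folds back in as 5*c, because 2^130 = 5 modulo
	# 2^130-5:
	#
	#	h = (h0 + 5*c) + h1*2^32 + ... + (h4 mod 4)*2^128  (mod 2^130-5)
	#
	# edi keeps h4 mod 4, while lea scales c = edx>>2 by 5 before
	# it is added back into the least significant word.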

	&cmp	("ebp",&wparam(2));		# done yet?
	&jne	(&label("loop"));

	&mov	("edx",&wparam(0));		# ctx
	&stack_pop(16);
	&mov	(&DWP(4*0,"edx"),"eax");	# store hash value
	&mov	(&DWP(4*1,"edx"),"ebx");
	&mov	(&DWP(4*2,"edx"),"ecx");
	&mov	(&DWP(4*3,"edx"),"esi");
	&mov	(&DWP(4*4,"edx"),"edi");
&set_label("nodata");
&function_end("poly1305_blocks");

&function_begin("poly1305_emit");
	&mov	("ebp",&wparam(0));		# context
&set_label("enter_emit");
	&mov	("edi",&wparam(1));		# output
	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
	&mov	("ebx",&DWP(4*1,"ebp"));
	&mov	("ecx",&DWP(4*2,"ebp"));
	&mov	("edx",&DWP(4*3,"ebp"));
	&mov	("esi",&DWP(4*4,"ebp"));

	&add	("eax",5);			# compare to modulus
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("edx",0);
	&adc	("esi",0);
	&shr	("esi",2);			# did it carry/borrow?
	&neg	("esi");			# do we choose hash-modulus?

	&and	("eax","esi");
	&and	("ebx","esi");
	&and	("ecx","esi");
	&and	("edx","esi");
	&mov	(&DWP(4*0,"edi"),"eax");
	&mov	(&DWP(4*1,"edi"),"ebx");
	&mov	(&DWP(4*2,"edi"),"ecx");
	&mov	(&DWP(4*3,"edi"),"edx");

	&not	("esi");			# or original hash value?
	&mov	("eax",&DWP(4*0,"ebp"));
	&mov	("ebx",&DWP(4*1,"ebp"));
	&mov	("ecx",&DWP(4*2,"ebp"));
	&mov	("edx",&DWP(4*3,"ebp"));
	&mov	("ebp",&wparam(2));
	&and	("eax","esi");
	&and	("ebx","esi");
	&and	("ecx","esi");
	&and	("edx","esi");
	&or	("eax",&DWP(4*0,"edi"));
	&or	("ebx",&DWP(4*1,"edi"));
	&or	("ecx",&DWP(4*2,"edi"));
	&or	("edx",&DWP(4*3,"edi"));

	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
	&adc	("ebx",&DWP(4*1,"ebp"));
	&adc	("ecx",&DWP(4*2,"ebp"));
	&adc	("edx",&DWP(4*3,"ebp"));

	&mov	(&DWP(4*0,"edi"),"eax");
	&mov	(&DWP(4*1,"edi"),"ebx");
	&mov	(&DWP(4*2,"edi"),"ecx");
	&mov	(&DWP(4*3,"edi"),"edx");
&function_end("poly1305_emit");
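
########################################################################
# The emit code above selects between h and h-p without branching.
# As a worked identity, with h the (partially reduced) 130-bit hash
# and s the second half of the key:
#
#	mask = 0 - ((h + 5) >> 130)	# all-ones iff h + 5 carries, i.e. h >= p
#	h'   = ((h + 5) & mask) | (h & ~mask)
#	tag  = (h' + s) mod 2^128
#
# which is what the add/adc, shr/neg and and/not/or sequences implement.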

if ($sse2) {
########################################################################
# Layout of the opaque area is as follows.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int32 r[4];		# key value base 2^32
#	unsigned __int32 pad[2];
#	struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
#
# where r^n are the base 2^26 digits of the powers of the multiplier
# key. There are 5 digits, but the last four are interleaved with their
# multiples of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2,
# r3, 5*r3, r4, 5*r4.
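
########################################################################
# For reference, the base 2^32 -> base 2^26 conversion performed by
# _poly1305_init_sse2 below, sketched in plain Perl (illustrative only,
# assuming $r already holds the clamped 128-bit key as a Math::BigInt):
#
#	my @d = map { $r->copy()->brsft(26*$_)->band(0x3ffffff) } (0..4);
#
# i.e. five 26-bit digits, d[0] being the least significant.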

my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
my $MASK=$T2;	# borrow and keep in mind

&align	(32);
&function_begin_B("_poly1305_init_sse2");
	&movdqu		($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");
	&sub		("esp",16*(9+5));
	&and		("esp",-16);

	#&pand		($D4,&QWP(96,"ebx"));		# magic mask
	&movq		($MASK,&QWP(64,"ebx"));

	&movdqa		($D0,$D4);
	&movdqa		($D1,$D4);
	&movdqa		($D2,$D4);

	&pand		($D0,$MASK);			# -> base 2^26
	&psrlq		($D1,26);
	&psrldq		($D2,6);
	&pand		($D1,$MASK);
	&movdqa		($D3,$D2);
	&psrlq		($D2,4);
	&psrlq		($D3,30);
	&pand		($D2,$MASK);
	&pand		($D3,$MASK);
	&psrldq		($D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);
&set_label("square");
	&movdqa		(&QWP(16*0,"esp"),$D0);
	&movdqa		(&QWP(16*1,"esp"),$D1);
	&movdqa		(&QWP(16*2,"esp"),$D2);
	&movdqa		(&QWP(16*3,"esp"),$D3);
	&movdqa		(&QWP(16*4,"esp"),$D4);

	&movdqa		($T1,$D1);
	&movdqa		($T0,$D2);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D1);			# *5
	&paddd		($T0,$D2);			# *5
	&movdqa		(&QWP(16*5,"esp"),$T1);
	&movdqa		(&QWP(16*6,"esp"),$T0);
	&movdqa		($T1,$D3);
	&movdqa		($T0,$D4);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D3);			# *5
	&paddd		($T0,$D4);			# *5
	&movdqa		(&QWP(16*7,"esp"),$T1);
	&movdqa		(&QWP(16*8,"esp"),$T0);

	&pshufd		($T1,$D0,0b01000100);
	&movdqa		($T0,$D1);
	&pshufd		($D1,$D1,0b01000100);
	&pshufd		($D2,$D2,0b01000100);
	&pshufd		($D3,$D3,0b01000100);
	&pshufd		($D4,$D4,0b01000100);
	&movdqa		(&QWP(16*0,"edx"),$T1);
	&movdqa		(&QWP(16*1,"edx"),$D1);
	&movdqa		(&QWP(16*2,"edx"),$D2);
	&movdqa		(&QWP(16*3,"edx"),$D3);
	&movdqa		(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
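
	################################################################
	# the 5*r_j multiples in the formulas above come from the radix:
	# a digit product h_i*r_j carries weight 2^(26*(i+j)), and
	# whenever i+j >= 5 it reduces as
	#
	#	2^(26*(i+j)) = 5*2^(26*(i+j-5))	(mod 2^130-5)
	#
	# which is why the table interleaves r_j with 5*r_j.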

	&pmuludq	($D4,$D0);			# h4*r0
	&pmuludq	($D3,$D0);			# h3*r0
	&pmuludq	($D2,$D0);			# h2*r0
	&pmuludq	($D1,$D0);			# h1*r0
	&pmuludq	($D0,$T1);			# h0*r0

sub pmuladd {
my $load = shift;
my $base = shift; $base = "esp" if (!defined($base));

	################################################################
	# $T0-$T2 are "rotated" in order to move each paddq past the
	# next multiplication. While this makes the code harder to read
	# and has no significant effect on most processors, it makes
	# a lot of difference on Atom, up to 30% improvement.

	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*3,$base));		# r1*h3
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*2,$base));		# r1*h2
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*1,$base));		# r1*h1
	&paddq		($D3,$T1);
	&$load		($T1,5);			# s1
	&pmuludq	($T0,&QWP(16*0,$base));		# r1*h0
	&paddq		($D2,$T2);
	&pmuludq	($T1,&QWP(16*4,$base));		# s1*h4
	 &$load		($T2,2);			# r2^n
	&paddq		($D1,$T0);

	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*2,$base));		# r2*h2
	 &paddq		($D0,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*1,$base));		# r2*h1
	&paddq		($D4,$T2);
	&$load		($T2,6);			# s2^n
	&pmuludq	($T1,&QWP(16*0,$base));		# r2*h0
	&paddq		($D3,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*4,$base));		# s2*h4
	&paddq		($D2,$T1);
	&pmuludq	($T0,&QWP(16*3,$base));		# s2*h3
	 &$load		($T1,3);			# r3^n
	&paddq		($D1,$T2);

	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*1,$base));		# r3*h1
	 &paddq		($D0,$T0);
	&$load		($T0,7);			# s3^n
	&pmuludq	($T2,&QWP(16*0,$base));		# r3*h0
	&paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*4,$base));		# s3*h4
	&paddq		($D3,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*3,$base));		# s3*h3
	&paddq		($D2,$T0);
	&pmuludq	($T2,&QWP(16*2,$base));		# s3*h2
	 &$load		($T0,4);			# r4^n
	&paddq		($D1,$T1);

	&$load		($T1,8);			# s4^n
	&pmuludq	($T0,&QWP(16*0,$base));		# r4*h0
	 &paddq		($D0,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&QWP(16*4,$base));		# s4*h4
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&QWP(16*1,$base));		# s4*h1
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&QWP(16*2,$base));		# s4*h2
	&paddq		($D0,$T2);
	&pmuludq	($T1,&QWP(16*3,$base));		# s4*h3
	 &movdqa	($MASK,&QWP(64,"ebx"));
	&paddq		($D1,$T0);
	&paddq		($D2,$T1);
}
	&pmuladd	(sub {	my ($reg,$i)=@_;
				&movdqa ($reg,&QWP(16*$i,"esp"));
			     },"edx");

sub lazy_reduction {
my $extra = shift;

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe
	#
	# [(*) see discussion in poly1305-armv4 module]

	 &movdqa	($T0,$D3);
	 &pand		($D3,$MASK);
	 &psrlq		($T0,26);
	 &$extra	()				if (defined($extra));
	 &paddq		($T0,$D4);			# h3 -> h4
	&movdqa		($T1,$D0);
	&pand		($D0,$MASK);
	&psrlq		($T1,26);
	 &movdqa	($D4,$T0);
	&paddq		($T1,$D1);			# h0 -> h1
	 &psrlq		($T0,26);
	 &pand		($D4,$MASK);
	&movdqa		($D1,$T1);
	&psrlq		($T1,26);
	 &paddd		($D0,$T0);			# favour paddd when
							# possible, because
							# paddq is "broken"
							# on Atom
	 &psllq		($T0,2);
	&paddq		($T1,$D2);			# h1 -> h2
	 &paddq		($T0,$D0);			# h4 -> h0 (*)
	&pand		($D1,$MASK);
	&movdqa		($D2,$T1);
	&psrlq		($T1,26);
	&pand		($D2,$MASK);
	&paddd		($T1,$D3);			# h2 -> h3
	 &movdqa	($D0,$T0);
	 &psrlq		($T0,26);
	&movdqa		($D3,$T1);
	&psrlq		($T1,26);
	 &pand		($D0,$MASK);
	 &paddd		($D1,$T0);			# h0 -> h1
	&pand		($D3,$MASK);
	&paddd		($D4,$T1);			# h3 -> h4
}
	&lazy_reduction	();
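
	################################################################
	# rough bound for the pass above: with limbs below ~2^27 and
	# coefficients r_j, 5*r_j below 2^29, each d_i is a sum of five
	# products below 2^56, comfortably within the 64-bit lanes, and
	# one interleaved carry pass brings every limb back to roughly
	# 26 bits, which is all the next multiplication needs.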

	&dec		("ecx");
	&jz		(&label("square_break"));

	&punpcklqdq	($D0,&QWP(16*0,"esp"));		# 0:r^1:0:r^2
	&punpcklqdq	($D1,&QWP(16*1,"esp"));
	&punpcklqdq	($D2,&QWP(16*2,"esp"));
	&punpcklqdq	($D3,&QWP(16*3,"esp"));
	&punpcklqdq	($D4,&QWP(16*4,"esp"));
	&jmp		(&label("square"));

&set_label("square_break");
	&psllq		($D0,32);			# -> r^3:0:r^4:0
	&psllq		($D1,32);
	&psllq		($D2,32);
	&psllq		($D3,32);
	&psllq		($D4,32);
	&por		($D0,&QWP(16*0,"esp"));		# r^3:r^1:r^4:r^2
	&por		($D1,&QWP(16*1,"esp"));
	&por		($D2,&QWP(16*2,"esp"));
	&por		($D3,&QWP(16*3,"esp"));
	&por		($D4,&QWP(16*4,"esp"));

	&pshufd		($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
	&pshufd		($D1,$D1,0b10001101);
	&pshufd		($D2,$D2,0b10001101);
	&pshufd		($D3,$D3,0b10001101);
	&pshufd		($D4,$D4,0b10001101);

	&movdqu		(&QWP(16*0,"edi"),$D0);		# save the table
	&movdqu		(&QWP(16*1,"edi"),$D1);
	&movdqu		(&QWP(16*2,"edi"),$D2);
	&movdqu		(&QWP(16*3,"edi"),$D3);
	&movdqu		(&QWP(16*4,"edi"),$D4);

	&movdqa		($T1,$D1);
	&movdqa		($T0,$D2);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D1);			# *5
	&paddd		($T0,$D2);			# *5
	&movdqu		(&QWP(16*5,"edi"),$T1);
	&movdqu		(&QWP(16*6,"edi"),$T0);
	&movdqa		($T1,$D3);
	&movdqa		($T0,$D4);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D3);			# *5
	&paddd		($T0,$D4);			# *5
	&movdqu		(&QWP(16*7,"edi"),$T1);
	&movdqu		(&QWP(16*8,"edi"),$T0);

	&mov		("esp","ebp");
	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
	&ret		();
&function_end_B("_poly1305_init_sse2");

&align	(32);
&function_begin("_poly1305_blocks_sse2");
	&mov	("edi",&wparam(0));			# ctx
	&mov	("esi",&wparam(1));			# inp
	&mov	("ecx",&wparam(2));			# len

	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
	&and	("ecx",-16);
	&jz	(&label("nodata"));
	&cmp	("ecx",64);
	&jae	(&label("enter_sse2"));
	&test	("eax","eax");				# is_base2_26?
	&jz	(&label("enter_blocks"));

&set_label("enter_sse2",16);
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));

	&test	("eax","eax");				# is_base2_26?
	&jnz	(&label("base2_26"));

	&call	("_poly1305_init_sse2");

	################################################# base 2^32 -> base 2^26
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ecx",&DWP(3,"edi"));
	&mov	("edx",&DWP(6,"edi"));
	&mov	("esi",&DWP(9,"edi"));
	&mov	("ebp",&DWP(13,"edi"));
	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26

	&shr	("ecx",2);
	&and	("eax",0x3ffffff);
	&shr	("edx",4);
	&and	("ecx",0x3ffffff);
	&shr	("esi",6);
	&and	("edx",0x3ffffff);

	&movd	($D0,"eax");
	&movd	($D1,"ecx");
	&movd	($D2,"edx");
	&movd	($D3,"esi");
	&movd	($D4,"ebp");

	&mov	("esi",&wparam(1));			# [reload] inp
	&mov	("ecx",&wparam(2));			# [reload] len
	&jmp	(&label("base2_32"));
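
	################################################################
	# the 0/3/6/9/13 byte offsets above pick each 26-bit digit out
	# of the base 2^32 value with a single unaligned load each:
	#
	#	h0 =  le32(p+0)        & 0x3ffffff
	#	h1 = (le32(p+3) >> 2)  & 0x3ffffff
	#	h2 = (le32(p+6) >> 4)  & 0x3ffffff
	#	h3 =  le32(p+9) >> 6		# high 6 bits already zero
	#	h4 =  le32(p+13)		# bits 104 and up, carry included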

&set_label("base2_26",16);
	&movd	($D0,&DWP(4*0,"edi"));			# load hash value
	&movd	($D1,&DWP(4*1,"edi"));
	&movd	($D2,&DWP(4*2,"edi"));
	&movd	($D3,&DWP(4*3,"edi"));
	&movd	($D4,&DWP(4*4,"edi"));
	&movdqa	($MASK,&QWP(64,"ebx"));

&set_label("base2_32");
	&mov	("eax",&wparam(3));			# padbit
	&mov	("ebp","esp");

	&sub	("esp",16*(5+5+5+9+9));
	&and	("esp",-16);

	&lea	("edi",&DWP(16*3,"edi"));		# size optimization
	&shl	("eax",24);				# padbit

	&test	("ecx",31);
	&jz	(&label("even"));

	################################################################
	# process a single block, with SSE2, because it's still faster
	# even though half of the result is discarded

	&movdqu		($T1,&QWP(0,"esi"));		# input
	&lea		("esi",&DWP(16,"esi"));

	&movdqa		($T0,$T1);			# -> base 2^26 ...
	&pand		($T1,$MASK);
	&paddd		($D0,$T1);			# ... and accumulate

	&movdqa		($T1,$T0);
	&psrlq		($T0,26);
	&psrldq		($T1,6);
	&pand		($T0,$MASK);
	&paddd		($D1,$T0);

	&movdqa		($T0,$T1);
	&psrlq		($T1,4);
	&pand		($T1,$MASK);
	&paddd		($D2,$T1);

	&movdqa		($T1,$T0);
	&psrlq		($T0,30);
	&pand		($T0,$MASK);
	&psrldq		($T1,7);
	&paddd		($D3,$T0);

	&movd		($T0,"eax");			# padbit
	&paddd		($D4,$T1);
	 &movd		($T1,&DWP(16*0+12,"edi"));	# r0
	&paddd		($D4,$T0);

	&movdqa		(&QWP(16*0,"esp"),$D0);
	&movdqa		(&QWP(16*1,"esp"),$D1);
	&movdqa		(&QWP(16*2,"esp"),$D2);
	&movdqa		(&QWP(16*3,"esp"),$D3);
	&movdqa		(&QWP(16*4,"esp"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq	($D0,$T1);			# h0*r0
	&pmuludq	($D1,$T1);			# h1*r0
	&pmuludq	($D2,$T1);			# h2*r0
	 &movd		($T0,&DWP(16*1+12,"edi"));	# r1
	&pmuludq	($D3,$T1);			# h3*r0
	&pmuludq	($D4,$T1);			# h4*r0

	&pmuladd	(sub {	my ($reg,$i)=@_;
				&movd ($reg,&DWP(16*$i+12,"edi"));
			     });

	&lazy_reduction	();

	&sub		("ecx",16);
	&jz		(&label("done"));

&set_label("even");
	&lea		("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
	&lea		("eax",&DWP(-16*2,"esi"));
	&sub		("ecx",64);

	################################################################
	# expand and copy pre-calculated table to stack

	&movdqu		($T0,&QWP(16*0,"edi"));		# r^1:r^2:r^3:r^4
	&pshufd		($T1,$T0,0b01000100);		# duplicate r^3:r^4
	&cmovb		("esi","eax");
	&pshufd		($T0,$T0,0b11101110);		# duplicate r^1:r^2
	&movdqa		(&QWP(16*0,"edx"),$T1);
	&lea		("eax",&DWP(16*10,"esp"));
	&movdqu		($T1,&QWP(16*1,"edi"));
	&movdqa		(&QWP(16*(0-9),"edx"),$T0);
	&pshufd		($T0,$T1,0b01000100);
	&pshufd		($T1,$T1,0b11101110);
	&movdqa		(&QWP(16*1,"edx"),$T0);
	&movdqu		($T0,&QWP(16*2,"edi"));
	&movdqa		(&QWP(16*(1-9),"edx"),$T1);
	&pshufd		($T1,$T0,0b01000100);
	&pshufd		($T0,$T0,0b11101110);
	&movdqa		(&QWP(16*2,"edx"),$T1);
	&movdqu		($T1,&QWP(16*3,"edi"));
	&movdqa		(&QWP(16*(2-9),"edx"),$T0);
	&pshufd		($T0,$T1,0b01000100);
	&pshufd		($T1,$T1,0b11101110);
	&movdqa		(&QWP(16*3,"edx"),$T0);
	&movdqu		($T0,&QWP(16*4,"edi"));
	&movdqa		(&QWP(16*(3-9),"edx"),$T1);
	&pshufd		($T1,$T0,0b01000100);
	&pshufd		($T0,$T0,0b11101110);
	&movdqa		(&QWP(16*4,"edx"),$T1);
	&movdqu		($T1,&QWP(16*5,"edi"));
	&movdqa		(&QWP(16*(4-9),"edx"),$T0);
	&pshufd		($T0,$T1,0b01000100);
	&pshufd		($T1,$T1,0b11101110);
	&movdqa		(&QWP(16*5,"edx"),$T0);
	&movdqu		($T0,&QWP(16*6,"edi"));
	&movdqa		(&QWP(16*(5-9),"edx"),$T1);
	&pshufd		($T1,$T0,0b01000100);
	&pshufd		($T0,$T0,0b11101110);
	&movdqa		(&QWP(16*6,"edx"),$T1);
	&movdqu		($T1,&QWP(16*7,"edi"));
	&movdqa		(&QWP(16*(6-9),"edx"),$T0);
	&pshufd		($T0,$T1,0b01000100);
	&pshufd		($T1,$T1,0b11101110);
	&movdqa		(&QWP(16*7,"edx"),$T0);
	&movdqu		($T0,&QWP(16*8,"edi"));
	&movdqa		(&QWP(16*(7-9),"edx"),$T1);
	&pshufd		($T1,$T0,0b01000100);
	&pshufd		($T0,$T0,0b11101110);
	&movdqa		(&QWP(16*8,"edx"),$T1);
	&movdqa		(&QWP(16*(8-9),"edx"),$T0);

sub load_input {
my ($inpbase,$offbase)=@_;

	&movdqu		($T0,&QWP($inpbase+0,"esi"));	# load input
	&movdqu		($T1,&QWP($inpbase+16,"esi"));
	&lea		("esi",&DWP(16*2,"esi"));

	&movdqa		(&QWP($offbase+16*2,"esp"),$D2);
	&movdqa		(&QWP($offbase+16*3,"esp"),$D3);
	&movdqa		(&QWP($offbase+16*4,"esp"),$D4);

	&movdqa		($D2,$T0);			# splat input
	&movdqa		($D3,$T1);
	&psrldq		($D2,6);
	&psrldq		($D3,6);
	&movdqa		($D4,$T0);
	&punpcklqdq	($D2,$D3);			# 2:3
	&punpckhqdq	($D4,$T1);			# 4
	&punpcklqdq	($T0,$T1);			# 0:1

	&movdqa		($D3,$D2);
	&psrlq		($D2,4);
	&psrlq		($D3,30);
	&movdqa		($T1,$T0);
	&psrlq		($D4,40);			# 4
	&psrlq		($T1,26);
	&pand		($T0,$MASK);			# 0
	&pand		($T1,$MASK);			# 1
	&pand		($D2,$MASK);			# 2
	&pand		($D3,$MASK);			# 3
	&por		($D4,&QWP(0,"ebx"));		# padbit, yes, always

	&movdqa		(&QWP($offbase+16*0,"esp"),$D0)	if ($offbase);
	&movdqa		(&QWP($offbase+16*1,"esp"),$D1)	if ($offbase);
}
	&load_input	(16*2,16*5);
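
	################################################################
	# load_input above distributes two 16-byte blocks over the five
	# digit vectors, one block per 64-bit lane, and ORs in the 2^128
	# padbit as the 1<<24 word from the constant table: digit 4
	# covers bits 104-129, so bit 128 lands at position 128-104=24.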

	&jbe		(&label("skip_loop"));
	&jmp		(&label("loop"));

&set_label("loop",32);
	################################################################
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	#   \___________________/
	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	#   \___________________/ \____________________/
	################################################################

	&movdqa		($T2,&QWP(16*(0-9),"edx"));	# r0^2
	&movdqa		(&QWP(16*1,"eax"),$T1);
	&movdqa		(&QWP(16*2,"eax"),$D2);
	&movdqa		(&QWP(16*3,"eax"),$D3);
	&movdqa		(&QWP(16*4,"eax"),$D4);

	################################################################
	# d4 = h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
	# d3 = h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
	# d2 = h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
	# d1 = h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
	# d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	&movdqa		($D1,$T0);
	&pmuludq	($T0,$T2);			# h0*r0
	&movdqa		($D0,$T1);
	&pmuludq	($T1,$T2);			# h1*r0
	&pmuludq	($D2,$T2);			# h2*r0
	&pmuludq	($D3,$T2);			# h3*r0
	&pmuludq	($D4,$T2);			# h4*r0

sub pmuladd_alt {
my $addr = shift;

	&pmuludq	($D0,&$addr(8));		# h1*s4
	&movdqa		($T2,$D1);
	&pmuludq	($D1,&$addr(1));		# h0*r1
	&paddq		($D0,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&$addr(2));		# h0*r2
	&paddq		($D1,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&$addr(3));		# h0*r3
	&paddq		($D2,$T2);
	 &movdqa	($T2,&QWP(16*1,"eax"));		# pull h1
	&pmuludq	($T1,&$addr(4));		# h0*r4
	&paddq		($D3,$T0);

	&movdqa		($T0,$T2);
	&pmuludq	($T2,&$addr(1));		# h1*r1
	 &paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&$addr(2));		# h1*r2
	&paddq		($D2,$T2);
	&movdqa		($T2,&QWP(16*2,"eax"));		# pull h2
	&pmuludq	($T1,&$addr(3));		# h1*r3
	&paddq		($D3,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&$addr(7));		# h2*s3
	&paddq		($D4,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&$addr(8));		# h2*s4
	&paddq		($D0,$T2);

	&movdqa		($T2,$T1);
	&pmuludq	($T1,&$addr(1));		# h2*r1
	 &paddq		($D1,$T0);
	&movdqa		($T0,&QWP(16*3,"eax"));		# pull h3
	&pmuludq	($T2,&$addr(2));		# h2*r2
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&$addr(6));		# h3*s2
	&paddq		($D4,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&$addr(7));		# h3*s3
	&paddq		($D0,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&$addr(8));		# h3*s4
	&paddq		($D1,$T1);

	&movdqa		($T1,&QWP(16*4,"eax"));		# pull h4
	&pmuludq	($T0,&$addr(1));		# h3*r1
	 &paddq		($D2,$T2);
	&movdqa		($T2,$T1);
	&pmuludq	($T1,&$addr(8));		# h4*s4
	&paddq		($D4,$T0);
	&movdqa		($T0,$T2);
	&pmuludq	($T2,&$addr(5));		# h4*s1
	&paddq		($D3,$T1);
	&movdqa		($T1,$T0);
	&pmuludq	($T0,&$addr(6));		# h4*s2
	&paddq		($D0,$T2);
	 &movdqa	($MASK,&QWP(64,"ebx"));
	&pmuludq	($T1,&$addr(7));		# h4*s3
	&paddq		($D1,$T0);
	&paddq		($D2,$T1);
}
	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*($i-9),"edx");	});

	&load_input	(-16*2,0);
	&lea		("eax",&DWP(-16*2,"esi"));
	&sub		("ecx",64);

	&paddd		($T0,&QWP(16*(5+0),"esp"));	# add hash value
	&paddd		($T1,&QWP(16*(5+1),"esp"));
	&paddd		($D2,&QWP(16*(5+2),"esp"));
	&paddd		($D3,&QWP(16*(5+3),"esp"));
	&paddd		($D4,&QWP(16*(5+4),"esp"));

	&cmovb		("esi","eax");
	&lea		("eax",&DWP(16*10,"esp"));

	&movdqa		($T2,&QWP(16*0,"edx"));		# r0^4
	&movdqa		(&QWP(16*1,"esp"),$D1);
	&movdqa		(&QWP(16*1,"eax"),$T1);
	&movdqa		(&QWP(16*2,"eax"),$D2);
	&movdqa		(&QWP(16*3,"eax"),$D3);
	&movdqa		(&QWP(16*4,"eax"),$D4);

	################################################################
	# d4 += h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
	# d3 += h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
	# d2 += h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
	# d1 += h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
	# d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	&movdqa		($D1,$T0);
	&pmuludq	($T0,$T2);			# h0*r0
	&paddq		($T0,$D0);
	&movdqa		($D0,$T1);
	&pmuludq	($T1,$T2);			# h1*r0
	&pmuludq	($D2,$T2);			# h2*r0
	&pmuludq	($D3,$T2);			# h3*r0
	&pmuludq	($D4,$T2);			# h4*r0

	&paddq		($T1,&QWP(16*1,"esp"));
	&paddq		($D2,&QWP(16*2,"esp"));
	&paddq		($D3,&QWP(16*3,"esp"));
	&paddq		($D4,&QWP(16*4,"esp"));

	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*$i,"edx");	});

	&lazy_reduction	();

	&load_input	(16*2,16*5);

	&ja		(&label("loop"));

&set_label("skip_loop");
	################################################################
	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	 &pshufd	($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
	&add		("ecx",32);
	&jnz		(&label("long_tail"));

	&paddd		($T0,$D0);			# add hash value
	&paddd		($T1,$D1);
	&paddd		($D2,&QWP(16*7,"esp"));
	&paddd		($D3,&QWP(16*8,"esp"));
	&paddd		($D4,&QWP(16*9,"esp"));

&set_label("long_tail");

	&movdqa		(&QWP(16*0,"eax"),$T0);
	&movdqa		(&QWP(16*1,"eax"),$T1);
	&movdqa		(&QWP(16*2,"eax"),$D2);
	&movdqa		(&QWP(16*3,"eax"),$D3);
	&movdqa		(&QWP(16*4,"eax"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq	($T0,$T2);			# h0*r0
	&pmuludq	($T1,$T2);			# h1*r0
	&pmuludq	($D2,$T2);			# h2*r0
	&movdqa		($D0,$T0);
	 &pshufd	($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
	&pmuludq	($D3,$T2);			# h3*r0
	&movdqa		($D1,$T1);
	&pmuludq	($D4,$T2);			# h4*r0

	&pmuladd	(sub {	my ($reg,$i)=@_;
				&pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
			     },"eax");

	&jz		(&label("short_tail"));

	&load_input	(-16*2,0);

	 &pshufd	($T2,&QWP(16*0,"edx"),0x10);	# r0^n
	&paddd		($T0,&QWP(16*5,"esp"));		# add hash value
	&paddd		($T1,&QWP(16*6,"esp"));
	&paddd		($D2,&QWP(16*7,"esp"));
	&paddd		($D3,&QWP(16*8,"esp"));
	&paddd		($D4,&QWP(16*9,"esp"));

	################################################################
	# multiply inp[0:1] by r^4:r^3 and accumulate

	&movdqa		(&QWP(16*0,"esp"),$T0);
	&pmuludq	($T0,$T2);			# h0*r0
	&movdqa		(&QWP(16*1,"esp"),$T1);
	&pmuludq	($T1,$T2);			# h1*r0
	&paddq		($D0,$T0);
	&movdqa		($T0,$D2);
	&pmuludq	($D2,$T2);			# h2*r0
	&paddq		($D1,$T1);
	&movdqa		($T1,$D3);
	&pmuludq	($D3,$T2);			# h3*r0
	&paddq		($D2,&QWP(16*2,"esp"));
	&movdqa		(&QWP(16*2,"esp"),$T0);
	 &pshufd	($T0,&QWP(16*1,"edx"),0x10);	# r1^n
	&paddq		($D3,&QWP(16*3,"esp"));
	&movdqa		(&QWP(16*3,"esp"),$T1);
	&movdqa		($T1,$D4);
	&pmuludq	($D4,$T2);			# h4*r0
	&paddq		($D4,&QWP(16*4,"esp"));
	&movdqa		(&QWP(16*4,"esp"),$T1);

	&pmuladd	(sub {	my ($reg,$i)=@_;
				&pshufd ($reg,&QWP(16*$i,"edx"),0x10);
			     });

&set_label("short_tail");

	################################################################
	# horizontal addition

	&pshufd		($T1,$D4,0b01001110);
	&pshufd		($T0,$D3,0b01001110);
	&paddq		($D4,$T1);
	&paddq		($D3,$T0);
	&pshufd		($T1,$D0,0b01001110);
	&pshufd		($T0,$D1,0b01001110);
	&paddq		($D0,$T1);
	&paddq		($D1,$T0);
	&pshufd		($T1,$D2,0b01001110);
	#&paddq		($D2,$T1);

	&lazy_reduction	(sub { &paddq ($D2,$T1) });

&set_label("done");
	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
	&movd		(&DWP(-16*3+4*1,"edi"),$D1);
	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
	&mov	("esp","ebp");
&set_label("nodata");
&function_end("_poly1305_blocks_sse2");

&align	(32);
&function_begin("_poly1305_emit_sse2");
	&mov	("ebp",&wparam(0));		# context

	&cmp	(&DWP(4*5,"ebp"),0);		# is_base2_26?
	&je	(&label("enter_emit"));

	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
	&mov	("edi",&DWP(4*1,"ebp"));
	&mov	("ecx",&DWP(4*2,"ebp"));
	&mov	("edx",&DWP(4*3,"ebp"));
	&mov	("esi",&DWP(4*4,"ebp"));

	&mov	("ebx","edi");			# base 2^26 -> base 2^32
	&shl	("edi",26);
	&shr	("ebx",6);
	&add	("eax","edi");
	&mov	("edi","ecx");
	&adc	("ebx",0);

	&shl	("edi",20);
	&shr	("ecx",12);
	&add	("ebx","edi");
	&mov	("edi","edx");
	&adc	("ecx",0);

	&shl	("edi",14);
	&shr	("edx",18);
	&add	("ecx","edi");
	&mov	("edi","esi");
	&adc	("edx",0);

	&shl	("edi",8);
	&shr	("esi",24);
	&add	("edx","edi");
	&adc	("esi",0);			# can be partially reduced

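	################################################################
	# the shl/shr pairs above change radix according to
	#
	#	h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104
	#
	# i.e. word0 = h0 + (h1<<26), word1 = (h1>>6) + (h2<<20),
	# word2 = (h2>>12) + (h3<<14), word3 = (h3>>18) + (h4<<8),
	# with the adc chain propagating carries between words.
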
	&mov	("edi","esi");			# final reduction
	&and	("esi",3);
	&shr	("edi",2);
	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
	 &mov	("edi",&wparam(1));		# output
	&add	("eax","ebp");
	 &mov	("ebp",&wparam(2));		# key
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("edx",0);
	&adc	("esi",0);

	&movd	($D0,"eax");			# offload original hash value
	&add	("eax",5);			# compare to modulus
	&movd	($D1,"ebx");
	&adc	("ebx",0);
	&movd	($D2,"ecx");
	&adc	("ecx",0);
	&movd	($D3,"edx");
	&adc	("edx",0);
	&adc	("esi",0);
	&shr	("esi",2);			# did it carry/borrow?

	&neg	("esi");			# do we choose (hash-modulus) ...
	&and	("eax","esi");
	&and	("ebx","esi");
	&and	("ecx","esi");
	&and	("edx","esi");
	&mov	(&DWP(4*0,"edi"),"eax");
	&movd	("eax",$D0);
	&mov	(&DWP(4*1,"edi"),"ebx");
	&movd	("ebx",$D1);
	&mov	(&DWP(4*2,"edi"),"ecx");
	&movd	("ecx",$D2);
	&mov	(&DWP(4*3,"edi"),"edx");
	&movd	("edx",$D3);

	&not	("esi");			# ... or original hash value?
	&and	("eax","esi");
	&and	("ebx","esi");
	&or	("eax",&DWP(4*0,"edi"));
	&and	("ecx","esi");
	&or	("ebx",&DWP(4*1,"edi"));
	&and	("edx","esi");
	&or	("ecx",&DWP(4*2,"edi"));
	&or	("edx",&DWP(4*3,"edi"));

	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
	&adc	("ebx",&DWP(4*1,"ebp"));
	&mov	(&DWP(4*0,"edi"),"eax");
	&adc	("ecx",&DWP(4*2,"ebp"));
	&mov	(&DWP(4*1,"edi"),"ebx");
	&adc	("edx",&DWP(4*3,"ebp"));
	&mov	(&DWP(4*2,"edi"),"ecx");
	&mov	(&DWP(4*3,"edi"),"edx");
&function_end("_poly1305_emit_sse2");

if ($avx>1) {
########################################################################
# Note that _poly1305_init_avx2 operates on %xmm; I could have used
# _poly1305_init_sse2 instead...

&align	(32);
&function_begin_B("_poly1305_init_avx2");
	&vmovdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");
	&sub		("esp",16*(9+5));
	&and		("esp",-16);

	#&vpand		($D4,$D4,&QWP(96,"ebx"));	# magic mask
	&vmovdqa	($MASK,&QWP(64,"ebx"));

	&vpand		($D0,$D4,$MASK);		# -> base 2^26
	&vpsrlq		($D1,$D4,26);
	&vpsrldq	($D3,$D4,6);
	&vpand		($D1,$D1,$MASK);
	&vpsrlq		($D2,$D3,4);
	&vpsrlq		($D3,$D3,30);
	&vpand		($D2,$D2,$MASK);
	&vpand		($D3,$D3,$MASK);
	&vpsrldq	($D4,$D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);
&set_label("square");
	&vmovdqa	(&QWP(16*0,"esp"),$D0);
	&vmovdqa	(&QWP(16*1,"esp"),$D1);
	&vmovdqa	(&QWP(16*2,"esp"),$D2);
	&vmovdqa	(&QWP(16*3,"esp"),$D3);
	&vmovdqa	(&QWP(16*4,"esp"),$D4);

	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqa	(&QWP(16*5,"esp"),$T1);
	&vmovdqa	(&QWP(16*6,"esp"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqa	(&QWP(16*7,"esp"),$T1);
	&vmovdqa	(&QWP(16*8,"esp"),$T0);

	&vpshufd	($T0,$D0,0b01000100);
	&vmovdqa	($T1,$D1);
	&vpshufd	($D1,$D1,0b01000100);
	&vpshufd	($D2,$D2,0b01000100);
	&vpshufd	($D3,$D3,0b01000100);
	&vpshufd	($D4,$D4,0b01000100);
	&vmovdqa	(&QWP(16*0,"edx"),$T0);
	&vmovdqa	(&QWP(16*1,"edx"),$D1);
	&vmovdqa	(&QWP(16*2,"edx"),$D2);
	&vmovdqa	(&QWP(16*3,"edx"),$D3);
	&vmovdqa	(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&vpmuludq	($D4,$D4,$D0);			# h4*r0
	&vpmuludq	($D3,$D3,$D0);			# h3*r0
	&vpmuludq	($D2,$D2,$D0);			# h2*r0
	&vpmuludq	($D1,$D1,$D0);			# h1*r0
	&vpmuludq	($D0,$T0,$D0);			# h0*r0

	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# r1*h3
	&vpaddq		($D4,$D4,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*2,"edx"));	# r1*h2
	&vpaddq		($D3,$D3,$T2);
	&vpmuludq	($T0,$T1,&QWP(16*1,"edx"));	# r1*h1
	&vpaddq		($D2,$D2,$T0);
	&vmovdqa	($T2,&QWP(16*5,"esp"));		# s1
	&vpmuludq	($T1,$T1,&QWP(16*0,"edx"));	# r1*h0
	&vpaddq		($D1,$D1,$T1);
	 &vmovdqa	($T0,&QWP(16*2,"esp"));		# r2
	&vpmuludq	($T2,$T2,&QWP(16*4,"edx"));	# s1*h4
	&vpaddq		($D0,$D0,$T2);

	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# r2*h2
	&vpaddq		($D4,$D4,$T1);
	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r2*h1
	&vpaddq		($D3,$D3,$T2);
	&vmovdqa	($T1,&QWP(16*6,"esp"));		# s2
	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r2*h0
	&vpaddq		($D2,$D2,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s2*h4
	&vpaddq		($D1,$D1,$T2);
	 &vmovdqa	($T0,&QWP(16*3,"esp"));		# r3
	&vpmuludq	($T1,$T1,&QWP(16*3,"edx"));	# s2*h3
	&vpaddq		($D0,$D0,$T1);

	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r3*h1
	&vpaddq		($D4,$D4,$T2);
	&vmovdqa	($T1,&QWP(16*7,"esp"));		# s3
	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r3*h0
	&vpaddq		($D3,$D3,$T0);
	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s3*h4
	&vpaddq		($D2,$D2,$T2);
	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# s3*h3
	&vpaddq		($D1,$D1,$T0);
	 &vmovdqa	($T2,&QWP(16*4,"esp"));		# r4
	&vpmuludq	($T1,$T1,&QWP(16*2,"edx"));	# s3*h2
	&vpaddq		($D0,$D0,$T1);

	&vmovdqa	($T0,&QWP(16*8,"esp"));		# s4
	&vpmuludq	($T2,$T2,&QWP(16*0,"edx"));	# r4*h0
	&vpaddq		($D4,$D4,$T2);
	&vpmuludq	($T1,$T0,&QWP(16*4,"edx"));	# s4*h4
	&vpaddq		($D3,$D3,$T1);
	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# s4*h1
	&vpaddq		($D0,$D0,$T2);
	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# s4*h2
	&vpaddq		($D1,$D1,$T1);
	 &vmovdqa	($MASK,&QWP(64,"ebx"));
	&vpmuludq	($T0,$T0,&QWP(16*3,"edx"));	# s4*h3
	&vpaddq		($D2,$D2,$T0);

	################################################################
	# lazy reduction
	 &vpsrlq	($T0,$D3,26);
	 &vpand		($D3,$D3,$MASK);
	&vpsrlq		($T1,$D0,26);
	&vpand		($D0,$D0,$MASK);
	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
	 &vpsrlq	($T0,$D4,26);
	 &vpand		($D4,$D4,$MASK);
	&vpsrlq		($T1,$D1,26);
	&vpand		($D1,$D1,$MASK);
	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
	 &vpaddd	($D0,$D0,$T0);
	 &vpsllq	($T0,$T0,2);
	&vpsrlq		($T1,$D2,26);
	&vpand		($D2,$D2,$MASK);
	 &vpaddd	($D0,$D0,$T0);			# h4 -> h0
	&vpaddd		($D3,$D3,$T1);			# h2 -> h3
	&vpsrlq		($T1,$D3,26);
	 &vpsrlq	($T0,$D0,26);
	 &vpand		($D0,$D0,$MASK);
	&vpand		($D3,$D3,$MASK);
	 &vpaddd	($D1,$D1,$T0);			# h0 -> h1
	&vpaddd		($D4,$D4,$T1);			# h3 -> h4

	&dec		("ecx");
	&jz		(&label("square_break"));

	&vpunpcklqdq	($D0,$D0,&QWP(16*0,"esp"));	# 0:r^1:0:r^2
	&vpunpcklqdq	($D1,$D1,&QWP(16*1,"esp"));
	&vpunpcklqdq	($D2,$D2,&QWP(16*2,"esp"));
	&vpunpcklqdq	($D3,$D3,&QWP(16*3,"esp"));
	&vpunpcklqdq	($D4,$D4,&QWP(16*4,"esp"));
	&jmp		(&label("square"));

&set_label("square_break");
	&vpsllq		($D0,$D0,32);			# -> r^3:0:r^4:0
	&vpsllq		($D1,$D1,32);
	&vpsllq		($D2,$D2,32);
	&vpsllq		($D3,$D3,32);
	&vpsllq		($D4,$D4,32);
	&vpor		($D0,$D0,&QWP(16*0,"esp"));	# r^3:r^1:r^4:r^2
	&vpor		($D1,$D1,&QWP(16*1,"esp"));
	&vpor		($D2,$D2,&QWP(16*2,"esp"));
	&vpor		($D3,$D3,&QWP(16*3,"esp"));
	&vpor		($D4,$D4,&QWP(16*4,"esp"));

	&vpshufd	($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
	&vpshufd	($D1,$D1,0b10001101);
	&vpshufd	($D2,$D2,0b10001101);
	&vpshufd	($D3,$D3,0b10001101);
	&vpshufd	($D4,$D4,0b10001101);

	&vmovdqu	(&QWP(16*0,"edi"),$D0);		# save the table
	&vmovdqu	(&QWP(16*1,"edi"),$D1);
	&vmovdqu	(&QWP(16*2,"edi"),$D2);
	&vmovdqu	(&QWP(16*3,"edi"),$D3);
	&vmovdqu	(&QWP(16*4,"edi"),$D4);

	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqu	(&QWP(16*5,"edi"),$T1);
	&vmovdqu	(&QWP(16*6,"edi"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqu	(&QWP(16*7,"edi"),$T1);
	&vmovdqu	(&QWP(16*8,"edi"),$T0);

	&mov		("esp","ebp");
	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
	&ret		();
&function_end_B("_poly1305_init_avx2");

########################################################################
# now it's time to switch to %ymm

my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
my $MASK=$T2;

sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }

&align	(32);
&function_begin("_poly1305_blocks_avx2");
	&mov	("edi",&wparam(0));			# ctx
	&mov	("esi",&wparam(1));			# inp
	&mov	("ecx",&wparam(2));			# len

	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
	&and	("ecx",-16);
	&jz	(&label("nodata"));
	&cmp	("ecx",64);
	&jae	(&label("enter_avx2"));
	&test	("eax","eax");				# is_base2_26?
	&jz	(&label("enter_blocks"));

&set_label("enter_avx2");
	&vzeroupper	();

	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));

	&test	("eax","eax");				# is_base2_26?
	&jnz	(&label("base2_26"));

	&call	("_poly1305_init_avx2");

	################################################# base 2^32 -> base 2^26
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ecx",&DWP(3,"edi"));
	&mov	("edx",&DWP(6,"edi"));
	&mov	("esi",&DWP(9,"edi"));
	&mov	("ebp",&DWP(13,"edi"));

	&shr	("ecx",2);
	&and	("eax",0x3ffffff);
	&shr	("edx",4);
	&and	("ecx",0x3ffffff);
	&shr	("esi",6);
	&and	("edx",0x3ffffff);

	&mov	(&DWP(4*0,"edi"),"eax");
	&mov	(&DWP(4*1,"edi"),"ecx");
	&mov	(&DWP(4*2,"edi"),"edx");
	&mov	(&DWP(4*3,"edi"),"esi");
	&mov	(&DWP(4*4,"edi"),"ebp");
	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26

	&mov	("esi",&wparam(1));			# [reload] inp
	&mov	("ecx",&wparam(2));			# [reload] len

&set_label("base2_26");
	&mov	("eax",&wparam(3));			# padbit
	&mov	("ebp","esp");

	&sub	("esp",32*(5+9));
	&and	("esp",-512);				# ensure that frame
							# doesn't cross page
							# boundary, which is
							# essential for
							# misaligned 32-byte
							# loads
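
	################################################################
	# the frame is 32*(5+9) = 448 bytes, so rounding esp down to a
	# 512-byte boundary keeps it within a single 512-byte block,
	# which by construction cannot straddle a 4K page.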

	################################################################
	# expand and copy pre-calculated table to stack

	&vmovdqu	(&X($D0),&QWP(16*(3+0),"edi"));
	&lea		("edx",&DWP(32*5+128,"esp"));	# +128 size optimization
	&vmovdqu	(&X($D1),&QWP(16*(3+1),"edi"));
	&vmovdqu	(&X($D2),&QWP(16*(3+2),"edi"));
	&vmovdqu	(&X($D3),&QWP(16*(3+3),"edi"));
	&vmovdqu	(&X($D4),&QWP(16*(3+4),"edi"));
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&vpermq		($D0,$D0,0b01000000);		# 00001234 -> 12343434
	&vpermq		($D1,$D1,0b01000000);
	&vpermq		($D2,$D2,0b01000000);
	&vpermq		($D3,$D3,0b01000000);
	&vpermq		($D4,$D4,0b01000000);
	&vpshufd	($D0,$D0,0b11001000);		# 12343434 -> 14243444
	&vpshufd	($D1,$D1,0b11001000);
	&vpshufd	($D2,$D2,0b11001000);
	&vpshufd	($D3,$D3,0b11001000);
	&vpshufd	($D4,$D4,0b11001000);
	&vmovdqa	(&QWP(32*0-128,"edx"),$D0);
	&vmovdqu	(&X($D0),&QWP(16*5,"edi"));
	&vmovdqa	(&QWP(32*1-128,"edx"),$D1);
	&vmovdqu	(&X($D1),&QWP(16*6,"edi"));
	&vmovdqa	(&QWP(32*2-128,"edx"),$D2);
	&vmovdqu	(&X($D2),&QWP(16*7,"edi"));
	&vmovdqa	(&QWP(32*3-128,"edx"),$D3);
	&vmovdqu	(&X($D3),&QWP(16*8,"edi"));
	&vmovdqa	(&QWP(32*4-128,"edx"),$D4);
	&vpermq		($D0,$D0,0b01000000);
	&vpermq		($D1,$D1,0b01000000);
	&vpermq		($D2,$D2,0b01000000);
	&vpermq		($D3,$D3,0b01000000);
	&vpshufd	($D0,$D0,0b11001000);
	&vpshufd	($D1,$D1,0b11001000);
	&vpshufd	($D2,$D2,0b11001000);
	&vpshufd	($D3,$D3,0b11001000);
	&vmovdqa	(&QWP(32*5-128,"edx"),$D0);
	&vmovd		(&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
	&vmovdqa	(&QWP(32*6-128,"edx"),$D1);
	&vmovd		(&X($D1),&DWP(-16*3+4*1,"edi"));
	&vmovdqa	(&QWP(32*7-128,"edx"),$D2);
	&vmovd		(&X($D2),&DWP(-16*3+4*2,"edi"));
	&vmovdqa	(&QWP(32*8-128,"edx"),$D3);
	&vmovd		(&X($D3),&DWP(-16*3+4*3,"edi"));
	&vmovd		(&X($D4),&DWP(-16*3+4*4,"edi"));
	&vmovdqa	($MASK,&QWP(64,"ebx"));
	&neg		("eax");			# padbit

	&test		("ecx",63);
	&jz		(&label("even"));

	&mov		("edx","ecx");
	&and		("ecx",-64);
	&and		("edx",63);

	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));
	&cmp		("edx",32);
	&jb		(&label("one"));

	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
	&je		(&label("two"));

	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
	&lea		("esi",&DWP(16*3,"esi"));
	&lea		("ebx",&DWP(8,"ebx"));		# three padbits
	&lea		("edx",&DWP(32*5+128+8,"esp"));	# --:r^1:r^2:r^3 (*)
	&jmp		(&label("tail"));

&set_label("two");
	&lea		("esi",&DWP(16*2,"esi"));
	&lea		("ebx",&DWP(16,"ebx"));		# two padbits
	&lea		("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
	&jmp		(&label("tail"));

&set_label("one");
	&lea		("esi",&DWP(16*1,"esi"));
	&vpxor		($T1,$T1,$T1);
	&lea		("ebx",&DWP(32,"ebx","eax",8));	# one or no padbits
	&lea		("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
	&jmp		(&label("tail"));

# (*)	spots marked with '--' are data from next table entry, but they
#	are multiplied by 0 and therefore rendered insignificant
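#
#	the lea adjustments of ebx above slide the 32-byte padbit load
#	at QWP(0,"ebx") across the boundary between the vector of 1<<24
#	words and the vector of zeros at const_sse2, so that exactly
#	3, 2, 1 or 0 lanes receive the 2^128 padbit; the and("ebx",-64)
#	in the tail path restores the pointer afterwards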

&set_label("even",32);
	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
	&lea		("esi",&DWP(16*4,"esi"));
	&sub		("ecx",64);
	&jz		(&label("tail"));

&set_label("loop");
	################################################################
	# ((inp[0]*r^4+inp[4])*r^4+inp[8])*r^4
	# ((inp[1]*r^4+inp[5])*r^4+inp[9])*r^3
	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
	#   \________/ \_______/
	################################################################

sub vsplat_input {
	&vmovdqa	(&QWP(32*2,"esp"),$D2);
	&vpsrldq	($D2,$T0,6);			# splat input
	&vmovdqa	(&QWP(32*0,"esp"),$D0);
	&vpsrldq	($D0,$T1,6);
	&vmovdqa	(&QWP(32*1,"esp"),$D1);
	&vpunpckhqdq	($D1,$T0,$T1);			# 4
	&vpunpcklqdq	($T0,$T0,$T1);			# 0:1
	&vpunpcklqdq	($D2,$D2,$D0);			# 2:3

	&vpsrlq		($D0,$D2,30);
	&vpsrlq		($D2,$D2,4);
	&vpsrlq		($T1,$T0,26);
	&vpsrlq		($D1,$D1,40);			# 4
	&vpand		($D2,$D2,$MASK);		# 2
	&vpand		($T0,$T0,$MASK);		# 0
	&vpand		($T1,$T1,$MASK);		# 1
	&vpand		($D0,$D0,$MASK);		# 3 (*)
	&vpor		($D1,$D1,&QWP(0,"ebx"));	# padbit, yes, always

	# (*)	note that output is counterintuitive: inp[2] is returned
	#	in $D2 and inp[3:4] in $D0-1, while $D3-4 are preserved;
}
	&vsplat_input	();

sub vpmuladd {
my $addr = shift;

	&vpaddq		($D2,$D2,&QWP(32*2,"esp"));	# add hash value
	&vpaddq		($T0,$T0,&QWP(32*0,"esp"));
	&vpaddq		($T1,$T1,&QWP(32*1,"esp"));
	&vpaddq		($D0,$D0,$D3);
	&vpaddq		($D1,$D1,$D4);

	################################################################
	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0   + h4*5*r4
	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1   + h4*r0
	# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
	# d1 = h2*5*r4 + h0*r1 + h1*r0   + h3*5*r3 + h4*5*r2
	# d2 = h2*r0   + h0*r2 + h1*r1   + h3*5*r4 + h4*5*r3

	&vpmuludq	($D3,$D2,&$addr(1));		# d3 = h2*r1
	 &vmovdqa	(&QWP(32*1,"esp"),$T1);
	&vpmuludq	($D4,$D2,&$addr(2));		# d4 = h2*r2
	 &vmovdqa	(&QWP(32*3,"esp"),$D0);
	&vpmuludq	($D0,$D2,&$addr(7));		# d0 = h2*s3
	 &vmovdqa	(&QWP(32*4,"esp"),$D1);
	&vpmuludq	($D1,$D2,&$addr(8));		# d1 = h2*s4
	&vpmuludq	($D2,$D2,&$addr(0));		# d2 = h2*r0

	&vpmuludq	($T2,$T0,&$addr(3));		# h0*r3
	&vpaddq		($D3,$D3,$T2);			# d3 += h0*r3
	&vpmuludq	($T1,$T0,&$addr(4));		# h0*r4
	&vpaddq		($D4,$D4,$T1);			# d4 += h0*r4
	&vpmuludq	($T2,$T0,&$addr(0));		# h0*r0
	&vpaddq		($D0,$D0,$T2);			# d0 += h0*r0
	 &vmovdqa	($T2,&QWP(32*1,"esp"));		# h1
	&vpmuludq	($T1,$T0,&$addr(1));		# h0*r1
	&vpaddq		($D1,$D1,$T1);			# d1 += h0*r1
	&vpmuludq	($T0,$T0,&$addr(2));		# h0*r2
	&vpaddq		($D2,$D2,$T0);			# d2 += h0*r2

	&vpmuludq	($T1,$T2,&$addr(2));		# h1*r2
	&vpaddq		($D3,$D3,$T1);			# d3 += h1*r2
	&vpmuludq	($T0,$T2,&$addr(3));		# h1*r3
	&vpaddq		($D4,$D4,$T0);			# d4 += h1*r3
	&vpmuludq	($T1,$T2,&$addr(8));		# h1*s4
	&vpaddq		($D0,$D0,$T1);			# d0 += h1*s4
	 &vmovdqa	($T1,&QWP(32*3,"esp"));		# h3
	&vpmuludq	($T0,$T2,&$addr(0));		# h1*r0
	&vpaddq		($D1,$D1,$T0);			# d1 += h1*r0
	&vpmuludq	($T2,$T2,&$addr(1));		# h1*r1
	&vpaddq		($D2,$D2,$T2);			# d2 += h1*r1

	&vpmuludq	($T0,$T1,&$addr(0));		# h3*r0
	&vpaddq		($D3,$D3,$T0);			# d3 += h3*r0
	&vpmuludq	($T2,$T1,&$addr(1));		# h3*r1
	&vpaddq		($D4,$D4,$T2);			# d4 += h3*r1
	&vpmuludq	($T0,$T1,&$addr(6));		# h3*s2
	&vpaddq		($D0,$D0,$T0);			# d0 += h3*s2
	 &vmovdqa	($T0,&QWP(32*4,"esp"));		# h4
	&vpmuludq	($T2,$T1,&$addr(7));		# h3*s3
	&vpaddq		($D1,$D1,$T2);			# d1 += h3*s3
	&vpmuludq	($T1,$T1,&$addr(8));		# h3*s4
	&vpaddq		($D2,$D2,$T1);			# d2 += h3*s4

	&vpmuludq	($T2,$T0,&$addr(8));		# h4*s4
	&vpaddq		($D3,$D3,$T2);			# d3 += h4*s4
	&vpmuludq	($T1,$T0,&$addr(5));		# h4*s1
	&vpaddq		($D0,$D0,$T1);			# d0 += h4*s1
	&vpmuludq	($T2,$T0,&$addr(0));		# h4*r0
	&vpaddq		($D4,$D4,$T2);			# d4 += h4*r0
	 &vmovdqa	($MASK,&QWP(64,"ebx"));
	&vpmuludq	($T1,$T0,&$addr(6));		# h4*s2
	&vpaddq		($D1,$D1,$T1);			# d1 += h4*s2
	&vpmuludq	($T0,$T0,&$addr(7));		# h4*s3
	&vpaddq		($D2,$D2,$T0);			# d2 += h4*s3
}
	&vpmuladd	(sub {	my $i=shift; &QWP(32*$i-128,"edx");	});

sub vlazy_reduction {
	################################################################
	# lazy reduction

	 &vpsrlq	($T0,$D3,26);
	 &vpand		($D3,$D3,$MASK);
	&vpsrlq		($T1,$D0,26);
	&vpand		($D0,$D0,$MASK);
	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
	 &vpsrlq	($T0,$D4,26);
	 &vpand		($D4,$D4,$MASK);
	&vpsrlq		($T1,$D1,26);
	&vpand		($D1,$D1,$MASK);
	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
	 &vpaddq	($D0,$D0,$T0);
	 &vpsllq	($T0,$T0,2);
	&vpsrlq		($T1,$D2,26);
	&vpand		($D2,$D2,$MASK);
	 &vpaddq	($D0,$D0,$T0);			# h4 -> h0
	&vpaddq		($D3,$D3,$T1);			# h2 -> h3
	&vpsrlq		($T1,$D3,26);
	 &vpsrlq	($T0,$D0,26);
	 &vpand		($D0,$D0,$MASK);
	&vpand		($D3,$D3,$MASK);
	 &vpaddq	($D1,$D1,$T0);			# h0 -> h1
	&vpaddq		($D4,$D4,$T1);			# h3 -> h4
}
	&vlazy_reduction();

	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
	&lea		("esi",&DWP(16*4,"esi"));
	&sub		("ecx",64);
	&jnz		(&label("loop"));

&set_label("tail");
	&vsplat_input	();
	&and		("ebx",-64);			# restore pointer

	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});

	################################################################
	# horizontal addition

	&vpsrldq	($T0,$D4,8);
	&vpsrldq	($T1,$D3,8);
	&vpaddq		($D4,$D4,$T0);
	&vpsrldq	($T0,$D0,8);
	&vpaddq		($D3,$D3,$T1);
	&vpsrldq	($T1,$D1,8);
	&vpaddq		($D0,$D0,$T0);
	&vpsrldq	($T0,$D2,8);
	&vpaddq		($D1,$D1,$T1);
	&vpermq		($T1,$D4,2);			# keep folding
	&vpaddq		($D2,$D2,$T0);
	&vpermq		($T0,$D3,2);
	&vpaddq		($D4,$D4,$T1);
	&vpermq		($T1,$D0,2);
	&vpaddq		($D3,$D3,$T0);
	&vpermq		($T0,$D1,2);
	&vpaddq		($D0,$D0,$T1);
	&vpermq		($T1,$D2,2);
	&vpaddq		($D1,$D1,$T0);
	&vpaddq		($D2,$D2,$T1);

	&vlazy_reduction();

	&cmp		("ecx",0);
	&je		(&label("done"));

	################################################################
	# clear all but single word

	&vpshufd	(&X($D0),&X($D0),0b11111100);
	&lea		("edx",&DWP(32*5+128,"esp"));	# restore pointer
	&vpshufd	(&X($D1),&X($D1),0b11111100);
	&vpshufd	(&X($D2),&X($D2),0b11111100);
	&vpshufd	(&X($D3),&X($D3),0b11111100);
	&vpshufd	(&X($D4),&X($D4),0b11111100);
	&jmp		(&label("even"));

&set_label("done",16);
	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
	&vzeroupper	();
	&mov	("esp","ebp");
&set_label("nodata");
&function_end("_poly1305_blocks_avx2");
}
&set_label("const_sse2",64);
	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);	# 2^128 padbit
	&data_word(0,0,		0,0,		0,0,		0,0);		# zero padbits
	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);	# base 2^26 digit mask
	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);		# key clamp mask
}
&asciz	("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
&align	(4);

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";