#! /usr/bin/env perl
# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can
# be executed even on Knights Landing. The trigger for the modification
# was the observation that AVX512 code paths can negatively affect
# overall Skylake-X system performance. Since we are likely to suppress
# the AVX512F capability flag [at least on Skylake-X], the conversion
# serves as a kind of "investment protection". Note that the next *lake
# processor, Cannon Lake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than the integer-only code only on older Intel P4 and
#	Core processors, by 30-50% (less so the newer the processor is),
#	while being slower on contemporary ones, for example almost 2x
#	slower on Atom; as the former are naturally disappearing, SSE2 is
#	deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core;
#	the listed result is the best case;
65
66# $output is the last argument if it looks like a file (it has an extension)
67# $flavour is the first argument if it doesn't look like a file
68$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
69$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
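# A typical invocation would therefore look something like the following
# (illustrative only; the exact flavour and output name depend on the
# build system):
#
#	perl poly1305-x86_64.pl elf poly1305-x86_64.S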
70
71$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
72
73$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
75( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
76die "can't locate x86_64-xlate.pl";
77
78if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
80	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
81}
82
83if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
84	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
85	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
86	$avx += 2 if ($1==2.11 && $2>=8);
87}
88
89if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
90	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
91	$avx = ($1>=10) + ($1>=12);
92}
93
94if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
95	$avx = ($2>=3.0) + ($2>3.0);
96}
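# The resulting $avx level selects which code paths are emitted below:
# >=1 enables the AVX path, >1 additionally emits AVX2, >2 emits AVX512,
# and >3 (on non-Windows targets) also wires up the base 2^44 init path;
# with $avx==0 only the integer-only code is generated.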
97
98open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
99    or die "can't call $xlate: $!";
100*STDOUT=*OUT;
101
102my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
103my ($mac,$nonce)=($inp,$len);	# *_emit arguments
104my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
105my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
106
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
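#
# A reference sketch of the same arithmetic in C (illustrative only; the
# d0/d1/d2/c temporaries below are not the module's registers). It computes
# h = (h * r) mod 2^130-5 with h in three 64-bit words, relying on the
# clamped key and on s1 = r1 + (r1 >> 2) = 5*(r1/4):
#
#	typedef unsigned __int64  u64;
#	typedef unsigned __int128 u128;
#	u128 d0 = (u128)h0*r0 + (u128)h1*s1;
#	u128 d1 = (u128)h0*r1 + (u128)h1*r0 + h2*s1;
#	u64  d2, c;
#	d1 += (u64)(d0 >> 64);
#	d2  = h2*r0 + (u64)(d1 >> 64);
#	h0  = (u64)d0;
#	h1  = (u64)d1;
#	h2  = d2 & 3;				/* bits 128-129 of h*r */
#	c   = (d2 & ~(u64)3) + (d2 >> 2);	/* = 5*(d2>>2), as 2^130 = 5 mod p */
#	h0 += c; if (h0 < c) { h1++; if (h1 == 0) h2++; }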
110$code.=<<___;
111	mulq	$h0			# h0*r1
112	mov	%rax,$d2
113	 mov	$r0,%rax
114	mov	%rdx,$d3
115
116	mulq	$h0			# h0*r0
117	mov	%rax,$h0		# future $h0
118	 mov	$r0,%rax
119	mov	%rdx,$d1
120
121	mulq	$h1			# h1*r0
122	add	%rax,$d2
123	 mov	$s1,%rax
124	adc	%rdx,$d3
125
126	mulq	$h1			# h1*s1
127	 mov	$h2,$h1			# borrow $h1
128	add	%rax,$h0
129	adc	%rdx,$d1
130
131	imulq	$s1,$h1			# h2*s1
132	add	$h1,$d2
133	 mov	$d1,$h1
134	adc	\$0,$d3
135
136	imulq	$r0,$h2			# h2*r0
137	add	$d2,$h1
138	mov	\$-4,%rax		# mask value
139	adc	$h2,$d3
140
141	and	$d3,%rax		# last reduction step
142	mov	$d3,$h2
143	shr	\$2,$d3
144	and	\$3,$h2
145	add	$d3,%rax
146	add	%rax,$h0
147	adc	\$0,$h1
148	adc	\$0,$h2
149___
150}
151
########################################################################
# The layout of the opaque area is as follows.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64
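#
# In other words the code below addresses the context as if it were (a
# sketch; the field names here are illustrative):
#
#	struct {
#		unsigned __int64 h[3];	# 0($ctx), 8($ctx), 16($ctx)
#		unsigned __int64 r[2];	# 24($ctx), 32($ctx)
#	} *ctx;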
157
158$code.=<<___;
159.text
160
161.extern	OPENSSL_ia32cap_P
162
163.globl	poly1305_init
164.hidden	poly1305_init
165.globl	poly1305_blocks
166.hidden	poly1305_blocks
167.globl	poly1305_emit
168.hidden	poly1305_emit
169
170.type	poly1305_init,\@function,3
171.align	32
172poly1305_init:
173.cfi_startproc
174	xor	%rax,%rax
175	mov	%rax,0($ctx)		# initialize hash value
176	mov	%rax,8($ctx)
177	mov	%rax,16($ctx)
178
179	cmp	\$0,$inp
180	je	.Lno_key
181
182	lea	poly1305_blocks(%rip),%r10
183	lea	poly1305_emit(%rip),%r11
184___
185$code.=<<___	if ($avx);
186	mov	OPENSSL_ia32cap_P+4(%rip),%r9
187	lea	poly1305_blocks_avx(%rip),%rax
188	lea	poly1305_emit_avx(%rip),%rcx
189	bt	\$`60-32`,%r9		# AVX?
190	cmovc	%rax,%r10
191	cmovc	%rcx,%r11
192___
193$code.=<<___	if ($avx>1);
194	lea	poly1305_blocks_avx2(%rip),%rax
195	bt	\$`5+32`,%r9		# AVX2?
196	cmovc	%rax,%r10
197___
198$code.=<<___	if ($avx>3 && !$win64);
199	mov	\$`(1<<31|1<<21|1<<16)`,%rax
200	shr	\$32,%r9
201	and	%rax,%r9
202	cmp	%rax,%r9
203	je	.Linit_base2_44
204___
205$code.=<<___;
206	mov	\$0x0ffffffc0fffffff,%rax
207	mov	\$0x0ffffffc0ffffffc,%rcx
208	and	0($inp),%rax
209	and	8($inp),%rcx
210	mov	%rax,24($ctx)
211	mov	%rcx,32($ctx)
212___
213$code.=<<___	if ($flavour !~ /elf32/);
214	mov	%r10,0(%rdx)
215	mov	%r11,8(%rdx)
216___
217$code.=<<___	if ($flavour =~ /elf32/);
218	mov	%r10d,0(%rdx)
219	mov	%r11d,4(%rdx)
220___
221$code.=<<___;
222	mov	\$1,%eax
223.Lno_key:
224	ret
225.cfi_endproc
226.size	poly1305_init,.-poly1305_init
227
228.type	poly1305_blocks,\@function,4
229.align	32
230poly1305_blocks:
231.cfi_startproc
232	endbranch
233.Lblocks:
234	shr	\$4,$len
235	jz	.Lno_data		# too short
236
237	push	%rbx
238.cfi_push	%rbx
239	push	%rbp
240.cfi_push	%rbp
241	push	%r12
242.cfi_push	%r12
243	push	%r13
244.cfi_push	%r13
245	push	%r14
246.cfi_push	%r14
247	push	%r15
248.cfi_push	%r15
249.Lblocks_body:
250
251	mov	$len,%r15		# reassign $len
252
253	mov	24($ctx),$r0		# load r
254	mov	32($ctx),$s1
255
256	mov	0($ctx),$h0		# load hash value
257	mov	8($ctx),$h1
258	mov	16($ctx),$h2
259
260	mov	$s1,$r1
261	shr	\$2,$s1
262	mov	$r1,%rax
263	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
264	jmp	.Loop
265
266.align	32
267.Loop:
268	add	0($inp),$h0		# accumulate input
269	adc	8($inp),$h1
270	lea	16($inp),$inp
271	adc	$padbit,$h2
272___
273	&poly1305_iteration();
274$code.=<<___;
275	mov	$r1,%rax
276	dec	%r15			# len-=16
277	jnz	.Loop
278
279	mov	$h0,0($ctx)		# store hash value
280	mov	$h1,8($ctx)
281	mov	$h2,16($ctx)
282
283	mov	0(%rsp),%r15
284.cfi_restore	%r15
285	mov	8(%rsp),%r14
286.cfi_restore	%r14
287	mov	16(%rsp),%r13
288.cfi_restore	%r13
289	mov	24(%rsp),%r12
290.cfi_restore	%r12
291	mov	32(%rsp),%rbp
292.cfi_restore	%rbp
293	mov	40(%rsp),%rbx
294.cfi_restore	%rbx
295	lea	48(%rsp),%rsp
296.cfi_adjust_cfa_offset	-48
297.Lno_data:
298.Lblocks_epilogue:
299	ret
300.cfi_endproc
301.size	poly1305_blocks,.-poly1305_blocks
302
303.type	poly1305_emit,\@function,3
304.align	32
305poly1305_emit:
306.cfi_startproc
307	endbranch
308.Lemit:
309	mov	0($ctx),%r8	# load hash value
310	mov	8($ctx),%r9
311	mov	16($ctx),%r10
312
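	# What follows is, in C terms (an illustrative sketch), the final
	# conditional reduction and nonce addition: compute t = h + 5, keep
	# it only if it carries past bit 129 (i.e. h >= 2^130-5), then add
	# the 128-bit nonce and store the low 128 bits:
	#
	#	t0 = h0 + 5;  c = (t0 < 5);
	#	t1 = h1 + c;  c = (t1 < c);
	#	t2 = h2 + c;
	#	if (t2 >> 2) { h0 = t0; h1 = t1; }
	#	h0 += nonce[0];
	#	h1 += nonce[1] + (h0 < nonce[0]);
	#	mac[0] = h0; mac[1] = h1;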
313	mov	%r8,%rax
314	add	\$5,%r8		# compare to modulus
315	mov	%r9,%rcx
316	adc	\$0,%r9
317	adc	\$0,%r10
318	shr	\$2,%r10	# did 130-bit value overflow?
319	cmovnz	%r8,%rax
320	cmovnz	%r9,%rcx
321
322	add	0($nonce),%rax	# accumulate nonce
323	adc	8($nonce),%rcx
324	mov	%rax,0($mac)	# write result
325	mov	%rcx,8($mac)
326
327	ret
328.cfi_endproc
329.size	poly1305_emit,.-poly1305_emit
330___
331if ($avx) {
332
########################################################################
# The layout of the opaque area is as follows.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are the base 2^26 digits of powers of the multiplier key.
# There are 5 digits, but the last four are interleaved with their
# multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3,
# 5*r3, r4, 5*r4.
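#
# For example, element n of that table (n = 0..8, running through r0, r1,
# 5*r1, ..., r4, 5*r4) lives at byte offset 48+16*n of the context and
# stores that quantity for r^2, r^1, r^4 and r^3 in that dword order, so
# a single 128-bit load fetches the same quantity for all four powers.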
345
346my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
347    map("%xmm$_",(0..15));
348
349$code.=<<___;
350.type	__poly1305_block,\@abi-omnipotent
351.align	32
352__poly1305_block:
353.cfi_startproc
354___
355	&poly1305_iteration();
356$code.=<<___;
357	ret
358.cfi_endproc
359.size	__poly1305_block,.-__poly1305_block
360
361.type	__poly1305_init_avx,\@abi-omnipotent
362.align	32
363__poly1305_init_avx:
364.cfi_startproc
365	mov	$r0,$h0
366	mov	$r1,$h1
367	xor	$h2,$h2
368
369	lea	48+64($ctx),$ctx	# size optimization
370
371	mov	$r1,%rax
372	call	__poly1305_block	# r^2
373
374	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
375	mov	\$0x3ffffff,%edx
376	mov	$h0,$d1
377	and	$h0#d,%eax
378	mov	$r0,$d2
379	and	$r0#d,%edx
380	mov	%eax,`16*0+0-64`($ctx)
381	shr	\$26,$d1
382	mov	%edx,`16*0+4-64`($ctx)
383	shr	\$26,$d2
384
385	mov	\$0x3ffffff,%eax
386	mov	\$0x3ffffff,%edx
387	and	$d1#d,%eax
388	and	$d2#d,%edx
389	mov	%eax,`16*1+0-64`($ctx)
390	lea	(%rax,%rax,4),%eax	# *5
391	mov	%edx,`16*1+4-64`($ctx)
392	lea	(%rdx,%rdx,4),%edx	# *5
393	mov	%eax,`16*2+0-64`($ctx)
394	shr	\$26,$d1
395	mov	%edx,`16*2+4-64`($ctx)
396	shr	\$26,$d2
397
398	mov	$h1,%rax
399	mov	$r1,%rdx
400	shl	\$12,%rax
401	shl	\$12,%rdx
402	or	$d1,%rax
403	or	$d2,%rdx
404	and	\$0x3ffffff,%eax
405	and	\$0x3ffffff,%edx
406	mov	%eax,`16*3+0-64`($ctx)
407	lea	(%rax,%rax,4),%eax	# *5
408	mov	%edx,`16*3+4-64`($ctx)
409	lea	(%rdx,%rdx,4),%edx	# *5
410	mov	%eax,`16*4+0-64`($ctx)
411	mov	$h1,$d1
412	mov	%edx,`16*4+4-64`($ctx)
413	mov	$r1,$d2
414
415	mov	\$0x3ffffff,%eax
416	mov	\$0x3ffffff,%edx
417	shr	\$14,$d1
418	shr	\$14,$d2
419	and	$d1#d,%eax
420	and	$d2#d,%edx
421	mov	%eax,`16*5+0-64`($ctx)
422	lea	(%rax,%rax,4),%eax	# *5
423	mov	%edx,`16*5+4-64`($ctx)
424	lea	(%rdx,%rdx,4),%edx	# *5
425	mov	%eax,`16*6+0-64`($ctx)
426	shr	\$26,$d1
427	mov	%edx,`16*6+4-64`($ctx)
428	shr	\$26,$d2
429
430	mov	$h2,%rax
431	shl	\$24,%rax
432	or	%rax,$d1
433	mov	$d1#d,`16*7+0-64`($ctx)
434	lea	($d1,$d1,4),$d1		# *5
435	mov	$d2#d,`16*7+4-64`($ctx)
436	lea	($d2,$d2,4),$d2		# *5
437	mov	$d1#d,`16*8+0-64`($ctx)
438	mov	$d2#d,`16*8+4-64`($ctx)
439
440	mov	$r1,%rax
441	call	__poly1305_block	# r^3
442
443	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
444	mov	$h0,$d1
445	and	$h0#d,%eax
446	shr	\$26,$d1
447	mov	%eax,`16*0+12-64`($ctx)
448
449	mov	\$0x3ffffff,%edx
450	and	$d1#d,%edx
451	mov	%edx,`16*1+12-64`($ctx)
452	lea	(%rdx,%rdx,4),%edx	# *5
453	shr	\$26,$d1
454	mov	%edx,`16*2+12-64`($ctx)
455
456	mov	$h1,%rax
457	shl	\$12,%rax
458	or	$d1,%rax
459	and	\$0x3ffffff,%eax
460	mov	%eax,`16*3+12-64`($ctx)
461	lea	(%rax,%rax,4),%eax	# *5
462	mov	$h1,$d1
463	mov	%eax,`16*4+12-64`($ctx)
464
465	mov	\$0x3ffffff,%edx
466	shr	\$14,$d1
467	and	$d1#d,%edx
468	mov	%edx,`16*5+12-64`($ctx)
469	lea	(%rdx,%rdx,4),%edx	# *5
470	shr	\$26,$d1
471	mov	%edx,`16*6+12-64`($ctx)
472
473	mov	$h2,%rax
474	shl	\$24,%rax
475	or	%rax,$d1
476	mov	$d1#d,`16*7+12-64`($ctx)
477	lea	($d1,$d1,4),$d1		# *5
478	mov	$d1#d,`16*8+12-64`($ctx)
479
480	mov	$r1,%rax
481	call	__poly1305_block	# r^4
482
483	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
484	mov	$h0,$d1
485	and	$h0#d,%eax
486	shr	\$26,$d1
487	mov	%eax,`16*0+8-64`($ctx)
488
489	mov	\$0x3ffffff,%edx
490	and	$d1#d,%edx
491	mov	%edx,`16*1+8-64`($ctx)
492	lea	(%rdx,%rdx,4),%edx	# *5
493	shr	\$26,$d1
494	mov	%edx,`16*2+8-64`($ctx)
495
496	mov	$h1,%rax
497	shl	\$12,%rax
498	or	$d1,%rax
499	and	\$0x3ffffff,%eax
500	mov	%eax,`16*3+8-64`($ctx)
501	lea	(%rax,%rax,4),%eax	# *5
502	mov	$h1,$d1
503	mov	%eax,`16*4+8-64`($ctx)
504
505	mov	\$0x3ffffff,%edx
506	shr	\$14,$d1
507	and	$d1#d,%edx
508	mov	%edx,`16*5+8-64`($ctx)
509	lea	(%rdx,%rdx,4),%edx	# *5
510	shr	\$26,$d1
511	mov	%edx,`16*6+8-64`($ctx)
512
513	mov	$h2,%rax
514	shl	\$24,%rax
515	or	%rax,$d1
516	mov	$d1#d,`16*7+8-64`($ctx)
517	lea	($d1,$d1,4),$d1		# *5
518	mov	$d1#d,`16*8+8-64`($ctx)
519
520	lea	-48-64($ctx),$ctx	# size [de-]optimization
521	ret
522.cfi_endproc
523.size	__poly1305_init_avx,.-__poly1305_init_avx
524
525.type	poly1305_blocks_avx,\@function,4
526.align	32
527poly1305_blocks_avx:
528.cfi_startproc
529	endbranch
530	mov	20($ctx),%r8d		# is_base2_26
531	cmp	\$128,$len
532	jae	.Lblocks_avx
533	test	%r8d,%r8d
534	jz	.Lblocks
535
536.Lblocks_avx:
537	and	\$-16,$len
538	jz	.Lno_data_avx
539
540	vzeroupper
541
542	test	%r8d,%r8d
543	jz	.Lbase2_64_avx
544
545	test	\$31,$len
546	jz	.Leven_avx
547
548	push	%rbx
549.cfi_push	%rbx
550	push	%rbp
551.cfi_push	%rbp
552	push	%r12
553.cfi_push	%r12
554	push	%r13
555.cfi_push	%r13
556	push	%r14
557.cfi_push	%r14
558	push	%r15
559.cfi_push	%r15
560.Lblocks_avx_body:
561
562	mov	$len,%r15		# reassign $len
563
564	mov	0($ctx),$d1		# load hash value
565	mov	8($ctx),$d2
566	mov	16($ctx),$h2#d
567
568	mov	24($ctx),$r0		# load r
569	mov	32($ctx),$s1
570
571	################################# base 2^26 -> base 2^64
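	# (Illustrative sketch: the five, possibly not fully reduced, 2^26
	# digits d0..d4 are repacked as h = d0 + d1*2^26 + d2*2^52 +
	# d3*2^78 + d4*2^104 held in three 64-bit words h0:h1:h2, after
	# which one 5*(h>>130) fold below brings h2 back down to its low
	# two bits plus at most a carry.)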
572	mov	$d1#d,$h0#d
573	and	\$`-1*(1<<31)`,$d1
574	mov	$d2,$r1			# borrow $r1
575	mov	$d2#d,$h1#d
576	and	\$`-1*(1<<31)`,$d2
577
578	shr	\$6,$d1
579	shl	\$52,$r1
580	add	$d1,$h0
581	shr	\$12,$h1
582	shr	\$18,$d2
583	add	$r1,$h0
584	adc	$d2,$h1
585
586	mov	$h2,$d1
587	shl	\$40,$d1
588	shr	\$24,$h2
589	add	$d1,$h1
590	adc	\$0,$h2			# can be partially reduced...
591
592	mov	\$-4,$d2		# ... so reduce
593	mov	$h2,$d1
594	and	$h2,$d2
595	shr	\$2,$d1
596	and	\$3,$h2
597	add	$d2,$d1			# =*5
598	add	$d1,$h0
599	adc	\$0,$h1
600	adc	\$0,$h2
601
602	mov	$s1,$r1
603	mov	$s1,%rax
604	shr	\$2,$s1
605	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
606
607	add	0($inp),$h0		# accumulate input
608	adc	8($inp),$h1
609	lea	16($inp),$inp
610	adc	$padbit,$h2
611
612	call	__poly1305_block
613
614	test	$padbit,$padbit		# if $padbit is zero,
615	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
616
617	################################# base 2^64 -> base 2^26
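	# Illustrative C sketch of the split performed below (the digit
	# names are not the module's registers):
	#
	#	d0 =  h0         & 0x3ffffff;
	#	d1 = (h0 >> 26)  & 0x3ffffff;
	#	d2 = ((h0 >> 52) | (h1 << 12)) & 0x3ffffff;
	#	d3 = (h1 >> 14)  & 0x3ffffff;
	#	d4 = (h1 >> 40)  | (h2 << 24);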
618	mov	$h0,%rax
619	mov	$h0,%rdx
620	shr	\$52,$h0
621	mov	$h1,$r0
622	mov	$h1,$r1
623	shr	\$26,%rdx
624	and	\$0x3ffffff,%rax	# h[0]
625	shl	\$12,$r0
626	and	\$0x3ffffff,%rdx	# h[1]
627	shr	\$14,$h1
628	or	$r0,$h0
629	shl	\$24,$h2
630	and	\$0x3ffffff,$h0		# h[2]
631	shr	\$40,$r1
632	and	\$0x3ffffff,$h1		# h[3]
633	or	$r1,$h2			# h[4]
634
635	sub	\$16,%r15
636	jz	.Lstore_base2_26_avx
637
638	vmovd	%rax#d,$H0
639	vmovd	%rdx#d,$H1
640	vmovd	$h0#d,$H2
641	vmovd	$h1#d,$H3
642	vmovd	$h2#d,$H4
643	jmp	.Lproceed_avx
644
645.align	32
646.Lstore_base2_64_avx:
647	mov	$h0,0($ctx)
648	mov	$h1,8($ctx)
649	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
650	jmp	.Ldone_avx
651
652.align	16
653.Lstore_base2_26_avx:
654	mov	%rax#d,0($ctx)		# store hash value base 2^26
655	mov	%rdx#d,4($ctx)
656	mov	$h0#d,8($ctx)
657	mov	$h1#d,12($ctx)
658	mov	$h2#d,16($ctx)
659.align	16
660.Ldone_avx:
661	mov	0(%rsp),%r15
662.cfi_restore	%r15
663	mov	8(%rsp),%r14
664.cfi_restore	%r14
665	mov	16(%rsp),%r13
666.cfi_restore	%r13
667	mov	24(%rsp),%r12
668.cfi_restore	%r12
669	mov	32(%rsp),%rbp
670.cfi_restore	%rbp
671	mov	40(%rsp),%rbx
672.cfi_restore	%rbx
673	lea	48(%rsp),%rsp
674.cfi_adjust_cfa_offset	-48
675.Lno_data_avx:
676.Lblocks_avx_epilogue:
677	ret
678.cfi_endproc
679
680.align	32
681.Lbase2_64_avx:
682.cfi_startproc
683	push	%rbx
684.cfi_push	%rbx
685	push	%rbp
686.cfi_push	%rbp
687	push	%r12
688.cfi_push	%r12
689	push	%r13
690.cfi_push	%r13
691	push	%r14
692.cfi_push	%r14
693	push	%r15
694.cfi_push	%r15
695.Lbase2_64_avx_body:
696
697	mov	$len,%r15		# reassign $len
698
699	mov	24($ctx),$r0		# load r
700	mov	32($ctx),$s1
701
702	mov	0($ctx),$h0		# load hash value
703	mov	8($ctx),$h1
704	mov	16($ctx),$h2#d
705
706	mov	$s1,$r1
707	mov	$s1,%rax
708	shr	\$2,$s1
709	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
710
711	test	\$31,$len
712	jz	.Linit_avx
713
714	add	0($inp),$h0		# accumulate input
715	adc	8($inp),$h1
716	lea	16($inp),$inp
717	adc	$padbit,$h2
718	sub	\$16,%r15
719
720	call	__poly1305_block
721
722.Linit_avx:
723	################################# base 2^64 -> base 2^26
724	mov	$h0,%rax
725	mov	$h0,%rdx
726	shr	\$52,$h0
727	mov	$h1,$d1
728	mov	$h1,$d2
729	shr	\$26,%rdx
730	and	\$0x3ffffff,%rax	# h[0]
731	shl	\$12,$d1
732	and	\$0x3ffffff,%rdx	# h[1]
733	shr	\$14,$h1
734	or	$d1,$h0
735	shl	\$24,$h2
736	and	\$0x3ffffff,$h0		# h[2]
737	shr	\$40,$d2
738	and	\$0x3ffffff,$h1		# h[3]
739	or	$d2,$h2			# h[4]
740
741	vmovd	%rax#d,$H0
742	vmovd	%rdx#d,$H1
743	vmovd	$h0#d,$H2
744	vmovd	$h1#d,$H3
745	vmovd	$h2#d,$H4
746	movl	\$1,20($ctx)		# set is_base2_26
747
748	call	__poly1305_init_avx
749
750.Lproceed_avx:
751	mov	%r15,$len
752
753	mov	0(%rsp),%r15
754.cfi_restore	%r15
755	mov	8(%rsp),%r14
756.cfi_restore	%r14
757	mov	16(%rsp),%r13
758.cfi_restore	%r13
759	mov	24(%rsp),%r12
760.cfi_restore	%r12
761	mov	32(%rsp),%rbp
762.cfi_restore	%rbp
763	mov	40(%rsp),%rbx
764.cfi_restore	%rbx
765	lea	48(%rsp),%rax
766	lea	48(%rsp),%rsp
767.cfi_adjust_cfa_offset	-48
768.Lbase2_64_avx_epilogue:
769	jmp	.Ldo_avx
770.cfi_endproc
771
772.align	32
773.Leven_avx:
774.cfi_startproc
775	vmovd		4*0($ctx),$H0		# load hash value
776	vmovd		4*1($ctx),$H1
777	vmovd		4*2($ctx),$H2
778	vmovd		4*3($ctx),$H3
779	vmovd		4*4($ctx),$H4
780
781.Ldo_avx:
782___
783$code.=<<___	if (!$win64);
784	lea		-0x58(%rsp),%r11
785.cfi_def_cfa		%r11,0x60
786	sub		\$0x178,%rsp
787___
788$code.=<<___	if ($win64);
789	lea		-0xf8(%rsp),%r11
790	sub		\$0x218,%rsp
791	vmovdqa		%xmm6,0x50(%r11)
792	vmovdqa		%xmm7,0x60(%r11)
793	vmovdqa		%xmm8,0x70(%r11)
794	vmovdqa		%xmm9,0x80(%r11)
795	vmovdqa		%xmm10,0x90(%r11)
796	vmovdqa		%xmm11,0xa0(%r11)
797	vmovdqa		%xmm12,0xb0(%r11)
798	vmovdqa		%xmm13,0xc0(%r11)
799	vmovdqa		%xmm14,0xd0(%r11)
800	vmovdqa		%xmm15,0xe0(%r11)
801.Ldo_avx_body:
802___
803$code.=<<___;
804	sub		\$64,$len
805	lea		-32($inp),%rax
806	cmovc		%rax,$inp
807
808	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
809	lea		`16*3+64`($ctx),$ctx	# size optimization
810	lea		.Lconst(%rip),%rcx
811
812	################################################################
813	# load input
814	vmovdqu		16*2($inp),$T0
815	vmovdqu		16*3($inp),$T1
816	vmovdqa		64(%rcx),$MASK		# .Lmask26
817
818	vpsrldq		\$6,$T0,$T2		# splat input
819	vpsrldq		\$6,$T1,$T3
820	vpunpckhqdq	$T1,$T0,$T4		# 4
821	vpunpcklqdq	$T1,$T0,$T0		# 0:1
822	vpunpcklqdq	$T3,$T2,$T3		# 2:3
823
824	vpsrlq		\$40,$T4,$T4		# 4
825	vpsrlq		\$26,$T0,$T1
826	vpand		$MASK,$T0,$T0		# 0
827	vpsrlq		\$4,$T3,$T2
828	vpand		$MASK,$T1,$T1		# 1
829	vpsrlq		\$30,$T3,$T3
830	vpand		$MASK,$T2,$T2		# 2
831	vpand		$MASK,$T3,$T3		# 3
832	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
833
834	jbe		.Lskip_loop_avx
835
836	# expand and copy pre-calculated table to stack
837	vmovdqu		`16*1-64`($ctx),$D1
838	vmovdqu		`16*2-64`($ctx),$D2
839	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
840	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
841	vmovdqa		$D3,-0x90(%r11)
842	vmovdqa		$D0,0x00(%rsp)
843	vpshufd		\$0xEE,$D1,$D4
844	vmovdqu		`16*3-64`($ctx),$D0
845	vpshufd		\$0x44,$D1,$D1
846	vmovdqa		$D4,-0x80(%r11)
847	vmovdqa		$D1,0x10(%rsp)
848	vpshufd		\$0xEE,$D2,$D3
849	vmovdqu		`16*4-64`($ctx),$D1
850	vpshufd		\$0x44,$D2,$D2
851	vmovdqa		$D3,-0x70(%r11)
852	vmovdqa		$D2,0x20(%rsp)
853	vpshufd		\$0xEE,$D0,$D4
854	vmovdqu		`16*5-64`($ctx),$D2
855	vpshufd		\$0x44,$D0,$D0
856	vmovdqa		$D4,-0x60(%r11)
857	vmovdqa		$D0,0x30(%rsp)
858	vpshufd		\$0xEE,$D1,$D3
859	vmovdqu		`16*6-64`($ctx),$D0
860	vpshufd		\$0x44,$D1,$D1
861	vmovdqa		$D3,-0x50(%r11)
862	vmovdqa		$D1,0x40(%rsp)
863	vpshufd		\$0xEE,$D2,$D4
864	vmovdqu		`16*7-64`($ctx),$D1
865	vpshufd		\$0x44,$D2,$D2
866	vmovdqa		$D4,-0x40(%r11)
867	vmovdqa		$D2,0x50(%rsp)
868	vpshufd		\$0xEE,$D0,$D3
869	vmovdqu		`16*8-64`($ctx),$D2
870	vpshufd		\$0x44,$D0,$D0
871	vmovdqa		$D3,-0x30(%r11)
872	vmovdqa		$D0,0x60(%rsp)
873	vpshufd		\$0xEE,$D1,$D4
874	vpshufd		\$0x44,$D1,$D1
875	vmovdqa		$D4,-0x20(%r11)
876	vmovdqa		$D1,0x70(%rsp)
877	vpshufd		\$0xEE,$D2,$D3
878	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
879	vpshufd		\$0x44,$D2,$D2
880	vmovdqa		$D3,-0x10(%r11)
881	vmovdqa		$D2,0x80(%rsp)
882
883	jmp		.Loop_avx
884
885.align	32
886.Loop_avx:
887	################################################################
888	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
889	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
890	#   \___________________/
891	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
892	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
893	#   \___________________/ \____________________/
894	#
895	# Note that we start with inp[2:3]*r^2. This is because it
896	# doesn't depend on reduction in previous iteration.
897	################################################################
898	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
899	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
900	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
901	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
902	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
903	#
904	# though note that $Tx and $Hx are "reversed" in this section,
905	# and $D4 is preloaded with r0^2...
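	#
	# As an illustrative rule of thumb behind all of the d0..d4 formulas
	# above: with 26-bit limbs and s[j] = 5*r[j],
	#	d[i] = sum over j <= i of h[j]*r[i-j]
	#	     + sum over j >  i of h[j]*s[i-j+5]
	# because limb products that land at or beyond 2^130 wrap around
	# multiplied by 5 (2^130 = 5 mod p).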
906
907	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
908	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
909	  vmovdqa	$H2,0x20(%r11)				# offload hash
	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
911	 vmovdqa	0x10(%rsp),$H2		# r1^2
912	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
913	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
914
915	  vmovdqa	$H0,0x00(%r11)				#
916	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
917	  vmovdqa	$H1,0x10(%r11)				#
918	vpmuludq	$T3,$H2,$H1		# h3*r1
919	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
920	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
921	  vmovdqa	$H3,0x30(%r11)				#
922	vpmuludq	$T2,$H2,$H0		# h2*r1
923	vpmuludq	$T1,$H2,$H1		# h1*r1
924	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
925	 vmovdqa	0x30(%rsp),$H3		# r2^2
926	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
927	  vmovdqa	$H4,0x40(%r11)				#
928	vpmuludq	$T0,$H2,$H2		# h0*r1
929	 vpmuludq	$T2,$H3,$H0		# h2*r2
930	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
931
932	 vmovdqa	0x40(%rsp),$H4		# s2^2
933	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
934	vpmuludq	$T1,$H3,$H1		# h1*r2
935	vpmuludq	$T0,$H3,$H3		# h0*r2
936	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
937	 vmovdqa	0x50(%rsp),$H2		# r3^2
938	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
939	vpmuludq	$T4,$H4,$H0		# h4*s2
940	vpmuludq	$T3,$H4,$H4		# h3*s2
941	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
942	 vmovdqa	0x60(%rsp),$H3		# s3^2
943	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
944
945	 vmovdqa	0x80(%rsp),$H4		# s4^2
946	vpmuludq	$T1,$H2,$H1		# h1*r3
947	vpmuludq	$T0,$H2,$H2		# h0*r3
948	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
949	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
950	vpmuludq	$T4,$H3,$H0		# h4*s3
951	vpmuludq	$T3,$H3,$H1		# h3*s3
952	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
953	 vmovdqu	16*0($inp),$H0				# load input
954	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
955	vpmuludq	$T2,$H3,$H3		# h2*s3
956	 vpmuludq	$T2,$H4,$T2		# h2*s4
957	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
958
959	 vmovdqu	16*1($inp),$H1				#
960	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
961	vpmuludq	$T3,$H4,$T3		# h3*s4
962	vpmuludq	$T4,$H4,$T4		# h4*s4
963	 vpsrldq	\$6,$H0,$H2				# splat input
964	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
965	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
966	 vpsrldq	\$6,$H1,$H3				#
967	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
968	vpmuludq	$T1,$H4,$T0		# h1*s4
969	 vpunpckhqdq	$H1,$H0,$H4		# 4
970	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
971	 vmovdqa	-0x90(%r11),$T4		# r0^4
972	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
973
974	vpunpcklqdq	$H1,$H0,$H0		# 0:1
975	vpunpcklqdq	$H3,$H2,$H3		# 2:3
976
977	#vpsrlq		\$40,$H4,$H4		# 4
978	vpsrldq		\$`40/8`,$H4,$H4	# 4
979	vpsrlq		\$26,$H0,$H1
980	vpand		$MASK,$H0,$H0		# 0
981	vpsrlq		\$4,$H3,$H2
982	vpand		$MASK,$H1,$H1		# 1
983	vpand		0(%rcx),$H4,$H4		# .Lmask24
984	vpsrlq		\$30,$H3,$H3
985	vpand		$MASK,$H2,$H2		# 2
986	vpand		$MASK,$H3,$H3		# 3
987	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
988
989	vpaddq		0x00(%r11),$H0,$H0	# add hash value
990	vpaddq		0x10(%r11),$H1,$H1
991	vpaddq		0x20(%r11),$H2,$H2
992	vpaddq		0x30(%r11),$H3,$H3
993	vpaddq		0x40(%r11),$H4,$H4
994
995	lea		16*2($inp),%rax
996	lea		16*4($inp),$inp
997	sub		\$64,$len
998	cmovc		%rax,$inp
999
1000	################################################################
1001	# Now we accumulate (inp[0:1]+hash)*r^4
1002	################################################################
1003	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1004	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1005	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1006	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1007	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1008
1009	vpmuludq	$H0,$T4,$T0		# h0*r0
1010	vpmuludq	$H1,$T4,$T1		# h1*r0
1011	vpaddq		$T0,$D0,$D0
1012	vpaddq		$T1,$D1,$D1
1013	 vmovdqa	-0x80(%r11),$T2		# r1^4
1014	vpmuludq	$H2,$T4,$T0		# h2*r0
1015	vpmuludq	$H3,$T4,$T1		# h3*r0
1016	vpaddq		$T0,$D2,$D2
1017	vpaddq		$T1,$D3,$D3
1018	vpmuludq	$H4,$T4,$T4		# h4*r0
1019	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
1020	vpaddq		$T4,$D4,$D4
1021
1022	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
1023	vpmuludq	$H2,$T2,$T1		# h2*r1
1024	vpmuludq	$H3,$T2,$T0		# h3*r1
1025	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
1026	 vmovdqa	-0x60(%r11),$T3		# r2^4
1027	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
1028	vpmuludq	$H1,$T2,$T1		# h1*r1
1029	vpmuludq	$H0,$T2,$T2		# h0*r1
1030	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
1031	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
1032
1033	 vmovdqa	-0x50(%r11),$T4		# s2^4
1034	vpmuludq	$H2,$T3,$T0		# h2*r2
1035	vpmuludq	$H1,$T3,$T1		# h1*r2
1036	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
1037	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
1038	 vmovdqa	-0x40(%r11),$T2		# r3^4
1039	vpmuludq	$H0,$T3,$T3		# h0*r2
1040	vpmuludq	$H4,$T4,$T0		# h4*s2
1041	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
1042	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
1043	 vmovdqa	-0x30(%r11),$T3		# s3^4
1044	vpmuludq	$H3,$T4,$T4		# h3*s2
1045	 vpmuludq	$H1,$T2,$T1		# h1*r3
1046	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
1047
1048	 vmovdqa	-0x10(%r11),$T4		# s4^4
1049	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
1050	vpmuludq	$H0,$T2,$T2		# h0*r3
1051	vpmuludq	$H4,$T3,$T0		# h4*s3
1052	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
1053	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
1054	 vmovdqu	16*2($inp),$T0				# load input
1055	vpmuludq	$H3,$T3,$T2		# h3*s3
1056	vpmuludq	$H2,$T3,$T3		# h2*s3
1057	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
1058	 vmovdqu	16*3($inp),$T1				#
1059	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
1060
1061	vpmuludq	$H2,$T4,$H2		# h2*s4
1062	vpmuludq	$H3,$T4,$H3		# h3*s4
1063	 vpsrldq	\$6,$T0,$T2				# splat input
1064	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
1065	vpmuludq	$H4,$T4,$H4		# h4*s4
1066	 vpsrldq	\$6,$T1,$T3				#
1067	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
1068	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
1069	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
1070	vpmuludq	$H1,$T4,$H0
1071	 vpunpckhqdq	$T1,$T0,$T4		# 4
1072	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
1073	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
1074
1075	vpunpcklqdq	$T1,$T0,$T0		# 0:1
1076	vpunpcklqdq	$T3,$T2,$T3		# 2:3
1077
1078	#vpsrlq		\$40,$T4,$T4		# 4
1079	vpsrldq		\$`40/8`,$T4,$T4	# 4
1080	vpsrlq		\$26,$T0,$T1
1081	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
1082	vpand		$MASK,$T0,$T0		# 0
1083	vpsrlq		\$4,$T3,$T2
1084	vpand		$MASK,$T1,$T1		# 1
1085	vpand		0(%rcx),$T4,$T4		# .Lmask24
1086	vpsrlq		\$30,$T3,$T3
1087	vpand		$MASK,$T2,$T2		# 2
1088	vpand		$MASK,$T3,$T3		# 3
1089	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
1090
1091	################################################################
1092	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1093	# and P. Schwabe
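	#
	# Illustrative sketch of one carry pass over the 26-bit limbs (the
	# code below performs these steps in a carefully interleaved order
	# rather than as a single sequential pass):
	#	c = h[i] >> 26; h[i] &= 0x3ffffff; h[i+1] += c;   for i = 0..3
	#	c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += 5*c;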
1094
1095	vpsrlq		\$26,$H3,$D3
1096	vpand		$MASK,$H3,$H3
1097	vpaddq		$D3,$H4,$H4		# h3 -> h4
1098
1099	vpsrlq		\$26,$H0,$D0
1100	vpand		$MASK,$H0,$H0
1101	vpaddq		$D0,$D1,$H1		# h0 -> h1
1102
1103	vpsrlq		\$26,$H4,$D0
1104	vpand		$MASK,$H4,$H4
1105
1106	vpsrlq		\$26,$H1,$D1
1107	vpand		$MASK,$H1,$H1
1108	vpaddq		$D1,$H2,$H2		# h1 -> h2
1109
1110	vpaddq		$D0,$H0,$H0
1111	vpsllq		\$2,$D0,$D0
1112	vpaddq		$D0,$H0,$H0		# h4 -> h0
1113
1114	vpsrlq		\$26,$H2,$D2
1115	vpand		$MASK,$H2,$H2
1116	vpaddq		$D2,$H3,$H3		# h2 -> h3
1117
1118	vpsrlq		\$26,$H0,$D0
1119	vpand		$MASK,$H0,$H0
1120	vpaddq		$D0,$H1,$H1		# h0 -> h1
1121
1122	vpsrlq		\$26,$H3,$D3
1123	vpand		$MASK,$H3,$H3
1124	vpaddq		$D3,$H4,$H4		# h3 -> h4
1125
1126	ja		.Loop_avx
1127
1128.Lskip_loop_avx:
1129	################################################################
1130	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1131
1132	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
1133	add		\$32,$len
1134	jnz		.Long_tail_avx
1135
1136	vpaddq		$H2,$T2,$T2
1137	vpaddq		$H0,$T0,$T0
1138	vpaddq		$H1,$T1,$T1
1139	vpaddq		$H3,$T3,$T3
1140	vpaddq		$H4,$T4,$T4
1141
1142.Long_tail_avx:
1143	vmovdqa		$H2,0x20(%r11)
1144	vmovdqa		$H0,0x00(%r11)
1145	vmovdqa		$H1,0x10(%r11)
1146	vmovdqa		$H3,0x30(%r11)
1147	vmovdqa		$H4,0x40(%r11)
1148
1149	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1150	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1151	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1152	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1153	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1154
1155	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
1156	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
1157	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
1158	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
1159	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
1160	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
1161
1162	vpmuludq	$T3,$H2,$H0		# h3*r1
1163	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
1164	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
1165	vpmuludq	$T2,$H2,$H1		# h2*r1
1166	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
1167	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
1168	vpmuludq	$T1,$H2,$H0		# h1*r1
1169	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
1170	vpmuludq	$T0,$H2,$H2		# h0*r1
1171	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
1172	vpmuludq	$T4,$H3,$H3		# h4*s1
1173	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
1174
1175	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
1176	vpmuludq	$T2,$H4,$H1		# h2*r2
1177	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
1178	vpmuludq	$T1,$H4,$H0		# h1*r2
1179	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
1180	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
1181	vpmuludq	$T0,$H4,$H4		# h0*r2
1182	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
1183	vpmuludq	$T4,$H2,$H1		# h4*s2
1184	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
1185	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
1186	vpmuludq	$T3,$H2,$H2		# h3*s2
1187	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
1188
1189	vpmuludq	$T1,$H3,$H0		# h1*r3
1190	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
1191	vpmuludq	$T0,$H3,$H3		# h0*r3
1192	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
1193	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
1194	vpmuludq	$T4,$H4,$H1		# h4*s3
1195	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
1196	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
1197	vpmuludq	$T3,$H4,$H0		# h3*s3
1198	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
1199	vpmuludq	$T2,$H4,$H4		# h2*s3
1200	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
1201
1202	vpmuludq	$T0,$H2,$H2		# h0*r4
1203	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
1204	vpmuludq	$T4,$H3,$H1		# h4*s4
1205	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
1206	vpmuludq	$T3,$H3,$H0		# h3*s4
1207	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
1208	vpmuludq	$T2,$H3,$H1		# h2*s4
1209	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
1210	vpmuludq	$T1,$H3,$H3		# h1*s4
1211	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
1212
1213	jz		.Lshort_tail_avx
1214
1215	vmovdqu		16*0($inp),$H0		# load input
1216	vmovdqu		16*1($inp),$H1
1217
1218	vpsrldq		\$6,$H0,$H2		# splat input
1219	vpsrldq		\$6,$H1,$H3
1220	vpunpckhqdq	$H1,$H0,$H4		# 4
1221	vpunpcklqdq	$H1,$H0,$H0		# 0:1
1222	vpunpcklqdq	$H3,$H2,$H3		# 2:3
1223
1224	vpsrlq		\$40,$H4,$H4		# 4
1225	vpsrlq		\$26,$H0,$H1
1226	vpand		$MASK,$H0,$H0		# 0
1227	vpsrlq		\$4,$H3,$H2
1228	vpand		$MASK,$H1,$H1		# 1
1229	vpsrlq		\$30,$H3,$H3
1230	vpand		$MASK,$H2,$H2		# 2
1231	vpand		$MASK,$H3,$H3		# 3
1232	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
1233
1234	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
1235	vpaddq		0x00(%r11),$H0,$H0
1236	vpaddq		0x10(%r11),$H1,$H1
1237	vpaddq		0x20(%r11),$H2,$H2
1238	vpaddq		0x30(%r11),$H3,$H3
1239	vpaddq		0x40(%r11),$H4,$H4
1240
1241	################################################################
1242	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1243
1244	vpmuludq	$H0,$T4,$T0		# h0*r0
1245	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
1246	vpmuludq	$H1,$T4,$T1		# h1*r0
1247	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
1248	vpmuludq	$H2,$T4,$T0		# h2*r0
1249	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
1250	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
1251	vpmuludq	$H3,$T4,$T1		# h3*r0
1252	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
1253	vpmuludq	$H4,$T4,$T4		# h4*r0
1254	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
1255
1256	vpmuludq	$H3,$T2,$T0		# h3*r1
1257	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
1258	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
1259	vpmuludq	$H2,$T2,$T1		# h2*r1
1260	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
1261	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
1262	vpmuludq	$H1,$T2,$T0		# h1*r1
1263	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
1264	vpmuludq	$H0,$T2,$T2		# h0*r1
1265	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
1266	vpmuludq	$H4,$T3,$T3		# h4*s1
1267	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
1268
1269	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
1270	vpmuludq	$H2,$T4,$T1		# h2*r2
1271	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
1272	vpmuludq	$H1,$T4,$T0		# h1*r2
1273	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
1274	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
1275	vpmuludq	$H0,$T4,$T4		# h0*r2
1276	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
1277	vpmuludq	$H4,$T2,$T1		# h4*s2
1278	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
1279	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
1280	vpmuludq	$H3,$T2,$T2		# h3*s2
1281	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
1282
1283	vpmuludq	$H1,$T3,$T0		# h1*r3
1284	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
1285	vpmuludq	$H0,$T3,$T3		# h0*r3
1286	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
1287	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
1288	vpmuludq	$H4,$T4,$T1		# h4*s3
1289	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
1290	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
1291	vpmuludq	$H3,$T4,$T0		# h3*s3
1292	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
1293	vpmuludq	$H2,$T4,$T4		# h2*s3
1294	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
1295
1296	vpmuludq	$H0,$T2,$T2		# h0*r4
1297	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
1298	vpmuludq	$H4,$T3,$T1		# h4*s4
1299	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
1300	vpmuludq	$H3,$T3,$T0		# h3*s4
1301	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
1302	vpmuludq	$H2,$T3,$T1		# h2*s4
1303	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
1304	vpmuludq	$H1,$T3,$T3		# h1*s4
1305	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
1306
1307.Lshort_tail_avx:
1308	################################################################
1309	# horizontal addition
1310
1311	vpsrldq		\$8,$D4,$T4
1312	vpsrldq		\$8,$D3,$T3
1313	vpsrldq		\$8,$D1,$T1
1314	vpsrldq		\$8,$D0,$T0
1315	vpsrldq		\$8,$D2,$T2
1316	vpaddq		$T3,$D3,$D3
1317	vpaddq		$T4,$D4,$D4
1318	vpaddq		$T0,$D0,$D0
1319	vpaddq		$T1,$D1,$D1
1320	vpaddq		$T2,$D2,$D2
1321
1322	################################################################
1323	# lazy reduction
1324
1325	vpsrlq		\$26,$D3,$H3
1326	vpand		$MASK,$D3,$D3
1327	vpaddq		$H3,$D4,$D4		# h3 -> h4
1328
1329	vpsrlq		\$26,$D0,$H0
1330	vpand		$MASK,$D0,$D0
1331	vpaddq		$H0,$D1,$D1		# h0 -> h1
1332
1333	vpsrlq		\$26,$D4,$H4
1334	vpand		$MASK,$D4,$D4
1335
1336	vpsrlq		\$26,$D1,$H1
1337	vpand		$MASK,$D1,$D1
1338	vpaddq		$H1,$D2,$D2		# h1 -> h2
1339
1340	vpaddq		$H4,$D0,$D0
1341	vpsllq		\$2,$H4,$H4
1342	vpaddq		$H4,$D0,$D0		# h4 -> h0
1343
1344	vpsrlq		\$26,$D2,$H2
1345	vpand		$MASK,$D2,$D2
1346	vpaddq		$H2,$D3,$D3		# h2 -> h3
1347
1348	vpsrlq		\$26,$D0,$H0
1349	vpand		$MASK,$D0,$D0
1350	vpaddq		$H0,$D1,$D1		# h0 -> h1
1351
1352	vpsrlq		\$26,$D3,$H3
1353	vpand		$MASK,$D3,$D3
1354	vpaddq		$H3,$D4,$D4		# h3 -> h4
1355
1356	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
1357	vmovd		$D1,`4*1-48-64`($ctx)
1358	vmovd		$D2,`4*2-48-64`($ctx)
1359	vmovd		$D3,`4*3-48-64`($ctx)
1360	vmovd		$D4,`4*4-48-64`($ctx)
1361___
1362$code.=<<___	if ($win64);
1363	vmovdqa		0x50(%r11),%xmm6
1364	vmovdqa		0x60(%r11),%xmm7
1365	vmovdqa		0x70(%r11),%xmm8
1366	vmovdqa		0x80(%r11),%xmm9
1367	vmovdqa		0x90(%r11),%xmm10
1368	vmovdqa		0xa0(%r11),%xmm11
1369	vmovdqa		0xb0(%r11),%xmm12
1370	vmovdqa		0xc0(%r11),%xmm13
1371	vmovdqa		0xd0(%r11),%xmm14
1372	vmovdqa		0xe0(%r11),%xmm15
1373	lea		0xf8(%r11),%rsp
1374.Ldo_avx_epilogue:
1375___
1376$code.=<<___	if (!$win64);
1377	lea		0x58(%r11),%rsp
1378.cfi_def_cfa		%rsp,8
1379___
1380$code.=<<___;
1381	vzeroupper
1382	ret
1383.cfi_endproc
1384.size	poly1305_blocks_avx,.-poly1305_blocks_avx
1385
1386.type	poly1305_emit_avx,\@function,3
1387.align	32
1388poly1305_emit_avx:
1389.cfi_startproc
1390	endbranch
1391	cmpl	\$0,20($ctx)	# is_base2_26?
1392	je	.Lemit
1393
1394	mov	0($ctx),%eax	# load hash value base 2^26
1395	mov	4($ctx),%ecx
1396	mov	8($ctx),%r8d
1397	mov	12($ctx),%r11d
1398	mov	16($ctx),%r10d
1399
1400	shl	\$26,%rcx	# base 2^26 -> base 2^64
1401	mov	%r8,%r9
1402	shl	\$52,%r8
1403	add	%rcx,%rax
1404	shr	\$12,%r9
1405	add	%rax,%r8	# h0
1406	adc	\$0,%r9
1407
1408	shl	\$14,%r11
1409	mov	%r10,%rax
1410	shr	\$24,%r10
1411	add	%r11,%r9
1412	shl	\$40,%rax
1413	add	%rax,%r9	# h1
1414	adc	\$0,%r10	# h2
1415
1416	mov	%r10,%rax	# could be partially reduced, so reduce
1417	mov	%r10,%rcx
1418	and	\$3,%r10
1419	shr	\$2,%rax
1420	and	\$-4,%rcx
1421	add	%rcx,%rax
1422	add	%rax,%r8
1423	adc	\$0,%r9
1424	adc	\$0,%r10
1425
1426	mov	%r8,%rax
1427	add	\$5,%r8		# compare to modulus
1428	mov	%r9,%rcx
1429	adc	\$0,%r9
1430	adc	\$0,%r10
1431	shr	\$2,%r10	# did 130-bit value overflow?
1432	cmovnz	%r8,%rax
1433	cmovnz	%r9,%rcx
1434
1435	add	0($nonce),%rax	# accumulate nonce
1436	adc	8($nonce),%rcx
1437	mov	%rax,0($mac)	# write result
1438	mov	%rcx,8($mac)
1439
1440	ret
1441.cfi_endproc
1442.size	poly1305_emit_avx,.-poly1305_emit_avx
1443___
1444
1445if ($avx>1) {
1446my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1447    map("%ymm$_",(0..15));
1448my $S4=$MASK;
1449
1450$code.=<<___;
1451.type	poly1305_blocks_avx2,\@function,4
1452.align	32
1453poly1305_blocks_avx2:
1454.cfi_startproc
1455	endbranch
1456	mov	20($ctx),%r8d		# is_base2_26
1457	cmp	\$128,$len
1458	jae	.Lblocks_avx2
1459	test	%r8d,%r8d
1460	jz	.Lblocks
1461
1462.Lblocks_avx2:
1463	and	\$-16,$len
1464	jz	.Lno_data_avx2
1465
1466	vzeroupper
1467
1468	test	%r8d,%r8d
1469	jz	.Lbase2_64_avx2
1470
1471	test	\$63,$len
1472	jz	.Leven_avx2
1473
1474	push	%rbx
1475.cfi_push	%rbx
1476	push	%rbp
1477.cfi_push	%rbp
1478	push	%r12
1479.cfi_push	%r12
1480	push	%r13
1481.cfi_push	%r13
1482	push	%r14
1483.cfi_push	%r14
1484	push	%r15
1485.cfi_push	%r15
1486.Lblocks_avx2_body:
1487
1488	mov	$len,%r15		# reassign $len
1489
1490	mov	0($ctx),$d1		# load hash value
1491	mov	8($ctx),$d2
1492	mov	16($ctx),$h2#d
1493
1494	mov	24($ctx),$r0		# load r
1495	mov	32($ctx),$s1
1496
1497	################################# base 2^26 -> base 2^64
1498	mov	$d1#d,$h0#d
1499	and	\$`-1*(1<<31)`,$d1
1500	mov	$d2,$r1			# borrow $r1
1501	mov	$d2#d,$h1#d
1502	and	\$`-1*(1<<31)`,$d2
1503
1504	shr	\$6,$d1
1505	shl	\$52,$r1
1506	add	$d1,$h0
1507	shr	\$12,$h1
1508	shr	\$18,$d2
1509	add	$r1,$h0
1510	adc	$d2,$h1
1511
1512	mov	$h2,$d1
1513	shl	\$40,$d1
1514	shr	\$24,$h2
1515	add	$d1,$h1
1516	adc	\$0,$h2			# can be partially reduced...
1517
1518	mov	\$-4,$d2		# ... so reduce
1519	mov	$h2,$d1
1520	and	$h2,$d2
1521	shr	\$2,$d1
1522	and	\$3,$h2
1523	add	$d2,$d1			# =*5
1524	add	$d1,$h0
1525	adc	\$0,$h1
1526	adc	\$0,$h2
1527
1528	mov	$s1,$r1
1529	mov	$s1,%rax
1530	shr	\$2,$s1
1531	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
1532
1533.Lbase2_26_pre_avx2:
1534	add	0($inp),$h0		# accumulate input
1535	adc	8($inp),$h1
1536	lea	16($inp),$inp
1537	adc	$padbit,$h2
1538	sub	\$16,%r15
1539
1540	call	__poly1305_block
1541	mov	$r1,%rax
1542
1543	test	\$63,%r15
1544	jnz	.Lbase2_26_pre_avx2
1545
1546	test	$padbit,$padbit		# if $padbit is zero,
1547	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format
1548
1549	################################# base 2^64 -> base 2^26
1550	mov	$h0,%rax
1551	mov	$h0,%rdx
1552	shr	\$52,$h0
1553	mov	$h1,$r0
1554	mov	$h1,$r1
1555	shr	\$26,%rdx
1556	and	\$0x3ffffff,%rax	# h[0]
1557	shl	\$12,$r0
1558	and	\$0x3ffffff,%rdx	# h[1]
1559	shr	\$14,$h1
1560	or	$r0,$h0
1561	shl	\$24,$h2
1562	and	\$0x3ffffff,$h0		# h[2]
1563	shr	\$40,$r1
1564	and	\$0x3ffffff,$h1		# h[3]
1565	or	$r1,$h2			# h[4]
1566
1567	test	%r15,%r15
1568	jz	.Lstore_base2_26_avx2
1569
1570	vmovd	%rax#d,%x#$H0
1571	vmovd	%rdx#d,%x#$H1
1572	vmovd	$h0#d,%x#$H2
1573	vmovd	$h1#d,%x#$H3
1574	vmovd	$h2#d,%x#$H4
1575	jmp	.Lproceed_avx2
1576
1577.align	32
1578.Lstore_base2_64_avx2:
1579	mov	$h0,0($ctx)
1580	mov	$h1,8($ctx)
1581	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
1582	jmp	.Ldone_avx2
1583
1584.align	16
1585.Lstore_base2_26_avx2:
1586	mov	%rax#d,0($ctx)		# store hash value base 2^26
1587	mov	%rdx#d,4($ctx)
1588	mov	$h0#d,8($ctx)
1589	mov	$h1#d,12($ctx)
1590	mov	$h2#d,16($ctx)
1591.align	16
1592.Ldone_avx2:
1593	mov	0(%rsp),%r15
1594.cfi_restore	%r15
1595	mov	8(%rsp),%r14
1596.cfi_restore	%r14
1597	mov	16(%rsp),%r13
1598.cfi_restore	%r13
1599	mov	24(%rsp),%r12
1600.cfi_restore	%r12
1601	mov	32(%rsp),%rbp
1602.cfi_restore	%rbp
1603	mov	40(%rsp),%rbx
1604.cfi_restore	%rbx
1605	lea	48(%rsp),%rsp
1606.cfi_adjust_cfa_offset	-48
1607.Lno_data_avx2:
1608.Lblocks_avx2_epilogue:
1609	ret
1610.cfi_endproc
1611
1612.align	32
1613.Lbase2_64_avx2:
1614.cfi_startproc
1615	push	%rbx
1616.cfi_push	%rbx
1617	push	%rbp
1618.cfi_push	%rbp
1619	push	%r12
1620.cfi_push	%r12
1621	push	%r13
1622.cfi_push	%r13
1623	push	%r14
1624.cfi_push	%r14
1625	push	%r15
1626.cfi_push	%r15
1627.Lbase2_64_avx2_body:
1628
1629	mov	$len,%r15		# reassign $len
1630
1631	mov	24($ctx),$r0		# load r
1632	mov	32($ctx),$s1
1633
1634	mov	0($ctx),$h0		# load hash value
1635	mov	8($ctx),$h1
1636	mov	16($ctx),$h2#d
1637
1638	mov	$s1,$r1
1639	mov	$s1,%rax
1640	shr	\$2,$s1
1641	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
1642
1643	test	\$63,$len
1644	jz	.Linit_avx2
1645
1646.Lbase2_64_pre_avx2:
1647	add	0($inp),$h0		# accumulate input
1648	adc	8($inp),$h1
1649	lea	16($inp),$inp
1650	adc	$padbit,$h2
1651	sub	\$16,%r15
1652
1653	call	__poly1305_block
1654	mov	$r1,%rax
1655
1656	test	\$63,%r15
1657	jnz	.Lbase2_64_pre_avx2
1658
1659.Linit_avx2:
1660	################################# base 2^64 -> base 2^26
1661	mov	$h0,%rax
1662	mov	$h0,%rdx
1663	shr	\$52,$h0
1664	mov	$h1,$d1
1665	mov	$h1,$d2
1666	shr	\$26,%rdx
1667	and	\$0x3ffffff,%rax	# h[0]
1668	shl	\$12,$d1
1669	and	\$0x3ffffff,%rdx	# h[1]
1670	shr	\$14,$h1
1671	or	$d1,$h0
1672	shl	\$24,$h2
1673	and	\$0x3ffffff,$h0		# h[2]
1674	shr	\$40,$d2
1675	and	\$0x3ffffff,$h1		# h[3]
1676	or	$d2,$h2			# h[4]
1677
1678	vmovd	%rax#d,%x#$H0
1679	vmovd	%rdx#d,%x#$H1
1680	vmovd	$h0#d,%x#$H2
1681	vmovd	$h1#d,%x#$H3
1682	vmovd	$h2#d,%x#$H4
1683	movl	\$1,20($ctx)		# set is_base2_26
1684
1685	call	__poly1305_init_avx
1686
1687.Lproceed_avx2:
1688	mov	%r15,$len			# restore $len
1689	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
1690	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
1691
1692	mov	0(%rsp),%r15
1693.cfi_restore	%r15
1694	mov	8(%rsp),%r14
1695.cfi_restore	%r14
1696	mov	16(%rsp),%r13
1697.cfi_restore	%r13
1698	mov	24(%rsp),%r12
1699.cfi_restore	%r12
1700	mov	32(%rsp),%rbp
1701.cfi_restore	%rbp
1702	mov	40(%rsp),%rbx
1703.cfi_restore	%rbx
1704	lea	48(%rsp),%rax
1705	lea	48(%rsp),%rsp
1706.cfi_adjust_cfa_offset	-48
1707.Lbase2_64_avx2_epilogue:
1708	jmp	.Ldo_avx2
1709.cfi_endproc
1710
1711.align	32
1712.Leven_avx2:
1713.cfi_startproc
1714	mov		OPENSSL_ia32cap_P+8(%rip),%r10d
1715	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
1716	vmovd		4*1($ctx),%x#$H1
1717	vmovd		4*2($ctx),%x#$H2
1718	vmovd		4*3($ctx),%x#$H3
1719	vmovd		4*4($ctx),%x#$H4
1720
1721.Ldo_avx2:
1722___
1723$code.=<<___		if ($avx>2);
1724	cmp		\$512,$len
1725	jb		.Lskip_avx512
1726	and		%r11d,%r10d
1727	test		\$`1<<16`,%r10d		# check for AVX512F
1728	jnz		.Lblocks_avx512
1729.Lskip_avx512:
1730___
1731$code.=<<___	if (!$win64);
1732	lea		-8(%rsp),%r11
1733.cfi_def_cfa		%r11,16
1734	sub		\$0x128,%rsp
1735___
1736$code.=<<___	if ($win64);
1737	lea		-0xf8(%rsp),%r11
1738	sub		\$0x1c8,%rsp
1739	vmovdqa		%xmm6,0x50(%r11)
1740	vmovdqa		%xmm7,0x60(%r11)
1741	vmovdqa		%xmm8,0x70(%r11)
1742	vmovdqa		%xmm9,0x80(%r11)
1743	vmovdqa		%xmm10,0x90(%r11)
1744	vmovdqa		%xmm11,0xa0(%r11)
1745	vmovdqa		%xmm12,0xb0(%r11)
1746	vmovdqa		%xmm13,0xc0(%r11)
1747	vmovdqa		%xmm14,0xd0(%r11)
1748	vmovdqa		%xmm15,0xe0(%r11)
1749.Ldo_avx2_body:
1750___
1751$code.=<<___;
1752	lea		.Lconst(%rip),%rcx
1753	lea		48+64($ctx),$ctx	# size optimization
1754	vmovdqa		96(%rcx),$T0		# .Lpermd_avx2
1755
1756	# expand and copy pre-calculated table to stack
1757	vmovdqu		`16*0-64`($ctx),%x#$T2
1758	and		\$-512,%rsp
1759	vmovdqu		`16*1-64`($ctx),%x#$T3
1760	vmovdqu		`16*2-64`($ctx),%x#$T4
1761	vmovdqu		`16*3-64`($ctx),%x#$D0
1762	vmovdqu		`16*4-64`($ctx),%x#$D1
1763	vmovdqu		`16*5-64`($ctx),%x#$D2
1764	lea		0x90(%rsp),%rax		# size optimization
1765	vmovdqu		`16*6-64`($ctx),%x#$D3
1766	vpermd		$T2,$T0,$T2		# 00003412 -> 14243444
1767	vmovdqu		`16*7-64`($ctx),%x#$D4
1768	vpermd		$T3,$T0,$T3
1769	vmovdqu		`16*8-64`($ctx),%x#$MASK
1770	vpermd		$T4,$T0,$T4
1771	vmovdqa		$T2,0x00(%rsp)
1772	vpermd		$D0,$T0,$D0
1773	vmovdqa		$T3,0x20-0x90(%rax)
1774	vpermd		$D1,$T0,$D1
1775	vmovdqa		$T4,0x40-0x90(%rax)
1776	vpermd		$D2,$T0,$D2
1777	vmovdqa		$D0,0x60-0x90(%rax)
1778	vpermd		$D3,$T0,$D3
1779	vmovdqa		$D1,0x80-0x90(%rax)
1780	vpermd		$D4,$T0,$D4
1781	vmovdqa		$D2,0xa0-0x90(%rax)
1782	vpermd		$MASK,$T0,$MASK
1783	vmovdqa		$D3,0xc0-0x90(%rax)
1784	vmovdqa		$D4,0xe0-0x90(%rax)
1785	vmovdqa		$MASK,0x100-0x90(%rax)
1786	vmovdqa		64(%rcx),$MASK		# .Lmask26
1787
1788	################################################################
1789	# load input
1790	vmovdqu		16*0($inp),%x#$T0
1791	vmovdqu		16*1($inp),%x#$T1
1792	vinserti128	\$1,16*2($inp),$T0,$T0
1793	vinserti128	\$1,16*3($inp),$T1,$T1
1794	lea		16*4($inp),$inp
1795
1796	vpsrldq		\$6,$T0,$T2		# splat input
1797	vpsrldq		\$6,$T1,$T3
1798	vpunpckhqdq	$T1,$T0,$T4		# 4
1799	vpunpcklqdq	$T3,$T2,$T2		# 2:3
1800	vpunpcklqdq	$T1,$T0,$T0		# 0:1
1801
1802	vpsrlq		\$30,$T2,$T3
1803	vpsrlq		\$4,$T2,$T2
1804	vpsrlq		\$26,$T0,$T1
1805	vpsrlq		\$40,$T4,$T4		# 4
1806	vpand		$MASK,$T2,$T2		# 2
1807	vpand		$MASK,$T0,$T0		# 0
1808	vpand		$MASK,$T1,$T1		# 1
1809	vpand		$MASK,$T3,$T3		# 3
1810	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
1811
1812	vpaddq		$H2,$T2,$H2		# accumulate input
1813	sub		\$64,$len
1814	jz		.Ltail_avx2
1815	jmp		.Loop_avx2
1816
1817.align	32
1818.Loop_avx2:
1819	################################################################
1820	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1821	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1822	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1823	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1824	#   \________/\__________/
1825	################################################################
1826	#vpaddq		$H2,$T2,$H2		# accumulate input
1827	vpaddq		$H0,$T0,$H0
1828	vmovdqa		`32*0`(%rsp),$T0	# r0^4
1829	vpaddq		$H1,$T1,$H1
1830	vmovdqa		`32*1`(%rsp),$T1	# r1^4
1831	vpaddq		$H3,$T3,$H3
1832	vmovdqa		`32*3`(%rsp),$T2	# r2^4
1833	vpaddq		$H4,$T4,$H4
1834	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
1835	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
1836
1837	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1838	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1839	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1840	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1841	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1842	#
	# however, as h2 is "chronologically" the first one available, pull
	# the corresponding operations up, so that it becomes
1845	#
1846	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
1847	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
1848	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1849	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
1850	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
1851
1852	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
1853	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
1854	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
1855	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
1856	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
1857
1858	vpmuludq	$H0,$T1,$T4		# h0*r1
1859	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
1860	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
1861	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
1862	vpmuludq	$H3,$T1,$T4		# h3*r1
1863	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
1864	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
1865	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
1866	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
1867
1868	vpmuludq	$H0,$T0,$T4		# h0*r0
1869	vpmuludq	$H1,$T0,$H2		# h1*r0
1870	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
1871	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
1872	vpmuludq	$H3,$T0,$T4		# h3*r0
1873	vpmuludq	$H4,$T0,$H2		# h4*r0
1874	 vmovdqu	16*0($inp),%x#$T0	# load input
1875	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
1876	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
1877	 vinserti128	\$1,16*2($inp),$T0,$T0
1878
1879	vpmuludq	$H3,$T1,$T4		# h3*s2
1880	vpmuludq	$H4,$T1,$H2		# h4*s2
1881	 vmovdqu	16*1($inp),%x#$T1
1882	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
1883	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
1884	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
1885	vpmuludq	$H1,$T2,$T4		# h1*r2
1886	vpmuludq	$H0,$T2,$T2		# h0*r2
1887	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
1888	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
1889	 vinserti128	\$1,16*3($inp),$T1,$T1
1890	 lea		16*4($inp),$inp
1891
1892	vpmuludq	$H1,$H2,$T4		# h1*r3
1893	vpmuludq	$H0,$H2,$H2		# h0*r3
1894	 vpsrldq	\$6,$T0,$T2		# splat input
1895	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
1896	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
1897	vpmuludq	$H3,$T3,$T4		# h3*s3
1898	vpmuludq	$H4,$T3,$H2		# h4*s3
1899	 vpsrldq	\$6,$T1,$T3
1900	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
1901	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
1902	 vpunpckhqdq	$T1,$T0,$T4		# 4
1903
1904	vpmuludq	$H3,$S4,$H3		# h3*s4
1905	vpmuludq	$H4,$S4,$H4		# h4*s4
1906	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
1907	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
1908	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
1909	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
1910	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
1911	vpmuludq	$H1,$S4,$H0		# h1*s4
1912	vmovdqa		64(%rcx),$MASK		# .Lmask26
1913	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
1914	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
1915
1916	################################################################
1917	# lazy reduction (interleaved with tail of input splat)
1918
1919	vpsrlq		\$26,$H3,$D3
1920	vpand		$MASK,$H3,$H3
1921	vpaddq		$D3,$H4,$H4		# h3 -> h4
1922
1923	vpsrlq		\$26,$H0,$D0
1924	vpand		$MASK,$H0,$H0
1925	vpaddq		$D0,$D1,$H1		# h0 -> h1
1926
1927	vpsrlq		\$26,$H4,$D4
1928	vpand		$MASK,$H4,$H4
1929
1930	 vpsrlq		\$4,$T3,$T2
1931
1932	vpsrlq		\$26,$H1,$D1
1933	vpand		$MASK,$H1,$H1
1934	vpaddq		$D1,$H2,$H2		# h1 -> h2
1935
1936	vpaddq		$D4,$H0,$H0
1937	vpsllq		\$2,$D4,$D4
1938	vpaddq		$D4,$H0,$H0		# h4 -> h0
1939
1940	 vpand		$MASK,$T2,$T2		# 2
1941	 vpsrlq		\$26,$T0,$T1
1942
1943	vpsrlq		\$26,$H2,$D2
1944	vpand		$MASK,$H2,$H2
1945	vpaddq		$D2,$H3,$H3		# h2 -> h3
1946
1947	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
1948	 vpsrlq		\$30,$T3,$T3
1949
1950	vpsrlq		\$26,$H0,$D0
1951	vpand		$MASK,$H0,$H0
1952	vpaddq		$D0,$H1,$H1		# h0 -> h1
1953
1954	 vpsrlq		\$40,$T4,$T4		# 4
1955
1956	vpsrlq		\$26,$H3,$D3
1957	vpand		$MASK,$H3,$H3
1958	vpaddq		$D3,$H4,$H4		# h3 -> h4
1959
1960	 vpand		$MASK,$T0,$T0		# 0
1961	 vpand		$MASK,$T1,$T1		# 1
1962	 vpand		$MASK,$T3,$T3		# 3
1963	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
1964
1965	sub		\$64,$len
1966	jnz		.Loop_avx2
1967
1968	.byte		0x66,0x90
1969.Ltail_avx2:
	################################################################
	# while the above multiplications were by r^4 in all lanes, in the
	# last iteration we multiply the least significant lane by r^4 and
	# the most significant one by r, so this is a copy of the above,
	# except that references to the precomputed table are displaced by 4...
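	#
	# (Illustrative example: with four blocks in flight the lanes end up
	# multiplied by r^4, r^3, r^2 and r^1 respectively, so that the
	# horizontal addition further down yields the same value as strictly
	# sequential processing.)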
1975
1976	#vpaddq		$H2,$T2,$H2		# accumulate input
1977	vpaddq		$H0,$T0,$H0
1978	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
1979	vpaddq		$H1,$T1,$H1
1980	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
1981	vpaddq		$H3,$T3,$H3
1982	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
1983	vpaddq		$H4,$T4,$H4
1984	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
1985	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
1986
1987	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
1988	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
1989	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
1990	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
1991	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
1992
1993	vpmuludq	$H0,$T1,$T4		# h0*r1
1994	vpmuludq	$H1,$T1,$H2		# h1*r1
1995	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
1996	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
1997	vpmuludq	$H3,$T1,$T4		# h3*r1
1998	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
1999	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
2000	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
2001
2002	vpmuludq	$H0,$T0,$T4		# h0*r0
2003	vpmuludq	$H1,$T0,$H2		# h1*r0
2004	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
2005	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
2006	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
2007	vpmuludq	$H3,$T0,$T4		# h3*r0
2008	vpmuludq	$H4,$T0,$H2		# h4*r0
2009	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
2010	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
2011
2012	vpmuludq	$H3,$T1,$T4		# h3*s2
2013	vpmuludq	$H4,$T1,$H2		# h4*s2
2014	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
2015	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
2016	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
2017	vpmuludq	$H1,$T2,$T4		# h1*r2
2018	vpmuludq	$H0,$T2,$T2		# h0*r2
2019	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
2020	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
2021
2022	vpmuludq	$H1,$H2,$T4		# h1*r3
2023	vpmuludq	$H0,$H2,$H2		# h0*r3
2024	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
2025	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
2026	vpmuludq	$H3,$T3,$T4		# h3*s3
2027	vpmuludq	$H4,$T3,$H2		# h4*s3
2028	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
2029	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
2030
2031	vpmuludq	$H3,$S4,$H3		# h3*s4
2032	vpmuludq	$H4,$S4,$H4		# h4*s4
2033	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
2034	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
2035	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
2036	vpmuludq	$H1,$S4,$H0		# h1*s4
2037	vmovdqa		64(%rcx),$MASK		# .Lmask26
2038	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
2039	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
2040
2041	################################################################
2042	# horizontal addition
2043
2044	vpsrldq		\$8,$D1,$T1
2045	vpsrldq		\$8,$H2,$T2
2046	vpsrldq		\$8,$H3,$T3
2047	vpsrldq		\$8,$H4,$T4
2048	vpsrldq		\$8,$H0,$T0
2049	vpaddq		$T1,$D1,$D1
2050	vpaddq		$T2,$H2,$H2
2051	vpaddq		$T3,$H3,$H3
2052	vpaddq		$T4,$H4,$H4
2053	vpaddq		$T0,$H0,$H0
2054
2055	vpermq		\$0x2,$H3,$T3
2056	vpermq		\$0x2,$H4,$T4
2057	vpermq		\$0x2,$H0,$T0
2058	vpermq		\$0x2,$D1,$T1
2059	vpermq		\$0x2,$H2,$T2
2060	vpaddq		$T3,$H3,$H3
2061	vpaddq		$T4,$H4,$H4
2062	vpaddq		$T0,$H0,$H0
2063	vpaddq		$T1,$D1,$D1
2064	vpaddq		$T2,$H2,$H2
2065
2066	################################################################
2067	# lazy reduction
2068
2069	vpsrlq		\$26,$H3,$D3
2070	vpand		$MASK,$H3,$H3
2071	vpaddq		$D3,$H4,$H4		# h3 -> h4
2072
2073	vpsrlq		\$26,$H0,$D0
2074	vpand		$MASK,$H0,$H0
2075	vpaddq		$D0,$D1,$H1		# h0 -> h1
2076
2077	vpsrlq		\$26,$H4,$D4
2078	vpand		$MASK,$H4,$H4
2079
2080	vpsrlq		\$26,$H1,$D1
2081	vpand		$MASK,$H1,$H1
2082	vpaddq		$D1,$H2,$H2		# h1 -> h2
2083
2084	vpaddq		$D4,$H0,$H0
2085	vpsllq		\$2,$D4,$D4
2086	vpaddq		$D4,$H0,$H0		# h4 -> h0
2087
2088	vpsrlq		\$26,$H2,$D2
2089	vpand		$MASK,$H2,$H2
2090	vpaddq		$D2,$H3,$H3		# h2 -> h3
2091
2092	vpsrlq		\$26,$H0,$D0
2093	vpand		$MASK,$H0,$H0
2094	vpaddq		$D0,$H1,$H1		# h0 -> h1
2095
2096	vpsrlq		\$26,$H3,$D3
2097	vpand		$MASK,$H3,$H3
2098	vpaddq		$D3,$H4,$H4		# h3 -> h4
2099
2100	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
2101	vmovd		%x#$H1,`4*1-48-64`($ctx)
2102	vmovd		%x#$H2,`4*2-48-64`($ctx)
2103	vmovd		%x#$H3,`4*3-48-64`($ctx)
2104	vmovd		%x#$H4,`4*4-48-64`($ctx)
2105___
2106$code.=<<___	if ($win64);
2107	vmovdqa		0x50(%r11),%xmm6
2108	vmovdqa		0x60(%r11),%xmm7
2109	vmovdqa		0x70(%r11),%xmm8
2110	vmovdqa		0x80(%r11),%xmm9
2111	vmovdqa		0x90(%r11),%xmm10
2112	vmovdqa		0xa0(%r11),%xmm11
2113	vmovdqa		0xb0(%r11),%xmm12
2114	vmovdqa		0xc0(%r11),%xmm13
2115	vmovdqa		0xd0(%r11),%xmm14
2116	vmovdqa		0xe0(%r11),%xmm15
2117	lea		0xf8(%r11),%rsp
2118.Ldo_avx2_epilogue:
2119___
2120$code.=<<___	if (!$win64);
2121	lea		8(%r11),%rsp
2122.cfi_def_cfa		%rsp,8
2123___
2124$code.=<<___;
2125	vzeroupper
2126	ret
2127.cfi_endproc
2128.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
2129___
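# The "lazy reduction" carry chain used above (and again in the AVX-512
# path below) can be cross-checked with the plain-integer Perl sketch
# that follows. It is a hedged reference only, is never called by this
# generator, and the subroutine name is illustrative.
sub poly1305_lazy_reduce_base2_26_ref {
	my @h = @_;				# five base 2^26 limbs with pending carries
	my $mask = (1<<26)-1;
	my ($c,$c4);
	$c  = $h[3]>>26; $h[3] &= $mask; $h[4] += $c;	# h3 -> h4
	$c  = $h[0]>>26; $h[0] &= $mask; $h[1] += $c;	# h0 -> h1
	$c4 = $h[4]>>26; $h[4] &= $mask;
	$c  = $h[1]>>26; $h[1] &= $mask; $h[2] += $c;	# h1 -> h2
	$h[0] += $c4 + ($c4<<2);			# h4 -> h0, *5, as 2^130 = 5 mod 2^130-5
	$c  = $h[2]>>26; $h[2] &= $mask; $h[3] += $c;	# h2 -> h3
	$c  = $h[0]>>26; $h[0] &= $mask; $h[1] += $c;	# h0 -> h1
	$c  = $h[3]>>26; $h[3] &= $mask; $h[4] += $c;	# h3 -> h4
	return @h;
}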
2130#######################################################################
2131if ($avx>2) {
# On entry the input length is divisible by 64. But since the inner loop
# processes 128 bytes per iteration, lengths that are not divisible by
# 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
# reason the stack layout is kept identical to poly1305_blocks_avx2. If
# not for this tail, we wouldn't even have to allocate a stack frame...
2137
2138my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2139my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2140my $PADBIT="%zmm30";
2141
2142map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));		# switch to %zmm domain
2143map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2144map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2145map(s/%y/%z/,($MASK));
2146
2147$code.=<<___;
2148.type	poly1305_blocks_avx512,\@function,4
2149.align	32
2150poly1305_blocks_avx512:
2151.cfi_startproc
2152	endbranch
2153.Lblocks_avx512:
2154	mov		\$15,%eax
2155	kmovw		%eax,%k2
2156___
2157$code.=<<___	if (!$win64);
2158	lea		-8(%rsp),%r11
2159.cfi_def_cfa		%r11,16
2160	sub		\$0x128,%rsp
2161___
2162$code.=<<___	if ($win64);
2163	lea		-0xf8(%rsp),%r11
2164	sub		\$0x1c8,%rsp
2165	vmovdqa		%xmm6,0x50(%r11)
2166	vmovdqa		%xmm7,0x60(%r11)
2167	vmovdqa		%xmm8,0x70(%r11)
2168	vmovdqa		%xmm9,0x80(%r11)
2169	vmovdqa		%xmm10,0x90(%r11)
2170	vmovdqa		%xmm11,0xa0(%r11)
2171	vmovdqa		%xmm12,0xb0(%r11)
2172	vmovdqa		%xmm13,0xc0(%r11)
2173	vmovdqa		%xmm14,0xd0(%r11)
2174	vmovdqa		%xmm15,0xe0(%r11)
2175.Ldo_avx512_body:
2176___
2177$code.=<<___;
2178	lea		.Lconst(%rip),%rcx
2179	lea		48+64($ctx),$ctx	# size optimization
2180	vmovdqa		96(%rcx),%y#$T2		# .Lpermd_avx2
2181
2182	# expand pre-calculated table
2183	vmovdqu		`16*0-64`($ctx),%x#$D0	# will become expanded ${R0}
2184	and		\$-512,%rsp
2185	vmovdqu		`16*1-64`($ctx),%x#$D1	# will become ... ${R1}
2186	mov		\$0x20,%rax
2187	vmovdqu		`16*2-64`($ctx),%x#$T0	# ... ${S1}
2188	vmovdqu		`16*3-64`($ctx),%x#$D2	# ... ${R2}
2189	vmovdqu		`16*4-64`($ctx),%x#$T1	# ... ${S2}
2190	vmovdqu		`16*5-64`($ctx),%x#$D3	# ... ${R3}
2191	vmovdqu		`16*6-64`($ctx),%x#$T3	# ... ${S3}
2192	vmovdqu		`16*7-64`($ctx),%x#$D4	# ... ${R4}
2193	vmovdqu		`16*8-64`($ctx),%x#$T4	# ... ${S4}
2194	vpermd		$D0,$T2,$R0		# 00003412 -> 14243444
2195	vpbroadcastq	64(%rcx),$MASK		# .Lmask26
2196	vpermd		$D1,$T2,$R1
2197	vpermd		$T0,$T2,$S1
2198	vpermd		$D2,$T2,$R2
2199	vmovdqa64	$R0,0x00(%rsp){%k2}	# save in case $len%128 != 0
2200	 vpsrlq		\$32,$R0,$T0		# 14243444 -> 01020304
2201	vpermd		$T1,$T2,$S2
2202	vmovdqu64	$R1,0x00(%rsp,%rax){%k2}
2203	 vpsrlq		\$32,$R1,$T1
2204	vpermd		$D3,$T2,$R3
2205	vmovdqa64	$S1,0x40(%rsp){%k2}
2206	vpermd		$T3,$T2,$S3
2207	vpermd		$D4,$T2,$R4
2208	vmovdqu64	$R2,0x40(%rsp,%rax){%k2}
2209	vpermd		$T4,$T2,$S4
2210	vmovdqa64	$S2,0x80(%rsp){%k2}
2211	vmovdqu64	$R3,0x80(%rsp,%rax){%k2}
2212	vmovdqa64	$S3,0xc0(%rsp){%k2}
2213	vmovdqu64	$R4,0xc0(%rsp,%rax){%k2}
2214	vmovdqa64	$S4,0x100(%rsp){%k2}
2215
2216	################################################################
2217	# calculate 5th through 8th powers of the key
2218	#
2219	# d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2220	# d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2221	# d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
2222	# d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
2223	# d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
2224
2225	vpmuludq	$T0,$R0,$D0		# d0 = r0'*r0
2226	vpmuludq	$T0,$R1,$D1		# d1 = r0'*r1
2227	vpmuludq	$T0,$R2,$D2		# d2 = r0'*r2
2228	vpmuludq	$T0,$R3,$D3		# d3 = r0'*r3
2229	vpmuludq	$T0,$R4,$D4		# d4 = r0'*r4
2230	 vpsrlq		\$32,$R2,$T2
2231
2232	vpmuludq	$T1,$S4,$M0
2233	vpmuludq	$T1,$R0,$M1
2234	vpmuludq	$T1,$R1,$M2
2235	vpmuludq	$T1,$R2,$M3
2236	vpmuludq	$T1,$R3,$M4
2237	 vpsrlq		\$32,$R3,$T3
2238	vpaddq		$M0,$D0,$D0		# d0 += r1'*5*r4
2239	vpaddq		$M1,$D1,$D1		# d1 += r1'*r0
2240	vpaddq		$M2,$D2,$D2		# d2 += r1'*r1
2241	vpaddq		$M3,$D3,$D3		# d3 += r1'*r2
2242	vpaddq		$M4,$D4,$D4		# d4 += r1'*r3
2243
2244	vpmuludq	$T2,$S3,$M0
2245	vpmuludq	$T2,$S4,$M1
2246	vpmuludq	$T2,$R1,$M3
2247	vpmuludq	$T2,$R2,$M4
2248	vpmuludq	$T2,$R0,$M2
2249	 vpsrlq		\$32,$R4,$T4
2250	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r3
2251	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r4
2252	vpaddq		$M3,$D3,$D3		# d3 += r2'*r1
2253	vpaddq		$M4,$D4,$D4		# d4 += r2'*r2
2254	vpaddq		$M2,$D2,$D2		# d2 += r2'*r0
2255
2256	vpmuludq	$T3,$S2,$M0
2257	vpmuludq	$T3,$R0,$M3
2258	vpmuludq	$T3,$R1,$M4
2259	vpmuludq	$T3,$S3,$M1
2260	vpmuludq	$T3,$S4,$M2
2261	vpaddq		$M0,$D0,$D0		# d0 += r3'*5*r2
2262	vpaddq		$M3,$D3,$D3		# d3 += r3'*r0
2263	vpaddq		$M4,$D4,$D4		# d4 += r3'*r1
2264	vpaddq		$M1,$D1,$D1		# d1 += r3'*5*r3
2265	vpaddq		$M2,$D2,$D2		# d2 += r3'*5*r4
2266
2267	vpmuludq	$T4,$S4,$M3
2268	vpmuludq	$T4,$R0,$M4
2269	vpmuludq	$T4,$S1,$M0
2270	vpmuludq	$T4,$S2,$M1
2271	vpmuludq	$T4,$S3,$M2
2272	vpaddq		$M3,$D3,$D3		# d3 += r2'*5*r4
2273	vpaddq		$M4,$D4,$D4		# d4 += r2'*r0
2274	vpaddq		$M0,$D0,$D0		# d0 += r2'*5*r1
2275	vpaddq		$M1,$D1,$D1		# d1 += r2'*5*r2
2276	vpaddq		$M2,$D2,$D2		# d2 += r2'*5*r3
2277
2278	################################################################
2279	# load input
2280	vmovdqu64	16*0($inp),%z#$T3
2281	vmovdqu64	16*4($inp),%z#$T4
2282	lea		16*8($inp),$inp
2283
2284	################################################################
2285	# lazy reduction
2286
2287	vpsrlq		\$26,$D3,$M3
2288	vpandq		$MASK,$D3,$D3
2289	vpaddq		$M3,$D4,$D4		# d3 -> d4
2290
2291	vpsrlq		\$26,$D0,$M0
2292	vpandq		$MASK,$D0,$D0
2293	vpaddq		$M0,$D1,$D1		# d0 -> d1
2294
2295	vpsrlq		\$26,$D4,$M4
2296	vpandq		$MASK,$D4,$D4
2297
2298	vpsrlq		\$26,$D1,$M1
2299	vpandq		$MASK,$D1,$D1
2300	vpaddq		$M1,$D2,$D2		# d1 -> d2
2301
2302	vpaddq		$M4,$D0,$D0
2303	vpsllq		\$2,$M4,$M4
2304	vpaddq		$M4,$D0,$D0		# d4 -> d0
2305
2306	vpsrlq		\$26,$D2,$M2
2307	vpandq		$MASK,$D2,$D2
2308	vpaddq		$M2,$D3,$D3		# d2 -> d3
2309
2310	vpsrlq		\$26,$D0,$M0
2311	vpandq		$MASK,$D0,$D0
2312	vpaddq		$M0,$D1,$D1		# d0 -> d1
2313
2314	vpsrlq		\$26,$D3,$M3
2315	vpandq		$MASK,$D3,$D3
2316	vpaddq		$M3,$D4,$D4		# d3 -> d4
2317
2318	################################################################
2319	# at this point we have 14243444 in $R0-$S4 and 05060708 in
2320	# $D0-$D4, ...
2321
2322	vpunpcklqdq	$T4,$T3,$T0	# transpose input
2323	vpunpckhqdq	$T4,$T3,$T4
2324
2325	# ... since input 64-bit lanes are ordered as 73625140, we could
2326	# "vperm" it to 76543210 (here and in each loop iteration), *or*
2327	# we could just flow along, hence the goal for $R0-$S4 is
2328	# 1858286838784888 ...
2329
2330	vmovdqa32	128(%rcx),$M0		# .Lpermd_avx512:
2331	mov		\$0x7777,%eax
2332	kmovw		%eax,%k1
2333
2334	vpermd		$R0,$M0,$R0		# 14243444 -> 1---2---3---4---
2335	vpermd		$R1,$M0,$R1
2336	vpermd		$R2,$M0,$R2
2337	vpermd		$R3,$M0,$R3
2338	vpermd		$R4,$M0,$R4
2339
2340	vpermd		$D0,$M0,${R0}{%k1}	# 05060708 -> 1858286838784888
2341	vpermd		$D1,$M0,${R1}{%k1}
2342	vpermd		$D2,$M0,${R2}{%k1}
2343	vpermd		$D3,$M0,${R3}{%k1}
2344	vpermd		$D4,$M0,${R4}{%k1}
2345
2346	vpslld		\$2,$R1,$S1		# *5
2347	vpslld		\$2,$R2,$S2
2348	vpslld		\$2,$R3,$S3
2349	vpslld		\$2,$R4,$S4
2350	vpaddd		$R1,$S1,$S1
2351	vpaddd		$R2,$S2,$S2
2352	vpaddd		$R3,$S3,$S3
2353	vpaddd		$R4,$S4,$S4
2354
2355	vpbroadcastq	32(%rcx),$PADBIT	# .L129
2356
2357	vpsrlq		\$52,$T0,$T2		# splat input
2358	vpsllq		\$12,$T4,$T3
2359	vporq		$T3,$T2,$T2
2360	vpsrlq		\$26,$T0,$T1
2361	vpsrlq		\$14,$T4,$T3
2362	vpsrlq		\$40,$T4,$T4		# 4
2363	vpandq		$MASK,$T2,$T2		# 2
2364	vpandq		$MASK,$T0,$T0		# 0
2365	#vpandq		$MASK,$T1,$T1		# 1
2366	#vpandq		$MASK,$T3,$T3		# 3
2367	#vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2368
2369	vpaddq		$H2,$T2,$H2		# accumulate input
2370	sub		\$192,$len
2371	jbe		.Ltail_avx512
2372	jmp		.Loop_avx512
2373
2374.align	32
2375.Loop_avx512:
2376	################################################################
2377	# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2378	# ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2379	# ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2380	# ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2381	# ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2382	# ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2383	# ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2384	# ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2385	#   \________/\___________/
2386	################################################################
2387	#vpaddq		$H2,$T2,$H2		# accumulate input
2388
2389	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
2390	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
2391	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
2392	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
2393	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2394	#
	# however, as h2 is "chronologically" the first one available, pull
	# the corresponding operations up, so it's
2397	#
2398	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
2399	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
2400	# d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
2401	# d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
2402	# d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
2403
2404	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
2405	 vpaddq		$H0,$T0,$H0
2406	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
2407	 vpandq		$MASK,$T1,$T1		# 1
2408	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
2409	 vpandq		$MASK,$T3,$T3		# 3
2410	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
2411	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2412	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
2413	 vpaddq		$H1,$T1,$H1		# accumulate input
2414	 vpaddq		$H3,$T3,$H3
2415	 vpaddq		$H4,$T4,$H4
2416
2417	  vmovdqu64	16*0($inp),$T3		# load input
2418	  vmovdqu64	16*4($inp),$T4
2419	  lea		16*8($inp),$inp
2420	vpmuludq	$H0,$R3,$M3
2421	vpmuludq	$H0,$R4,$M4
2422	vpmuludq	$H0,$R0,$M0
2423	vpmuludq	$H0,$R1,$M1
2424	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
2425	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
2426	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
2427	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
2428
2429	vpmuludq	$H1,$R2,$M3
2430	vpmuludq	$H1,$R3,$M4
2431	vpmuludq	$H1,$S4,$M0
2432	vpmuludq	$H0,$R2,$M2
2433	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
2434	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
2435	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
2436	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
2437
2438	  vpunpcklqdq	$T4,$T3,$T0		# transpose input
2439	  vpunpckhqdq	$T4,$T3,$T4
2440
2441	vpmuludq	$H3,$R0,$M3
2442	vpmuludq	$H3,$R1,$M4
2443	vpmuludq	$H1,$R0,$M1
2444	vpmuludq	$H1,$R1,$M2
2445	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
2446	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
2447	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
2448	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
2449
2450	vpmuludq	$H4,$S4,$M3
2451	vpmuludq	$H4,$R0,$M4
2452	vpmuludq	$H3,$S2,$M0
2453	vpmuludq	$H3,$S3,$M1
2454	vpaddq		$M3,$D3,$D3		# d3 += h4*s4
2455	vpmuludq	$H3,$S4,$M2
2456	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
2457	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
2458	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
2459	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
2460
2461	vpmuludq	$H4,$S1,$M0
2462	vpmuludq	$H4,$S2,$M1
2463	vpmuludq	$H4,$S3,$M2
2464	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2467
2468	################################################################
2469	# lazy reduction (interleaved with input splat)
2470
2471	 vpsrlq		\$52,$T0,$T2		# splat input
2472	 vpsllq		\$12,$T4,$T3
2473
2474	vpsrlq		\$26,$D3,$H3
2475	vpandq		$MASK,$D3,$D3
2476	vpaddq		$H3,$D4,$H4		# h3 -> h4
2477
2478	 vporq		$T3,$T2,$T2
2479
2480	vpsrlq		\$26,$H0,$D0
2481	vpandq		$MASK,$H0,$H0
2482	vpaddq		$D0,$H1,$H1		# h0 -> h1
2483
2484	 vpandq		$MASK,$T2,$T2		# 2
2485
2486	vpsrlq		\$26,$H4,$D4
2487	vpandq		$MASK,$H4,$H4
2488
2489	vpsrlq		\$26,$H1,$D1
2490	vpandq		$MASK,$H1,$H1
2491	vpaddq		$D1,$H2,$H2		# h1 -> h2
2492
2493	vpaddq		$D4,$H0,$H0
2494	vpsllq		\$2,$D4,$D4
2495	vpaddq		$D4,$H0,$H0		# h4 -> h0
2496
2497	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
2498	 vpsrlq		\$26,$T0,$T1
2499
2500	vpsrlq		\$26,$H2,$D2
2501	vpandq		$MASK,$H2,$H2
2502	vpaddq		$D2,$D3,$H3		# h2 -> h3
2503
2504	 vpsrlq		\$14,$T4,$T3
2505
2506	vpsrlq		\$26,$H0,$D0
2507	vpandq		$MASK,$H0,$H0
2508	vpaddq		$D0,$H1,$H1		# h0 -> h1
2509
2510	 vpsrlq		\$40,$T4,$T4		# 4
2511
2512	vpsrlq		\$26,$H3,$D3
2513	vpandq		$MASK,$H3,$H3
2514	vpaddq		$D3,$H4,$H4		# h3 -> h4
2515
2516	 vpandq		$MASK,$T0,$T0		# 0
2517	 #vpandq	$MASK,$T1,$T1		# 1
2518	 #vpandq	$MASK,$T3,$T3		# 3
2519	 #vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2520
2521	sub		\$128,$len
2522	ja		.Loop_avx512
2523
2524.Ltail_avx512:
2525	################################################################
	# while the multiplications above were by r^8 in all lanes, in the
	# last iteration we multiply the least significant lane by r^8 and
	# the most significant one by r, which is why the table gets shifted...
2529
2530	vpsrlq		\$32,$R0,$R0		# 0105020603070408
2531	vpsrlq		\$32,$R1,$R1
2532	vpsrlq		\$32,$R2,$R2
2533	vpsrlq		\$32,$S3,$S3
2534	vpsrlq		\$32,$S4,$S4
2535	vpsrlq		\$32,$R3,$R3
2536	vpsrlq		\$32,$R4,$R4
2537	vpsrlq		\$32,$S1,$S1
2538	vpsrlq		\$32,$S2,$S2
2539
2540	################################################################
	# load either the next or the last 64 bytes of input
2542	lea		($inp,$len),$inp
2543
2544	#vpaddq		$H2,$T2,$H2		# accumulate input
2545	vpaddq		$H0,$T0,$H0
2546
2547	vpmuludq	$H2,$R1,$D3		# d3 = h2*r1
2548	vpmuludq	$H2,$R2,$D4		# d4 = h2*r2
2549	vpmuludq	$H2,$S3,$D0		# d0 = h2*s3
2550	 vpandq		$MASK,$T1,$T1		# 1
2551	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
2552	 vpandq		$MASK,$T3,$T3		# 3
2553	vpmuludq	$H2,$R0,$D2		# d2 = h2*r0
2554	 vporq		$PADBIT,$T4,$T4		# padbit, yes, always
2555	 vpaddq		$H1,$T1,$H1		# accumulate input
2556	 vpaddq		$H3,$T3,$H3
2557	 vpaddq		$H4,$T4,$H4
2558
2559	  vmovdqu	16*0($inp),%x#$T0
2560	vpmuludq	$H0,$R3,$M3
2561	vpmuludq	$H0,$R4,$M4
2562	vpmuludq	$H0,$R0,$M0
2563	vpmuludq	$H0,$R1,$M1
2564	vpaddq		$M3,$D3,$D3		# d3 += h0*r3
2565	vpaddq		$M4,$D4,$D4		# d4 += h0*r4
2566	vpaddq		$M0,$D0,$D0		# d0 += h0*r0
2567	vpaddq		$M1,$D1,$D1		# d1 += h0*r1
2568
2569	  vmovdqu	16*1($inp),%x#$T1
2570	vpmuludq	$H1,$R2,$M3
2571	vpmuludq	$H1,$R3,$M4
2572	vpmuludq	$H1,$S4,$M0
2573	vpmuludq	$H0,$R2,$M2
2574	vpaddq		$M3,$D3,$D3		# d3 += h1*r2
2575	vpaddq		$M4,$D4,$D4		# d4 += h1*r3
2576	vpaddq		$M0,$D0,$D0		# d0 += h1*s4
2577	vpaddq		$M2,$D2,$D2		# d2 += h0*r2
2578
2579	  vinserti128	\$1,16*2($inp),%y#$T0,%y#$T0
2580	vpmuludq	$H3,$R0,$M3
2581	vpmuludq	$H3,$R1,$M4
2582	vpmuludq	$H1,$R0,$M1
2583	vpmuludq	$H1,$R1,$M2
2584	vpaddq		$M3,$D3,$D3		# d3 += h3*r0
2585	vpaddq		$M4,$D4,$D4		# d4 += h3*r1
2586	vpaddq		$M1,$D1,$D1		# d1 += h1*r0
2587	vpaddq		$M2,$D2,$D2		# d2 += h1*r1
2588
2589	  vinserti128	\$1,16*3($inp),%y#$T1,%y#$T1
2590	vpmuludq	$H4,$S4,$M3
2591	vpmuludq	$H4,$R0,$M4
2592	vpmuludq	$H3,$S2,$M0
2593	vpmuludq	$H3,$S3,$M1
2594	vpmuludq	$H3,$S4,$M2
2595	vpaddq		$M3,$D3,$H3		# h3 = d3 + h4*s4
2596	vpaddq		$M4,$D4,$D4		# d4 += h4*r0
2597	vpaddq		$M0,$D0,$D0		# d0 += h3*s2
2598	vpaddq		$M1,$D1,$D1		# d1 += h3*s3
2599	vpaddq		$M2,$D2,$D2		# d2 += h3*s4
2600
2601	vpmuludq	$H4,$S1,$M0
2602	vpmuludq	$H4,$S2,$M1
2603	vpmuludq	$H4,$S3,$M2
2604	vpaddq		$M0,$D0,$H0		# h0 = d0 + h4*s1
	vpaddq		$M1,$D1,$H1		# h1 = d1 + h4*s2
	vpaddq		$M2,$D2,$H2		# h2 = d2 + h4*s3
2607
2608	################################################################
2609	# horizontal addition
2610
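	# fold the eight 64-bit lanes into lane 0: swap neighbouring qwords
	# (0xb1) and add, pull qword 2 of each 256-bit half into qword 0
	# (0x2) and add, then add the upper 256 bits; %k3=1 keeps only the
	# lowest qword of each result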
2611	mov		\$1,%eax
2612	vpermq		\$0xb1,$H3,$D3
2613	vpermq		\$0xb1,$D4,$H4
2614	vpermq		\$0xb1,$H0,$D0
2615	vpermq		\$0xb1,$H1,$D1
2616	vpermq		\$0xb1,$H2,$D2
2617	vpaddq		$D3,$H3,$H3
2618	vpaddq		$D4,$H4,$H4
2619	vpaddq		$D0,$H0,$H0
2620	vpaddq		$D1,$H1,$H1
2621	vpaddq		$D2,$H2,$H2
2622
2623	kmovw		%eax,%k3
2624	vpermq		\$0x2,$H3,$D3
2625	vpermq		\$0x2,$H4,$D4
2626	vpermq		\$0x2,$H0,$D0
2627	vpermq		\$0x2,$H1,$D1
2628	vpermq		\$0x2,$H2,$D2
2629	vpaddq		$D3,$H3,$H3
2630	vpaddq		$D4,$H4,$H4
2631	vpaddq		$D0,$H0,$H0
2632	vpaddq		$D1,$H1,$H1
2633	vpaddq		$D2,$H2,$H2
2634
2635	vextracti64x4	\$0x1,$H3,%y#$D3
2636	vextracti64x4	\$0x1,$H4,%y#$D4
2637	vextracti64x4	\$0x1,$H0,%y#$D0
2638	vextracti64x4	\$0x1,$H1,%y#$D1
2639	vextracti64x4	\$0x1,$H2,%y#$D2
2640	vpaddq		$D3,$H3,${H3}{%k3}{z}	# keep single qword in case
2641	vpaddq		$D4,$H4,${H4}{%k3}{z}	# it's passed to .Ltail_avx2
2642	vpaddq		$D0,$H0,${H0}{%k3}{z}
2643	vpaddq		$D1,$H1,${H1}{%k3}{z}
2644	vpaddq		$D2,$H2,${H2}{%k3}{z}
2645___
2646map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2647map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2648$code.=<<___;
2649	################################################################
2650	# lazy reduction (interleaved with input splat)
2651
2652	vpsrlq		\$26,$H3,$D3
2653	vpand		$MASK,$H3,$H3
2654	 vpsrldq	\$6,$T0,$T2		# splat input
2655	 vpsrldq	\$6,$T1,$T3
2656	 vpunpckhqdq	$T1,$T0,$T4		# 4
2657	vpaddq		$D3,$H4,$H4		# h3 -> h4
2658
2659	vpsrlq		\$26,$H0,$D0
2660	vpand		$MASK,$H0,$H0
2661	 vpunpcklqdq	$T3,$T2,$T2		# 2:3
2662	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
2663	vpaddq		$D0,$H1,$H1		# h0 -> h1
2664
2665	vpsrlq		\$26,$H4,$D4
2666	vpand		$MASK,$H4,$H4
2667
2668	vpsrlq		\$26,$H1,$D1
2669	vpand		$MASK,$H1,$H1
2670	 vpsrlq		\$30,$T2,$T3
2671	 vpsrlq		\$4,$T2,$T2
2672	vpaddq		$D1,$H2,$H2		# h1 -> h2
2673
2674	vpaddq		$D4,$H0,$H0
2675	vpsllq		\$2,$D4,$D4
2676	 vpsrlq		\$26,$T0,$T1
2677	 vpsrlq		\$40,$T4,$T4		# 4
2678	vpaddq		$D4,$H0,$H0		# h4 -> h0
2679
2680	vpsrlq		\$26,$H2,$D2
2681	vpand		$MASK,$H2,$H2
2682	 vpand		$MASK,$T2,$T2		# 2
2683	 vpand		$MASK,$T0,$T0		# 0
2684	vpaddq		$D2,$H3,$H3		# h2 -> h3
2685
2686	vpsrlq		\$26,$H0,$D0
2687	vpand		$MASK,$H0,$H0
2688	 vpaddq		$H2,$T2,$H2		# accumulate input for .Ltail_avx2
2689	 vpand		$MASK,$T1,$T1		# 1
2690	vpaddq		$D0,$H1,$H1		# h0 -> h1
2691
2692	vpsrlq		\$26,$H3,$D3
2693	vpand		$MASK,$H3,$H3
2694	 vpand		$MASK,$T3,$T3		# 3
2695	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
2696	vpaddq		$D3,$H4,$H4		# h3 -> h4
2697
2698	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
2699	add		\$64,$len
2700	jnz		.Ltail_avx2
2701
2702	vpsubq		$T2,$H2,$H2		# undo input accumulation
2703	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
2704	vmovd		%x#$H1,`4*1-48-64`($ctx)
2705	vmovd		%x#$H2,`4*2-48-64`($ctx)
2706	vmovd		%x#$H3,`4*3-48-64`($ctx)
2707	vmovd		%x#$H4,`4*4-48-64`($ctx)
2708	vzeroall
2709___
2710$code.=<<___	if ($win64);
2711	movdqa		0x50(%r11),%xmm6
2712	movdqa		0x60(%r11),%xmm7
2713	movdqa		0x70(%r11),%xmm8
2714	movdqa		0x80(%r11),%xmm9
2715	movdqa		0x90(%r11),%xmm10
2716	movdqa		0xa0(%r11),%xmm11
2717	movdqa		0xb0(%r11),%xmm12
2718	movdqa		0xc0(%r11),%xmm13
2719	movdqa		0xd0(%r11),%xmm14
2720	movdqa		0xe0(%r11),%xmm15
2721	lea		0xf8(%r11),%rsp
2722.Ldo_avx512_epilogue:
2723___
2724$code.=<<___	if (!$win64);
2725	lea		8(%r11),%rsp
2726.cfi_def_cfa		%rsp,8
2727___
2728$code.=<<___;
2729	ret
2730.cfi_endproc
2731.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
2732___
2733if ($avx>3 && !$win64) {
2734########################################################################
2735# VPMADD52 version using 2^44 radix.
2736#
# One can argue that base 2^52 would be more natural. Well, even though
# some operations would be more natural, one has to recognize a couple of
# things. First, base 2^52 doesn't provide an advantage over base 2^44 if
# you look at the amount of multiply-n-accumulate operations. Secondly, it
# makes it impossible to pre-compute multiples of 5 [referred to as s[]/sN
# in reference implementations], which means that more such operations
# would have to be performed in the inner loop, which in turn makes the
# critical path longer. In other words, even though base 2^44 reduction
# might look less elegant, the overall critical path is actually shorter...
2746
2747########################################################################
# The layout of the opaque area is as follows.
2749#
2750#	unsigned __int64 h[3];		# current hash value base 2^44
2751#	unsigned __int64 s[2];		# key value*20 base 2^44
2752#	unsigned __int64 r[3];		# key value base 2^44
2753#	struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2754#					# r^n positions reflect
2755#					# placement in register, not
2756#					# memory, R[3] is R[1]*20
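#
# For reference, the byte offsets used by the code below are: h[] at
# 0/8/16($ctx), s[] at 24/32($ctx), r[] at 40/48/56($ctx), and R[] from
# 64($ctx) (R[0] at 64, R[1] at 96, R[2] at 128, R[3]=R[1]*20 at 160).
# 64($ctx) doubles as the "are the key powers computed yet?" flag:
# poly1305_init_base2_44 writes -1 there, and the blocks routines test
# its sign before touching R[].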
2757
2758$code.=<<___;
2759.type	poly1305_init_base2_44,\@function,3
2760.align	32
2761poly1305_init_base2_44:
2762.cfi_startproc
2763	xor	%rax,%rax
2764	mov	%rax,0($ctx)		# initialize hash value
2765	mov	%rax,8($ctx)
2766	mov	%rax,16($ctx)
2767
2768.Linit_base2_44:
2769	lea	poly1305_blocks_vpmadd52(%rip),%r10
2770	lea	poly1305_emit_base2_44(%rip),%r11
2771
2772	mov	\$0x0ffffffc0fffffff,%rax
2773	mov	\$0x0ffffffc0ffffffc,%rcx
2774	and	0($inp),%rax
2775	mov	\$0x00000fffffffffff,%r8
2776	and	8($inp),%rcx
2777	mov	\$0x00000fffffffffff,%r9
2778	and	%rax,%r8
2779	shrd	\$44,%rcx,%rax
2780	mov	%r8,40($ctx)		# r0
2781	and	%r9,%rax
2782	shr	\$24,%rcx
2783	mov	%rax,48($ctx)		# r1
2784	lea	(%rax,%rax,4),%rax	# *5
2785	mov	%rcx,56($ctx)		# r2
2786	shl	\$2,%rax		# magic <<2
2787	lea	(%rcx,%rcx,4),%rcx	# *5
2788	shl	\$2,%rcx		# magic <<2
2789	mov	%rax,24($ctx)		# s1
2790	mov	%rcx,32($ctx)		# s2
2791	movq	\$-1,64($ctx)		# write impossible value
2792___
2793$code.=<<___	if ($flavour !~ /elf32/);
2794	mov	%r10,0(%rdx)
2795	mov	%r11,8(%rdx)
2796___
2797$code.=<<___	if ($flavour =~ /elf32/);
2798	mov	%r10d,0(%rdx)
2799	mov	%r11d,4(%rdx)
2800___
2801$code.=<<___;
2802	mov	\$1,%eax
2803	ret
2804.cfi_endproc
2805.size	poly1305_init_base2_44,.-poly1305_init_base2_44
2806___
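
# For reference, the clamp-and-split performed by poly1305_init_base2_44
# above can be expressed in plain Perl as in the hedged sketch below. It
# is never called by this generator and the subroutine name is
# illustrative. s1/s2 are r1/r2 scaled by 20 because, with limbs at
# weights 2^0/2^44/2^88, products whose weight reaches 2^132 wrap around
# multiplied by 2^132 mod (2^130-5) = 20.
sub poly1305_key_base2_44_ref {
	my ($lo, $hi) = @_;			# two little-endian 64-bit key words
	$lo &= 0x0ffffffc0fffffff;		# clamp r, as in poly1305_init
	$hi &= 0x0ffffffc0ffffffc;
	my $r0 = $lo & 0xfffffffffff;					# bits  0..43
	my $r1 = ((($hi & 0xffffff)<<20) | ($lo>>44)) & 0xfffffffffff;	# bits 44..87
	my $r2 = $hi>>24;						# remaining high bits
	return ($r0, $r1, $r2, 20*$r1, 20*$r2);	# r0, r1, r2, s1, s2
}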
2807{
2808my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2809my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2810my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2811
2812$code.=<<___;
2813.type	poly1305_blocks_vpmadd52,\@function,4
2814.align	32
2815poly1305_blocks_vpmadd52:
2816.cfi_startproc
2817	endbranch
2818	shr	\$4,$len
2819	jz	.Lno_data_vpmadd52		# too short
2820
2821	shl	\$40,$padbit
2822	mov	64($ctx),%r8			# peek on power of the key
2823
	# if the powers of the key are not calculated yet, process up to 3
	# blocks with this single-block subroutine, otherwise ensure that
	# the length is divisible by 2 blocks and pass the rest down to the
	# next subroutine...
2828
2829	mov	\$3,%rax
2830	mov	\$1,%r10
2831	cmp	\$4,$len			# is input long
2832	cmovae	%r10,%rax
2833	test	%r8,%r8				# is power value impossible?
2834	cmovns	%r10,%rax
2835
2836	and	$len,%rax			# is input of favourable length?
2837	jz	.Lblocks_vpmadd52_4x
2838
2839	sub		%rax,$len
2840	mov		\$7,%r10d
2841	mov		\$1,%r11d
2842	kmovw		%r10d,%k7
2843	lea		.L2_44_inp_permd(%rip),%r10
2844	kmovw		%r11d,%k1
2845
2846	vmovq		$padbit,%x#$PAD
2847	vmovdqa64	0(%r10),$inp_permd	# .L2_44_inp_permd
2848	vmovdqa64	32(%r10),$inp_shift	# .L2_44_inp_shift
2849	vpermq		\$0xcf,$PAD,$PAD
2850	vmovdqa64	64(%r10),$reduc_mask	# .L2_44_mask
2851
2852	vmovdqu64	0($ctx),${Dlo}{%k7}{z}		# load hash value
2853	vmovdqu64	40($ctx),${r2r1r0}{%k7}{z}	# load keys
2854	vmovdqu64	32($ctx),${r1r0s2}{%k7}{z}
2855	vmovdqu64	24($ctx),${r0s2s1}{%k7}{z}
2856
2857	vmovdqa64	96(%r10),$reduc_rght	# .L2_44_shift_rgt
2858	vmovdqa64	128(%r10),$reduc_left	# .L2_44_shift_lft
2859
2860	jmp		.Loop_vpmadd52
2861
2862.align	32
2863.Loop_vpmadd52:
2864	vmovdqu32	0($inp),%x#$T0		# load input as ----3210
2865	lea		16($inp),$inp
2866
2867	vpermd		$T0,$inp_permd,$T0	# ----3210 -> --322110
2868	vpsrlvq		$inp_shift,$T0,$T0
2869	vpandq		$reduc_mask,$T0,$T0
2870	vporq		$PAD,$T0,$T0
2871
2872	vpaddq		$T0,$Dlo,$Dlo		# accumulate input
2873
2874	vpermq		\$0,$Dlo,${H0}{%k7}{z}	# smash hash value
2875	vpermq		\$0b01010101,$Dlo,${H1}{%k7}{z}
2876	vpermq		\$0b10101010,$Dlo,${H2}{%k7}{z}
2877
2878	vpxord		$Dlo,$Dlo,$Dlo
2879	vpxord		$Dhi,$Dhi,$Dhi
2880
2881	vpmadd52luq	$r2r1r0,$H0,$Dlo
2882	vpmadd52huq	$r2r1r0,$H0,$Dhi
2883
2884	vpmadd52luq	$r1r0s2,$H1,$Dlo
2885	vpmadd52huq	$r1r0s2,$H1,$Dhi
2886
2887	vpmadd52luq	$r0s2s1,$H2,$Dlo
2888	vpmadd52huq	$r0s2s1,$H2,$Dhi
2889
2890	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost qword
2891	vpsllvq		$reduc_left,$Dhi,$Dhi	# 0 in topmost qword
2892	vpandq		$reduc_mask,$Dlo,$Dlo
2893
2894	vpaddq		$T0,$Dhi,$Dhi
2895
2896	vpermq		\$0b10010011,$Dhi,$Dhi	# 0 in lowest qword
2897
2898	vpaddq		$Dhi,$Dlo,$Dlo		# note topmost qword :-)
2899
2900	vpsrlvq		$reduc_rght,$Dlo,$T0	# 0 in topmost word
2901	vpandq		$reduc_mask,$Dlo,$Dlo
2902
2903	vpermq		\$0b10010011,$T0,$T0
2904
2905	vpaddq		$T0,$Dlo,$Dlo
2906
2907	vpermq		\$0b10010011,$Dlo,${T0}{%k1}{z}
2908
2909	vpaddq		$T0,$Dlo,$Dlo
2910	vpsllq		\$2,$T0,$T0
2911
2912	vpaddq		$T0,$Dlo,$Dlo
2913
2914	dec		%rax			# len-=16
2915	jnz		.Loop_vpmadd52
2916
2917	vmovdqu64	$Dlo,0($ctx){%k7}	# store hash value
2918
2919	test		$len,$len
2920	jnz		.Lblocks_vpmadd52_4x
2921
2922.Lno_data_vpmadd52:
2923	ret
2924.cfi_endproc
2925.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2926___
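
# For reference, the net effect of the .L2_44_inp_permd/.L2_44_inp_shift
# splat above (and of the equivalent shift/mask sequence in the 4x/8x
# paths) on one 16-byte block is sketched below in plain Perl. It is a
# hedged reference only, never called; the subroutine name is
# illustrative.
sub poly1305_splat_base2_44_ref {
	my ($lo, $hi, $padbit) = @_;		# block as two 64-bit words, padbit 0 or 1
	my $t0 = $lo & 0xfffffffffff;					# bits  0..43
	my $t1 = ((($hi & 0xffffff)<<20) | ($lo>>44)) & 0xfffffffffff;	# bits 44..87
	my $t2 = ($hi>>24) | ($padbit<<40);				# bits 88..127 plus the 2^128 pad bit
	return ($t0, $t1, $t2);
}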
2927}
2928{
2929########################################################################
# As implied by its name, the 4x subroutine processes 4 blocks in parallel
# (but also handles lengths of 4*n+2 blocks). It takes up to the 4th power
# of the key and operates on 256-bit %ymm registers.
2933
2934my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2935my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2936my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2937
2938$code.=<<___;
2939.type	poly1305_blocks_vpmadd52_4x,\@function,4
2940.align	32
2941poly1305_blocks_vpmadd52_4x:
2942.cfi_startproc
2943	shr	\$4,$len
2944	jz	.Lno_data_vpmadd52_4x		# too short
2945
2946	shl	\$40,$padbit
2947	mov	64($ctx),%r8			# peek on power of the key
2948
2949.Lblocks_vpmadd52_4x:
2950	vpbroadcastq	$padbit,$PAD
2951
2952	vmovdqa64	.Lx_mask44(%rip),$mask44
2953	mov		\$5,%eax
2954	vmovdqa64	.Lx_mask42(%rip),$mask42
2955	kmovw		%eax,%k1		# used in 2x path
2956
2957	test		%r8,%r8			# is power value impossible?
2958	js		.Linit_vpmadd52		# if it is, then init R[4]
2959
2960	vmovq		0($ctx),%x#$H0		# load current hash value
2961	vmovq		8($ctx),%x#$H1
2962	vmovq		16($ctx),%x#$H2
2963
2964	test		\$3,$len		# is length 4*n+2?
2965	jnz		.Lblocks_vpmadd52_2x_do
2966
2967.Lblocks_vpmadd52_4x_do:
2968	vpbroadcastq	64($ctx),$R0		# load 4th power of the key
2969	vpbroadcastq	96($ctx),$R1
2970	vpbroadcastq	128($ctx),$R2
2971	vpbroadcastq	160($ctx),$S1
2972
2973.Lblocks_vpmadd52_4x_key_loaded:
2974	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
2975	vpaddq		$R2,$S2,$S2
2976	vpsllq		\$2,$S2,$S2
2977
2978	test		\$7,$len		# is len 8*n?
2979	jz		.Lblocks_vpmadd52_8x
2980
2981	vmovdqu64	16*0($inp),$T2		# load data
2982	vmovdqu64	16*2($inp),$T3
2983	lea		16*4($inp),$inp
2984
2985	vpunpcklqdq	$T3,$T2,$T1		# transpose data
2986	vpunpckhqdq	$T3,$T2,$T3
2987
2988	# at this point 64-bit lanes are ordered as 3-1-2-0
2989
2990	vpsrlq		\$24,$T3,$T2		# splat the data
2991	vporq		$PAD,$T2,$T2
2992	 vpaddq		$T2,$H2,$H2		# accumulate input
2993	vpandq		$mask44,$T1,$T0
2994	vpsrlq		\$44,$T1,$T1
2995	vpsllq		\$20,$T3,$T3
2996	vporq		$T3,$T1,$T1
2997	vpandq		$mask44,$T1,$T1
2998
2999	sub		\$4,$len
3000	jz		.Ltail_vpmadd52_4x
3001	jmp		.Loop_vpmadd52_4x
3002	ud2
3003
3004.align	32
3005.Linit_vpmadd52:
3006	vmovq		24($ctx),%x#$S1		# load key
3007	vmovq		56($ctx),%x#$H2
3008	vmovq		32($ctx),%x#$S2
3009	vmovq		40($ctx),%x#$R0
3010	vmovq		48($ctx),%x#$R1
3011
3012	vmovdqa		$R0,$H0
3013	vmovdqa		$R1,$H1
3014	vmovdqa		$H2,$R2
3015
3016	mov		\$2,%eax
3017
3018.Lmul_init_vpmadd52:
3019	vpxorq		$D0lo,$D0lo,$D0lo
3020	vpmadd52luq	$H2,$S1,$D0lo
3021	vpxorq		$D0hi,$D0hi,$D0hi
3022	vpmadd52huq	$H2,$S1,$D0hi
3023	vpxorq		$D1lo,$D1lo,$D1lo
3024	vpmadd52luq	$H2,$S2,$D1lo
3025	vpxorq		$D1hi,$D1hi,$D1hi
3026	vpmadd52huq	$H2,$S2,$D1hi
3027	vpxorq		$D2lo,$D2lo,$D2lo
3028	vpmadd52luq	$H2,$R0,$D2lo
3029	vpxorq		$D2hi,$D2hi,$D2hi
3030	vpmadd52huq	$H2,$R0,$D2hi
3031
3032	vpmadd52luq	$H0,$R0,$D0lo
3033	vpmadd52huq	$H0,$R0,$D0hi
3034	vpmadd52luq	$H0,$R1,$D1lo
3035	vpmadd52huq	$H0,$R1,$D1hi
3036	vpmadd52luq	$H0,$R2,$D2lo
3037	vpmadd52huq	$H0,$R2,$D2hi
3038
3039	vpmadd52luq	$H1,$S2,$D0lo
3040	vpmadd52huq	$H1,$S2,$D0hi
3041	vpmadd52luq	$H1,$R0,$D1lo
3042	vpmadd52huq	$H1,$R0,$D1hi
3043	vpmadd52luq	$H1,$R1,$D2lo
3044	vpmadd52huq	$H1,$R1,$D2hi
3045
3046	################################################################
3047	# partial reduction
3048	vpsrlq		\$44,$D0lo,$tmp
3049	vpsllq		\$8,$D0hi,$D0hi
3050	vpandq		$mask44,$D0lo,$H0
3051	vpaddq		$tmp,$D0hi,$D0hi
3052
3053	vpaddq		$D0hi,$D1lo,$D1lo
3054
3055	vpsrlq		\$44,$D1lo,$tmp
3056	vpsllq		\$8,$D1hi,$D1hi
3057	vpandq		$mask44,$D1lo,$H1
3058	vpaddq		$tmp,$D1hi,$D1hi
3059
3060	vpaddq		$D1hi,$D2lo,$D2lo
3061
3062	vpsrlq		\$42,$D2lo,$tmp
3063	vpsllq		\$10,$D2hi,$D2hi
3064	vpandq		$mask42,$D2lo,$H2
3065	vpaddq		$tmp,$D2hi,$D2hi
3066
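	# the carry out of the 42-bit top limb wraps around multiplied by 5,
	# since 2^130 = 5 (mod 2^130-5): the plain add plus the <<2 add
	# below amount to H0 += 5*D2hi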
3067	vpaddq		$D2hi,$H0,$H0
3068	vpsllq		\$2,$D2hi,$D2hi
3069
3070	vpaddq		$D2hi,$H0,$H0
3071
3072	vpsrlq		\$44,$H0,$tmp		# additional step
3073	vpandq		$mask44,$H0,$H0
3074
3075	vpaddq		$tmp,$H1,$H1
3076
3077	dec		%eax
3078	jz		.Ldone_init_vpmadd52
3079
3080	vpunpcklqdq	$R1,$H1,$R1		# 1,2
3081	vpbroadcastq	%x#$H1,%x#$H1		# 2,2
3082	vpunpcklqdq	$R2,$H2,$R2
3083	vpbroadcastq	%x#$H2,%x#$H2
3084	vpunpcklqdq	$R0,$H0,$R0
3085	vpbroadcastq	%x#$H0,%x#$H0
3086
3087	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
3088	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
3089	vpaddq		$R1,$S1,$S1
3090	vpaddq		$R2,$S2,$S2
3091	vpsllq		\$2,$S1,$S1
3092	vpsllq		\$2,$S2,$S2
3093
3094	jmp		.Lmul_init_vpmadd52
3095	ud2
3096
3097.align	32
3098.Ldone_init_vpmadd52:
3099	vinserti128	\$1,%x#$R1,$H1,$R1	# 1,2,3,4
3100	vinserti128	\$1,%x#$R2,$H2,$R2
3101	vinserti128	\$1,%x#$R0,$H0,$R0
3102
3103	vpermq		\$0b11011000,$R1,$R1	# 1,3,2,4
3104	vpermq		\$0b11011000,$R2,$R2
3105	vpermq		\$0b11011000,$R0,$R0
3106
3107	vpsllq		\$2,$R1,$S1		# S1 = R1*5*4
3108	vpaddq		$R1,$S1,$S1
3109	vpsllq		\$2,$S1,$S1
3110
3111	vmovq		0($ctx),%x#$H0		# load current hash value
3112	vmovq		8($ctx),%x#$H1
3113	vmovq		16($ctx),%x#$H2
3114
3115	test		\$3,$len		# is length 4*n+2?
3116	jnz		.Ldone_init_vpmadd52_2x
3117
3118	vmovdqu64	$R0,64($ctx)		# save key powers
3119	vpbroadcastq	%x#$R0,$R0		# broadcast 4th power
3120	vmovdqu64	$R1,96($ctx)
3121	vpbroadcastq	%x#$R1,$R1
3122	vmovdqu64	$R2,128($ctx)
3123	vpbroadcastq	%x#$R2,$R2
3124	vmovdqu64	$S1,160($ctx)
3125	vpbroadcastq	%x#$S1,$S1
3126
3127	jmp		.Lblocks_vpmadd52_4x_key_loaded
3128	ud2
3129
3130.align	32
3131.Ldone_init_vpmadd52_2x:
3132	vmovdqu64	$R0,64($ctx)		# save key powers
3133	vpsrldq		\$8,$R0,$R0		# 0-1-0-2
3134	vmovdqu64	$R1,96($ctx)
3135	vpsrldq		\$8,$R1,$R1
3136	vmovdqu64	$R2,128($ctx)
3137	vpsrldq		\$8,$R2,$R2
3138	vmovdqu64	$S1,160($ctx)
3139	vpsrldq		\$8,$S1,$S1
3140	jmp		.Lblocks_vpmadd52_2x_key_loaded
3141	ud2
3142
3143.align	32
3144.Lblocks_vpmadd52_2x_do:
3145	vmovdqu64	128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3146	vmovdqu64	160+8($ctx),${S1}{%k1}{z}
3147	vmovdqu64	64+8($ctx),${R0}{%k1}{z}
3148	vmovdqu64	96+8($ctx),${R1}{%k1}{z}
3149
3150.Lblocks_vpmadd52_2x_key_loaded:
3151	vmovdqu64	16*0($inp),$T2		# load data
3152	vpxorq		$T3,$T3,$T3
3153	lea		16*2($inp),$inp
3154
3155	vpunpcklqdq	$T3,$T2,$T1		# transpose data
3156	vpunpckhqdq	$T3,$T2,$T3
3157
3158	# at this point 64-bit lanes are ordered as x-1-x-0
3159
3160	vpsrlq		\$24,$T3,$T2		# splat the data
3161	vporq		$PAD,$T2,$T2
3162	 vpaddq		$T2,$H2,$H2		# accumulate input
3163	vpandq		$mask44,$T1,$T0
3164	vpsrlq		\$44,$T1,$T1
3165	vpsllq		\$20,$T3,$T3
3166	vporq		$T3,$T1,$T1
3167	vpandq		$mask44,$T1,$T1
3168
3169	jmp		.Ltail_vpmadd52_2x
3170	ud2
3171
3172.align	32
3173.Loop_vpmadd52_4x:
3174	#vpaddq		$T2,$H2,$H2		# accumulate input
3175	vpaddq		$T0,$H0,$H0
3176	vpaddq		$T1,$H1,$H1
3177
3178	vpxorq		$D0lo,$D0lo,$D0lo
3179	vpmadd52luq	$H2,$S1,$D0lo
3180	vpxorq		$D0hi,$D0hi,$D0hi
3181	vpmadd52huq	$H2,$S1,$D0hi
3182	vpxorq		$D1lo,$D1lo,$D1lo
3183	vpmadd52luq	$H2,$S2,$D1lo
3184	vpxorq		$D1hi,$D1hi,$D1hi
3185	vpmadd52huq	$H2,$S2,$D1hi
3186	vpxorq		$D2lo,$D2lo,$D2lo
3187	vpmadd52luq	$H2,$R0,$D2lo
3188	vpxorq		$D2hi,$D2hi,$D2hi
3189	vpmadd52huq	$H2,$R0,$D2hi
3190
3191	 vmovdqu64	16*0($inp),$T2		# load data
3192	 vmovdqu64	16*2($inp),$T3
3193	 lea		16*4($inp),$inp
3194	vpmadd52luq	$H0,$R0,$D0lo
3195	vpmadd52huq	$H0,$R0,$D0hi
3196	vpmadd52luq	$H0,$R1,$D1lo
3197	vpmadd52huq	$H0,$R1,$D1hi
3198	vpmadd52luq	$H0,$R2,$D2lo
3199	vpmadd52huq	$H0,$R2,$D2hi
3200
3201	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
3202	 vpunpckhqdq	$T3,$T2,$T3
3203	vpmadd52luq	$H1,$S2,$D0lo
3204	vpmadd52huq	$H1,$S2,$D0hi
3205	vpmadd52luq	$H1,$R0,$D1lo
3206	vpmadd52huq	$H1,$R0,$D1hi
3207	vpmadd52luq	$H1,$R1,$D2lo
3208	vpmadd52huq	$H1,$R1,$D2hi
3209
3210	################################################################
3211	# partial reduction (interleaved with data splat)
3212	vpsrlq		\$44,$D0lo,$tmp
3213	vpsllq		\$8,$D0hi,$D0hi
3214	vpandq		$mask44,$D0lo,$H0
3215	vpaddq		$tmp,$D0hi,$D0hi
3216
3217	 vpsrlq		\$24,$T3,$T2
3218	 vporq		$PAD,$T2,$T2
3219	vpaddq		$D0hi,$D1lo,$D1lo
3220
3221	vpsrlq		\$44,$D1lo,$tmp
3222	vpsllq		\$8,$D1hi,$D1hi
3223	vpandq		$mask44,$D1lo,$H1
3224	vpaddq		$tmp,$D1hi,$D1hi
3225
3226	 vpandq		$mask44,$T1,$T0
3227	 vpsrlq		\$44,$T1,$T1
3228	 vpsllq		\$20,$T3,$T3
3229	vpaddq		$D1hi,$D2lo,$D2lo
3230
3231	vpsrlq		\$42,$D2lo,$tmp
3232	vpsllq		\$10,$D2hi,$D2hi
3233	vpandq		$mask42,$D2lo,$H2
3234	vpaddq		$tmp,$D2hi,$D2hi
3235
3236	  vpaddq	$T2,$H2,$H2		# accumulate input
3237	vpaddq		$D2hi,$H0,$H0
3238	vpsllq		\$2,$D2hi,$D2hi
3239
3240	vpaddq		$D2hi,$H0,$H0
3241	 vporq		$T3,$T1,$T1
3242	 vpandq		$mask44,$T1,$T1
3243
3244	vpsrlq		\$44,$H0,$tmp		# additional step
3245	vpandq		$mask44,$H0,$H0
3246
3247	vpaddq		$tmp,$H1,$H1
3248
3249	sub		\$4,$len		# len-=64
3250	jnz		.Loop_vpmadd52_4x
3251
3252.Ltail_vpmadd52_4x:
3253	vmovdqu64	128($ctx),$R2		# load all key powers
3254	vmovdqu64	160($ctx),$S1
3255	vmovdqu64	64($ctx),$R0
3256	vmovdqu64	96($ctx),$R1
3257
3258.Ltail_vpmadd52_2x:
3259	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
3260	vpaddq		$R2,$S2,$S2
3261	vpsllq		\$2,$S2,$S2
3262
3263	#vpaddq		$T2,$H2,$H2		# accumulate input
3264	vpaddq		$T0,$H0,$H0
3265	vpaddq		$T1,$H1,$H1
3266
3267	vpxorq		$D0lo,$D0lo,$D0lo
3268	vpmadd52luq	$H2,$S1,$D0lo
3269	vpxorq		$D0hi,$D0hi,$D0hi
3270	vpmadd52huq	$H2,$S1,$D0hi
3271	vpxorq		$D1lo,$D1lo,$D1lo
3272	vpmadd52luq	$H2,$S2,$D1lo
3273	vpxorq		$D1hi,$D1hi,$D1hi
3274	vpmadd52huq	$H2,$S2,$D1hi
3275	vpxorq		$D2lo,$D2lo,$D2lo
3276	vpmadd52luq	$H2,$R0,$D2lo
3277	vpxorq		$D2hi,$D2hi,$D2hi
3278	vpmadd52huq	$H2,$R0,$D2hi
3279
3280	vpmadd52luq	$H0,$R0,$D0lo
3281	vpmadd52huq	$H0,$R0,$D0hi
3282	vpmadd52luq	$H0,$R1,$D1lo
3283	vpmadd52huq	$H0,$R1,$D1hi
3284	vpmadd52luq	$H0,$R2,$D2lo
3285	vpmadd52huq	$H0,$R2,$D2hi
3286
3287	vpmadd52luq	$H1,$S2,$D0lo
3288	vpmadd52huq	$H1,$S2,$D0hi
3289	vpmadd52luq	$H1,$R0,$D1lo
3290	vpmadd52huq	$H1,$R0,$D1hi
3291	vpmadd52luq	$H1,$R1,$D2lo
3292	vpmadd52huq	$H1,$R1,$D2hi
3293
3294	################################################################
3295	# horizontal addition
3296
3297	mov		\$1,%eax
3298	kmovw		%eax,%k1
3299	vpsrldq		\$8,$D0lo,$T0
3300	vpsrldq		\$8,$D0hi,$H0
3301	vpsrldq		\$8,$D1lo,$T1
3302	vpsrldq		\$8,$D1hi,$H1
3303	vpaddq		$T0,$D0lo,$D0lo
3304	vpaddq		$H0,$D0hi,$D0hi
3305	vpsrldq		\$8,$D2lo,$T2
3306	vpsrldq		\$8,$D2hi,$H2
3307	vpaddq		$T1,$D1lo,$D1lo
3308	vpaddq		$H1,$D1hi,$D1hi
3309	 vpermq		\$0x2,$D0lo,$T0
3310	 vpermq		\$0x2,$D0hi,$H0
3311	vpaddq		$T2,$D2lo,$D2lo
3312	vpaddq		$H2,$D2hi,$D2hi
3313
3314	vpermq		\$0x2,$D1lo,$T1
3315	vpermq		\$0x2,$D1hi,$H1
3316	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
3317	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
3318	vpermq		\$0x2,$D2lo,$T2
3319	vpermq		\$0x2,$D2hi,$H2
3320	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
3321	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
3322	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
3323	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
3324
3325	################################################################
3326	# partial reduction
3327	vpsrlq		\$44,$D0lo,$tmp
3328	vpsllq		\$8,$D0hi,$D0hi
3329	vpandq		$mask44,$D0lo,$H0
3330	vpaddq		$tmp,$D0hi,$D0hi
3331
3332	vpaddq		$D0hi,$D1lo,$D1lo
3333
3334	vpsrlq		\$44,$D1lo,$tmp
3335	vpsllq		\$8,$D1hi,$D1hi
3336	vpandq		$mask44,$D1lo,$H1
3337	vpaddq		$tmp,$D1hi,$D1hi
3338
3339	vpaddq		$D1hi,$D2lo,$D2lo
3340
3341	vpsrlq		\$42,$D2lo,$tmp
3342	vpsllq		\$10,$D2hi,$D2hi
3343	vpandq		$mask42,$D2lo,$H2
3344	vpaddq		$tmp,$D2hi,$D2hi
3345
3346	vpaddq		$D2hi,$H0,$H0
3347	vpsllq		\$2,$D2hi,$D2hi
3348
3349	vpaddq		$D2hi,$H0,$H0
3350
3351	vpsrlq		\$44,$H0,$tmp		# additional step
3352	vpandq		$mask44,$H0,$H0
3353
3354	vpaddq		$tmp,$H1,$H1
3355						# at this point $len is
3356						# either 4*n+2 or 0...
3357	sub		\$2,$len		# len-=32
3358	ja		.Lblocks_vpmadd52_4x_do
3359
3360	vmovq		%x#$H0,0($ctx)
3361	vmovq		%x#$H1,8($ctx)
3362	vmovq		%x#$H2,16($ctx)
3363	vzeroall
3364
3365.Lno_data_vpmadd52_4x:
3366	ret
3367.cfi_endproc
3368.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3369___
3370}
3371{
3372########################################################################
# As implied by its name, the 8x subroutine processes 8 blocks in parallel...
# This is an intermediate version, as it's used only in cases when the
# input length is either 8*n, 8*n+1 or 8*n+2...
3376
3377my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3378my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3379my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3380my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3381
3382$code.=<<___;
3383.type	poly1305_blocks_vpmadd52_8x,\@function,4
3384.align	32
3385poly1305_blocks_vpmadd52_8x:
3386.cfi_startproc
3387	shr	\$4,$len
3388	jz	.Lno_data_vpmadd52_8x		# too short
3389
3390	shl	\$40,$padbit
3391	mov	64($ctx),%r8			# peek on power of the key
3392
3393	vmovdqa64	.Lx_mask44(%rip),$mask44
3394	vmovdqa64	.Lx_mask42(%rip),$mask42
3395
3396	test	%r8,%r8				# is power value impossible?
3397	js	.Linit_vpmadd52			# if it is, then init R[4]
3398
3399	vmovq	0($ctx),%x#$H0			# load current hash value
3400	vmovq	8($ctx),%x#$H1
3401	vmovq	16($ctx),%x#$H2
3402
3403.Lblocks_vpmadd52_8x:
3404	################################################################
	# first we calculate more key powers
3406
3407	vmovdqu64	128($ctx),$R2		# load 1-3-2-4 powers
3408	vmovdqu64	160($ctx),$S1
3409	vmovdqu64	64($ctx),$R0
3410	vmovdqu64	96($ctx),$R1
3411
3412	vpsllq		\$2,$R2,$S2		# S2 = R2*5*4
3413	vpaddq		$R2,$S2,$S2
3414	vpsllq		\$2,$S2,$S2
3415
3416	vpbroadcastq	%x#$R2,$RR2		# broadcast 4th power
3417	vpbroadcastq	%x#$R0,$RR0
3418	vpbroadcastq	%x#$R1,$RR1
3419
3420	vpxorq		$D0lo,$D0lo,$D0lo
3421	vpmadd52luq	$RR2,$S1,$D0lo
3422	vpxorq		$D0hi,$D0hi,$D0hi
3423	vpmadd52huq	$RR2,$S1,$D0hi
3424	vpxorq		$D1lo,$D1lo,$D1lo
3425	vpmadd52luq	$RR2,$S2,$D1lo
3426	vpxorq		$D1hi,$D1hi,$D1hi
3427	vpmadd52huq	$RR2,$S2,$D1hi
3428	vpxorq		$D2lo,$D2lo,$D2lo
3429	vpmadd52luq	$RR2,$R0,$D2lo
3430	vpxorq		$D2hi,$D2hi,$D2hi
3431	vpmadd52huq	$RR2,$R0,$D2hi
3432
3433	vpmadd52luq	$RR0,$R0,$D0lo
3434	vpmadd52huq	$RR0,$R0,$D0hi
3435	vpmadd52luq	$RR0,$R1,$D1lo
3436	vpmadd52huq	$RR0,$R1,$D1hi
3437	vpmadd52luq	$RR0,$R2,$D2lo
3438	vpmadd52huq	$RR0,$R2,$D2hi
3439
3440	vpmadd52luq	$RR1,$S2,$D0lo
3441	vpmadd52huq	$RR1,$S2,$D0hi
3442	vpmadd52luq	$RR1,$R0,$D1lo
3443	vpmadd52huq	$RR1,$R0,$D1hi
3444	vpmadd52luq	$RR1,$R1,$D2lo
3445	vpmadd52huq	$RR1,$R1,$D2hi
3446
3447	################################################################
3448	# partial reduction
3449	vpsrlq		\$44,$D0lo,$tmp
3450	vpsllq		\$8,$D0hi,$D0hi
3451	vpandq		$mask44,$D0lo,$RR0
3452	vpaddq		$tmp,$D0hi,$D0hi
3453
3454	vpaddq		$D0hi,$D1lo,$D1lo
3455
3456	vpsrlq		\$44,$D1lo,$tmp
3457	vpsllq		\$8,$D1hi,$D1hi
3458	vpandq		$mask44,$D1lo,$RR1
3459	vpaddq		$tmp,$D1hi,$D1hi
3460
3461	vpaddq		$D1hi,$D2lo,$D2lo
3462
3463	vpsrlq		\$42,$D2lo,$tmp
3464	vpsllq		\$10,$D2hi,$D2hi
3465	vpandq		$mask42,$D2lo,$RR2
3466	vpaddq		$tmp,$D2hi,$D2hi
3467
3468	vpaddq		$D2hi,$RR0,$RR0
3469	vpsllq		\$2,$D2hi,$D2hi
3470
3471	vpaddq		$D2hi,$RR0,$RR0
3472
3473	vpsrlq		\$44,$RR0,$tmp		# additional step
3474	vpandq		$mask44,$RR0,$RR0
3475
3476	vpaddq		$tmp,$RR1,$RR1
3477
3478	################################################################
	# At this point Rx holds the 1324 powers, RRx holds 5768, and the
	# goal is 15263748, which reflects how the data is loaded...
3481
3482	vpunpcklqdq	$R2,$RR2,$T2		# 3748
3483	vpunpckhqdq	$R2,$RR2,$R2		# 1526
3484	vpunpcklqdq	$R0,$RR0,$T0
3485	vpunpckhqdq	$R0,$RR0,$R0
3486	vpunpcklqdq	$R1,$RR1,$T1
3487	vpunpckhqdq	$R1,$RR1,$R1
3488___
3489######## switch to %zmm
3490map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3491map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3492map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3493map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3494
3495$code.=<<___;
3496	vshufi64x2	\$0x44,$R2,$T2,$RR2	# 15263748
3497	vshufi64x2	\$0x44,$R0,$T0,$RR0
3498	vshufi64x2	\$0x44,$R1,$T1,$RR1
3499
3500	vmovdqu64	16*0($inp),$T2		# load data
3501	vmovdqu64	16*4($inp),$T3
3502	lea		16*8($inp),$inp
3503
3504	vpsllq		\$2,$RR2,$SS2		# S2 = R2*5*4
3505	vpsllq		\$2,$RR1,$SS1		# S1 = R1*5*4
3506	vpaddq		$RR2,$SS2,$SS2
3507	vpaddq		$RR1,$SS1,$SS1
3508	vpsllq		\$2,$SS2,$SS2
3509	vpsllq		\$2,$SS1,$SS1
3510
3511	vpbroadcastq	$padbit,$PAD
3512	vpbroadcastq	%x#$mask44,$mask44
3513	vpbroadcastq	%x#$mask42,$mask42
3514
3515	vpbroadcastq	%x#$SS1,$S1		# broadcast 8th power
3516	vpbroadcastq	%x#$SS2,$S2
3517	vpbroadcastq	%x#$RR0,$R0
3518	vpbroadcastq	%x#$RR1,$R1
3519	vpbroadcastq	%x#$RR2,$R2
3520
3521	vpunpcklqdq	$T3,$T2,$T1		# transpose data
3522	vpunpckhqdq	$T3,$T2,$T3
3523
3524	# at this point 64-bit lanes are ordered as 73625140
3525
3526	vpsrlq		\$24,$T3,$T2		# splat the data
3527	vporq		$PAD,$T2,$T2
3528	 vpaddq		$T2,$H2,$H2		# accumulate input
3529	vpandq		$mask44,$T1,$T0
3530	vpsrlq		\$44,$T1,$T1
3531	vpsllq		\$20,$T3,$T3
3532	vporq		$T3,$T1,$T1
3533	vpandq		$mask44,$T1,$T1
3534
3535	sub		\$8,$len
3536	jz		.Ltail_vpmadd52_8x
3537	jmp		.Loop_vpmadd52_8x
3538
3539.align	32
3540.Loop_vpmadd52_8x:
3541	#vpaddq		$T2,$H2,$H2		# accumulate input
3542	vpaddq		$T0,$H0,$H0
3543	vpaddq		$T1,$H1,$H1
3544
3545	vpxorq		$D0lo,$D0lo,$D0lo
3546	vpmadd52luq	$H2,$S1,$D0lo
3547	vpxorq		$D0hi,$D0hi,$D0hi
3548	vpmadd52huq	$H2,$S1,$D0hi
3549	vpxorq		$D1lo,$D1lo,$D1lo
3550	vpmadd52luq	$H2,$S2,$D1lo
3551	vpxorq		$D1hi,$D1hi,$D1hi
3552	vpmadd52huq	$H2,$S2,$D1hi
3553	vpxorq		$D2lo,$D2lo,$D2lo
3554	vpmadd52luq	$H2,$R0,$D2lo
3555	vpxorq		$D2hi,$D2hi,$D2hi
3556	vpmadd52huq	$H2,$R0,$D2hi
3557
3558	 vmovdqu64	16*0($inp),$T2		# load data
3559	 vmovdqu64	16*4($inp),$T3
3560	 lea		16*8($inp),$inp
3561	vpmadd52luq	$H0,$R0,$D0lo
3562	vpmadd52huq	$H0,$R0,$D0hi
3563	vpmadd52luq	$H0,$R1,$D1lo
3564	vpmadd52huq	$H0,$R1,$D1hi
3565	vpmadd52luq	$H0,$R2,$D2lo
3566	vpmadd52huq	$H0,$R2,$D2hi
3567
3568	 vpunpcklqdq	$T3,$T2,$T1		# transpose data
3569	 vpunpckhqdq	$T3,$T2,$T3
3570	vpmadd52luq	$H1,$S2,$D0lo
3571	vpmadd52huq	$H1,$S2,$D0hi
3572	vpmadd52luq	$H1,$R0,$D1lo
3573	vpmadd52huq	$H1,$R0,$D1hi
3574	vpmadd52luq	$H1,$R1,$D2lo
3575	vpmadd52huq	$H1,$R1,$D2hi
3576
3577	################################################################
3578	# partial reduction (interleaved with data splat)
3579	vpsrlq		\$44,$D0lo,$tmp
3580	vpsllq		\$8,$D0hi,$D0hi
3581	vpandq		$mask44,$D0lo,$H0
3582	vpaddq		$tmp,$D0hi,$D0hi
3583
3584	 vpsrlq		\$24,$T3,$T2
3585	 vporq		$PAD,$T2,$T2
3586	vpaddq		$D0hi,$D1lo,$D1lo
3587
3588	vpsrlq		\$44,$D1lo,$tmp
3589	vpsllq		\$8,$D1hi,$D1hi
3590	vpandq		$mask44,$D1lo,$H1
3591	vpaddq		$tmp,$D1hi,$D1hi
3592
3593	 vpandq		$mask44,$T1,$T0
3594	 vpsrlq		\$44,$T1,$T1
3595	 vpsllq		\$20,$T3,$T3
3596	vpaddq		$D1hi,$D2lo,$D2lo
3597
3598	vpsrlq		\$42,$D2lo,$tmp
3599	vpsllq		\$10,$D2hi,$D2hi
3600	vpandq		$mask42,$D2lo,$H2
3601	vpaddq		$tmp,$D2hi,$D2hi
3602
3603	  vpaddq	$T2,$H2,$H2		# accumulate input
3604	vpaddq		$D2hi,$H0,$H0
3605	vpsllq		\$2,$D2hi,$D2hi
3606
3607	vpaddq		$D2hi,$H0,$H0
3608	 vporq		$T3,$T1,$T1
3609	 vpandq		$mask44,$T1,$T1
3610
3611	vpsrlq		\$44,$H0,$tmp		# additional step
3612	vpandq		$mask44,$H0,$H0
3613
3614	vpaddq		$tmp,$H1,$H1
3615
3616	sub		\$8,$len		# len-=128
3617	jnz		.Loop_vpmadd52_8x
3618
3619.Ltail_vpmadd52_8x:
3620	#vpaddq		$T2,$H2,$H2		# accumulate input
3621	vpaddq		$T0,$H0,$H0
3622	vpaddq		$T1,$H1,$H1
3623
3624	vpxorq		$D0lo,$D0lo,$D0lo
3625	vpmadd52luq	$H2,$SS1,$D0lo
3626	vpxorq		$D0hi,$D0hi,$D0hi
3627	vpmadd52huq	$H2,$SS1,$D0hi
3628	vpxorq		$D1lo,$D1lo,$D1lo
3629	vpmadd52luq	$H2,$SS2,$D1lo
3630	vpxorq		$D1hi,$D1hi,$D1hi
3631	vpmadd52huq	$H2,$SS2,$D1hi
3632	vpxorq		$D2lo,$D2lo,$D2lo
3633	vpmadd52luq	$H2,$RR0,$D2lo
3634	vpxorq		$D2hi,$D2hi,$D2hi
3635	vpmadd52huq	$H2,$RR0,$D2hi
3636
3637	vpmadd52luq	$H0,$RR0,$D0lo
3638	vpmadd52huq	$H0,$RR0,$D0hi
3639	vpmadd52luq	$H0,$RR1,$D1lo
3640	vpmadd52huq	$H0,$RR1,$D1hi
3641	vpmadd52luq	$H0,$RR2,$D2lo
3642	vpmadd52huq	$H0,$RR2,$D2hi
3643
3644	vpmadd52luq	$H1,$SS2,$D0lo
3645	vpmadd52huq	$H1,$SS2,$D0hi
3646	vpmadd52luq	$H1,$RR0,$D1lo
3647	vpmadd52huq	$H1,$RR0,$D1hi
3648	vpmadd52luq	$H1,$RR1,$D2lo
3649	vpmadd52huq	$H1,$RR1,$D2hi
3650
3651	################################################################
3652	# horizontal addition
3653
3654	mov		\$1,%eax
3655	kmovw		%eax,%k1
3656	vpsrldq		\$8,$D0lo,$T0
3657	vpsrldq		\$8,$D0hi,$H0
3658	vpsrldq		\$8,$D1lo,$T1
3659	vpsrldq		\$8,$D1hi,$H1
3660	vpaddq		$T0,$D0lo,$D0lo
3661	vpaddq		$H0,$D0hi,$D0hi
3662	vpsrldq		\$8,$D2lo,$T2
3663	vpsrldq		\$8,$D2hi,$H2
3664	vpaddq		$T1,$D1lo,$D1lo
3665	vpaddq		$H1,$D1hi,$D1hi
3666	 vpermq		\$0x2,$D0lo,$T0
3667	 vpermq		\$0x2,$D0hi,$H0
3668	vpaddq		$T2,$D2lo,$D2lo
3669	vpaddq		$H2,$D2hi,$D2hi
3670
3671	vpermq		\$0x2,$D1lo,$T1
3672	vpermq		\$0x2,$D1hi,$H1
3673	vpaddq		$T0,$D0lo,$D0lo
3674	vpaddq		$H0,$D0hi,$D0hi
3675	vpermq		\$0x2,$D2lo,$T2
3676	vpermq		\$0x2,$D2hi,$H2
3677	vpaddq		$T1,$D1lo,$D1lo
3678	vpaddq		$H1,$D1hi,$D1hi
3679	 vextracti64x4	\$1,$D0lo,%y#$T0
3680	 vextracti64x4	\$1,$D0hi,%y#$H0
3681	vpaddq		$T2,$D2lo,$D2lo
3682	vpaddq		$H2,$D2hi,$D2hi
3683
3684	vextracti64x4	\$1,$D1lo,%y#$T1
3685	vextracti64x4	\$1,$D1hi,%y#$H1
3686	vextracti64x4	\$1,$D2lo,%y#$T2
3687	vextracti64x4	\$1,$D2hi,%y#$H2
3688___
3689######## switch back to %ymm
3690map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3691map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3692map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3693
3694$code.=<<___;
3695	vpaddq		$T0,$D0lo,${D0lo}{%k1}{z}
3696	vpaddq		$H0,$D0hi,${D0hi}{%k1}{z}
3697	vpaddq		$T1,$D1lo,${D1lo}{%k1}{z}
3698	vpaddq		$H1,$D1hi,${D1hi}{%k1}{z}
3699	vpaddq		$T2,$D2lo,${D2lo}{%k1}{z}
3700	vpaddq		$H2,$D2hi,${D2hi}{%k1}{z}
3701
3702	################################################################
3703	# partial reduction
3704	vpsrlq		\$44,$D0lo,$tmp
3705	vpsllq		\$8,$D0hi,$D0hi
3706	vpandq		$mask44,$D0lo,$H0
3707	vpaddq		$tmp,$D0hi,$D0hi
3708
3709	vpaddq		$D0hi,$D1lo,$D1lo
3710
3711	vpsrlq		\$44,$D1lo,$tmp
3712	vpsllq		\$8,$D1hi,$D1hi
3713	vpandq		$mask44,$D1lo,$H1
3714	vpaddq		$tmp,$D1hi,$D1hi
3715
3716	vpaddq		$D1hi,$D2lo,$D2lo
3717
3718	vpsrlq		\$42,$D2lo,$tmp
3719	vpsllq		\$10,$D2hi,$D2hi
3720	vpandq		$mask42,$D2lo,$H2
3721	vpaddq		$tmp,$D2hi,$D2hi
3722
3723	vpaddq		$D2hi,$H0,$H0
3724	vpsllq		\$2,$D2hi,$D2hi
3725
3726	vpaddq		$D2hi,$H0,$H0
3727
3728	vpsrlq		\$44,$H0,$tmp		# additional step
3729	vpandq		$mask44,$H0,$H0
3730
3731	vpaddq		$tmp,$H1,$H1
3732
3733	################################################################
3734
3735	vmovq		%x#$H0,0($ctx)
3736	vmovq		%x#$H1,8($ctx)
3737	vmovq		%x#$H2,16($ctx)
3738	vzeroall
3739
3740.Lno_data_vpmadd52_8x:
3741	ret
3742.cfi_endproc
3743.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
3744___
3745}
3746$code.=<<___;
3747.type	poly1305_emit_base2_44,\@function,3
3748.align	32
3749poly1305_emit_base2_44:
3750.cfi_startproc
3751	endbranch
3752	mov	0($ctx),%r8	# load hash value
3753	mov	8($ctx),%r9
3754	mov	16($ctx),%r10
3755
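	# merge the three base 2^44 limbs (44+44+42 bits) back into
	# 64-bit words: lo = h0 | h1<<44, mid = h1>>20 | h2<<24,
	# top = h2>>40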
3756	mov	%r9,%rax
3757	shr	\$20,%r9
3758	shl	\$44,%rax
3759	mov	%r10,%rcx
3760	shr	\$40,%r10
3761	shl	\$24,%rcx
3762
3763	add	%rax,%r8
3764	adc	%rcx,%r9
3765	adc	\$0,%r10
3766
3767	mov	%r8,%rax
3768	add	\$5,%r8		# compare to modulus
3769	mov	%r9,%rcx
3770	adc	\$0,%r9
3771	adc	\$0,%r10
3772	shr	\$2,%r10	# did 130-bit value overflow?
3773	cmovnz	%r8,%rax
3774	cmovnz	%r9,%rcx
3775
3776	add	0($nonce),%rax	# accumulate nonce
3777	adc	8($nonce),%rcx
3778	mov	%rax,0($mac)	# write result
3779	mov	%rcx,8($mac)
3780
3781	ret
3782.cfi_endproc
3783.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
3784___
3785}	}	}
3786$code.=<<___;
3787.section .rodata align=64
3788.align	64
3789.Lconst:
3790.Lmask24:
3791.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3792.L129:
3793.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3794.Lmask26:
3795.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3796.Lpermd_avx2:
3797.long	2,2,2,3,2,0,2,1
3798.Lpermd_avx512:
3799.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3800
3801.L2_44_inp_permd:
3802.long	0,1,1,2,2,3,7,7
3803.L2_44_inp_shift:
3804.quad	0,12,24,64
3805.L2_44_mask:
3806.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3807.L2_44_shift_rgt:
3808.quad	44,44,42,64
3809.L2_44_shift_lft:
3810.quad	8,8,10,64
3811
3812.align	64
3813.Lx_mask44:
3814.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3815.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3816.Lx_mask42:
3817.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3818.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3819.previous
3820___
3821}
3822$code.=<<___;
3823.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3824.align	16
3825___
3826
3827{	# chacha20-poly1305 helpers
3828my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
3829                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
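# Both helpers below XOR the data at $inp with the ChaCha20 key stream
# that the caller has placed at $otp, write the result to $out, and
# leave the ciphertext in the $otp buffer (zero-padded to a 16-byte
# boundary) so that it can be fed straight to poly1305_blocks; the
# advanced $otp pointer is returned in %rax. This is a hedged reading of
# the code below, not an external API contract.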
3830$code.=<<___;
3831.globl	xor128_encrypt_n_pad
3832.type	xor128_encrypt_n_pad,\@abi-omnipotent
3833.align	16
3834xor128_encrypt_n_pad:
3835.cfi_startproc
3836	sub	$otp,$inp
3837	sub	$otp,$out
3838	mov	$len,%r10		# put len aside
3839	shr	\$4,$len		# len / 16
3840	jz	.Ltail_enc
3841	nop
3842.Loop_enc_xmm:
3843	movdqu	($inp,$otp),%xmm0
3844	pxor	($otp),%xmm0
3845	movdqu	%xmm0,($out,$otp)
3846	movdqa	%xmm0,($otp)
3847	lea	16($otp),$otp
3848	dec	$len
3849	jnz	.Loop_enc_xmm
3850
3851	and	\$15,%r10		# len % 16
3852	jz	.Ldone_enc
3853
3854.Ltail_enc:
3855	mov	\$16,$len
3856	sub	%r10,$len
3857	xor	%eax,%eax
3858.Loop_enc_byte:
3859	mov	($inp,$otp),%al
3860	xor	($otp),%al
3861	mov	%al,($out,$otp)
3862	mov	%al,($otp)
3863	lea	1($otp),$otp
3864	dec	%r10
3865	jnz	.Loop_enc_byte
3866
3867	xor	%eax,%eax
3868.Loop_enc_pad:
3869	mov	%al,($otp)
3870	lea	1($otp),$otp
3871	dec	$len
3872	jnz	.Loop_enc_pad
3873
3874.Ldone_enc:
3875	mov	$otp,%rax
3876	ret
3877.cfi_endproc
3878.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
3879
3880.globl	xor128_decrypt_n_pad
3881.type	xor128_decrypt_n_pad,\@abi-omnipotent
3882.align	16
3883xor128_decrypt_n_pad:
3884.cfi_startproc
3885	sub	$otp,$inp
3886	sub	$otp,$out
3887	mov	$len,%r10		# put len aside
3888	shr	\$4,$len		# len / 16
3889	jz	.Ltail_dec
3890	nop
3891.Loop_dec_xmm:
3892	movdqu	($inp,$otp),%xmm0
3893	movdqa	($otp),%xmm1
3894	pxor	%xmm0,%xmm1
3895	movdqu	%xmm1,($out,$otp)
3896	movdqa	%xmm0,($otp)
3897	lea	16($otp),$otp
3898	dec	$len
3899	jnz	.Loop_dec_xmm
3900
3901	pxor	%xmm1,%xmm1
3902	and	\$15,%r10		# len % 16
3903	jz	.Ldone_dec
3904
3905.Ltail_dec:
3906	mov	\$16,$len
3907	sub	%r10,$len
3908	xor	%eax,%eax
3909	xor	%r11,%r11
3910.Loop_dec_byte:
3911	mov	($inp,$otp),%r11b
3912	mov	($otp),%al
3913	xor	%r11b,%al
3914	mov	%al,($out,$otp)
3915	mov	%r11b,($otp)
3916	lea	1($otp),$otp
3917	dec	%r10
3918	jnz	.Loop_dec_byte
3919
3920	xor	%eax,%eax
3921.Loop_dec_pad:
3922	mov	%al,($otp)
3923	lea	1($otp),$otp
3924	dec	$len
3925	jnz	.Loop_dec_pad
3926
3927.Ldone_dec:
3928	mov	$otp,%rax
3929	ret
3930.cfi_endproc
3931.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3932___
3933}
3934
3935# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3936#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3937if ($win64) {
3938$rec="%rcx";
3939$frame="%rdx";
3940$context="%r8";
3941$disp="%r9";
3942
3943$code.=<<___;
3944.extern	__imp_RtlVirtualUnwind
3945.type	se_handler,\@abi-omnipotent
3946.align	16
3947se_handler:
3948	push	%rsi
3949	push	%rdi
3950	push	%rbx
3951	push	%rbp
3952	push	%r12
3953	push	%r13
3954	push	%r14
3955	push	%r15
3956	pushfq
3957	sub	\$64,%rsp
3958
3959	mov	120($context),%rax	# pull context->Rax
3960	mov	248($context),%rbx	# pull context->Rip
3961
3962	mov	8($disp),%rsi		# disp->ImageBase
3963	mov	56($disp),%r11		# disp->HandlerData
3964
3965	mov	0(%r11),%r10d		# HandlerData[0]
3966	lea	(%rsi,%r10),%r10	# prologue label
3967	cmp	%r10,%rbx		# context->Rip<.Lprologue
3968	jb	.Lcommon_seh_tail
3969
3970	mov	152($context),%rax	# pull context->Rsp
3971
3972	mov	4(%r11),%r10d		# HandlerData[1]
3973	lea	(%rsi,%r10),%r10	# epilogue label
3974	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
3975	jae	.Lcommon_seh_tail
3976
3977	lea	48(%rax),%rax
3978
3979	mov	-8(%rax),%rbx
3980	mov	-16(%rax),%rbp
3981	mov	-24(%rax),%r12
3982	mov	-32(%rax),%r13
3983	mov	-40(%rax),%r14
3984	mov	-48(%rax),%r15
3985	mov	%rbx,144($context)	# restore context->Rbx
3986	mov	%rbp,160($context)	# restore context->Rbp
3987	mov	%r12,216($context)	# restore context->R12
3988	mov	%r13,224($context)	# restore context->R13
3989	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
3991
3992	jmp	.Lcommon_seh_tail
3993.size	se_handler,.-se_handler
3994
3995.type	avx_handler,\@abi-omnipotent
3996.align	16
3997avx_handler:
3998	push	%rsi
3999	push	%rdi
4000	push	%rbx
4001	push	%rbp
4002	push	%r12
4003	push	%r13
4004	push	%r14
4005	push	%r15
4006	pushfq
4007	sub	\$64,%rsp
4008
4009	mov	120($context),%rax	# pull context->Rax
4010	mov	248($context),%rbx	# pull context->Rip
4011
4012	mov	8($disp),%rsi		# disp->ImageBase
4013	mov	56($disp),%r11		# disp->HandlerData
4014
4015	mov	0(%r11),%r10d		# HandlerData[0]
4016	lea	(%rsi,%r10),%r10	# prologue label
4017	cmp	%r10,%rbx		# context->Rip<prologue label
4018	jb	.Lcommon_seh_tail
4019
4020	mov	152($context),%rax	# pull context->Rsp
4021
4022	mov	4(%r11),%r10d		# HandlerData[1]
4023	lea	(%rsi,%r10),%r10	# epilogue label
4024	cmp	%r10,%rbx		# context->Rip>=epilogue label
4025	jae	.Lcommon_seh_tail
4026
4027	mov	208($context),%rax	# pull context->R11
4028
4029	lea	0x50(%rax),%rsi
4030	lea	0xf8(%rax),%rax
4031	lea	512($context),%rdi	# &context.Xmm6
4032	mov	\$20,%ecx
4033	.long	0xa548f3fc		# cld; rep movsq
4034
4035.Lcommon_seh_tail:
4036	mov	8(%rax),%rdi
4037	mov	16(%rax),%rsi
4038	mov	%rax,152($context)	# restore context->Rsp
4039	mov	%rsi,168($context)	# restore context->Rsi
4040	mov	%rdi,176($context)	# restore context->Rdi
4041
4042	mov	40($disp),%rdi		# disp->ContextRecord
4043	mov	$context,%rsi		# context
4044	mov	\$154,%ecx		# sizeof(CONTEXT)
4045	.long	0xa548f3fc		# cld; rep movsq
4046
4047	mov	$disp,%rsi
4048	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4049	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4050	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4051	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4052	mov	40(%rsi),%r10		# disp->ContextRecord
4053	lea	56(%rsi),%r11		# &disp->HandlerData
4054	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4055	mov	%r10,32(%rsp)		# arg5
4056	mov	%r11,40(%rsp)		# arg6
4057	mov	%r12,48(%rsp)		# arg7
4058	mov	%rcx,56(%rsp)		# arg8, (NULL)
4059	call	*__imp_RtlVirtualUnwind(%rip)
4060
4061	mov	\$1,%eax		# ExceptionContinueSearch
4062	add	\$64,%rsp
4063	popfq
4064	pop	%r15
4065	pop	%r14
4066	pop	%r13
4067	pop	%r12
4068	pop	%rbp
4069	pop	%rbx
4070	pop	%rdi
4071	pop	%rsi
4072	ret
4073.size	avx_handler,.-avx_handler
4074
4075.section	.pdata
4076.align	4
4077	.rva	.LSEH_begin_poly1305_init
4078	.rva	.LSEH_end_poly1305_init
4079	.rva	.LSEH_info_poly1305_init
4080
4081	.rva	.LSEH_begin_poly1305_blocks
4082	.rva	.LSEH_end_poly1305_blocks
4083	.rva	.LSEH_info_poly1305_blocks
4084
4085	.rva	.LSEH_begin_poly1305_emit
4086	.rva	.LSEH_end_poly1305_emit
4087	.rva	.LSEH_info_poly1305_emit
4088___
4089$code.=<<___ if ($avx);
4090	.rva	.LSEH_begin_poly1305_blocks_avx
4091	.rva	.Lbase2_64_avx
4092	.rva	.LSEH_info_poly1305_blocks_avx_1
4093
4094	.rva	.Lbase2_64_avx
4095	.rva	.Leven_avx
4096	.rva	.LSEH_info_poly1305_blocks_avx_2
4097
4098	.rva	.Leven_avx
4099	.rva	.LSEH_end_poly1305_blocks_avx
4100	.rva	.LSEH_info_poly1305_blocks_avx_3
4101
4102	.rva	.LSEH_begin_poly1305_emit_avx
4103	.rva	.LSEH_end_poly1305_emit_avx
4104	.rva	.LSEH_info_poly1305_emit_avx
4105___
4106$code.=<<___ if ($avx>1);
4107	.rva	.LSEH_begin_poly1305_blocks_avx2
4108	.rva	.Lbase2_64_avx2
4109	.rva	.LSEH_info_poly1305_blocks_avx2_1
4110
4111	.rva	.Lbase2_64_avx2
4112	.rva	.Leven_avx2
4113	.rva	.LSEH_info_poly1305_blocks_avx2_2
4114
4115	.rva	.Leven_avx2
4116	.rva	.LSEH_end_poly1305_blocks_avx2
4117	.rva	.LSEH_info_poly1305_blocks_avx2_3
4118___
4119$code.=<<___ if ($avx>2);
4120	.rva	.LSEH_begin_poly1305_blocks_avx512
4121	.rva	.LSEH_end_poly1305_blocks_avx512
4122	.rva	.LSEH_info_poly1305_blocks_avx512
4123___
4124$code.=<<___;
4125.section	.xdata
4126.align	8
4127.LSEH_info_poly1305_init:
4128	.byte	9,0,0,0
4129	.rva	se_handler
4130	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
4131
4132.LSEH_info_poly1305_blocks:
4133	.byte	9,0,0,0
4134	.rva	se_handler
4135	.rva	.Lblocks_body,.Lblocks_epilogue
4136
4137.LSEH_info_poly1305_emit:
4138	.byte	9,0,0,0
4139	.rva	se_handler
4140	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
4141___
4142$code.=<<___ if ($avx);
4143.LSEH_info_poly1305_blocks_avx_1:
4144	.byte	9,0,0,0
4145	.rva	se_handler
4146	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
4147
4148.LSEH_info_poly1305_blocks_avx_2:
4149	.byte	9,0,0,0
4150	.rva	se_handler
4151	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
4152
4153.LSEH_info_poly1305_blocks_avx_3:
4154	.byte	9,0,0,0
4155	.rva	avx_handler
4156	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
4157
4158.LSEH_info_poly1305_emit_avx:
4159	.byte	9,0,0,0
4160	.rva	se_handler
4161	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4162___
4163$code.=<<___ if ($avx>1);
4164.LSEH_info_poly1305_blocks_avx2_1:
4165	.byte	9,0,0,0
4166	.rva	se_handler
4167	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
4168
4169.LSEH_info_poly1305_blocks_avx2_2:
4170	.byte	9,0,0,0
4171	.rva	se_handler
4172	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
4173
4174.LSEH_info_poly1305_blocks_avx2_3:
4175	.byte	9,0,0,0
4176	.rva	avx_handler
4177	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
4178___
4179$code.=<<___ if ($avx>2);
4180.LSEH_info_poly1305_blocks_avx512:
4181	.byte	9,0,0,0
4182	.rva	avx_handler
4183	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
4184___
4185}
4186
4187foreach (split('\n',$code)) {
4188	s/\`([^\`]*)\`/eval($1)/ge;
4189	s/%r([a-z]+)#d/%e$1/g;
4190	s/%r([0-9]+)#d/%r$1d/g;
4191	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
4192
4193	print $_,"\n";
4194}
4195close STDOUT or die "error closing STDOUT: $!";
4196