#! /usr/bin/env perl
# Copyright 2006-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# well as the 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0
# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.
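#
# For reference, the recurrence the SIMD paths below vectorize is the
# NIST message schedule itself. A minimal scalar rendition in Perl
# follows; it is illustrative only and never called by this generator:
#
sub Xupdate_reference {		# W[t] for t>=16
    my @W = @_;			# the sixteen most recent words, W[t-16..t-1]
    my $x = $W[13] ^ $W[8] ^ $W[2] ^ $W[0];	# W[t-3]^W[t-8]^W[t-14]^W[t-16]
    return (($x << 1) | ($x >> 31)) & 0xffffffff;	# <<< 1
}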

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.
######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
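
# For example (an assumed, typical invocation; both values are simply
# forwarded to the x86_64-xlate.pl translator located below):
#
#	perl sha1-x86_64.pl elf  sha1-x86_64.s		# Linux et al.
#	perl sha1-x86_64.pl nasm sha1-x86_64.asm	# Win64, NASM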

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

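# $avx therefore ends up 0, 1 or 2: with 1 the AVX path (_avx_shortcut)
# is emitted in addition to SSSE3, and with 2 the AVX2+BMI path
# (_avx2_shortcut) is emitted as well -- see the ($avx) and ($avx>1)
# guards below.
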
$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

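# For reference (not used by the generator): the three BODY_* subs
# implement the standard SHA-1 round functions and constants,
#
#	F(0..19)  = Ch(b,c,d)  = (b&c)|(~b&d) = ((c^d)&b)^d	K=0x5a827999
#	F(20..39) = Parity     = b^c^d				K=0x6ed9eba1
#	F(40..59) = Maj(b,c,d) = (b&c)|(b&d)|(c&d)		K=0x8f1bbcdc
#	F(60..79) = Parity     = b^c^d				K=0xca62c1d6
#
# BODY_00_19 uses the ((c^d)&b)^d form of Ch, and BODY_40_59 computes
# Maj as (c&d)+((c^d)&b) in $t0 and $t1 -- the two terms are bitwise
# disjoint, so the separate additions into $e are equivalent to the OR.
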
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
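# Note the unshift(@V,pop(@V)) idiom above: instead of moving data,
# each round "rotates" the register assignment, so the (a,b,c,d,e) of
# one round become the (e,a,b,c,d) of the next.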
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
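# The sha1rnds4 immediate selects the round function/constant quartet
# (0 => Ch/K_00_19, 1 => Parity/K_20_39, 2 => Maj/K_40_59,
# 3 => Parity/K_60_79), and every sha1rnds4 retires four rounds, hence
# int($i/5) above -- five instructions per 20-round quartet -- and the
# literal \$3 operands in the tail below covering rounds 64-79.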
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}
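
# (The 32-byte alignment matches the way Sandy Bridge and later cores
# cache decoded uops in 32-byte windows -- the "Decoded ICache" section
# of Intel's optimization manual referenced above.)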

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
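
# The AUTOLOAD thunk above turns any otherwise-undefined &opcode(...)
# call into a line of assembly appended to $code: the last argument
# comes first and the rest are reversed (AT&T operand order), and a
# bare number grows a '$' prefix to become an immediate. E.g. with
# @Tx[0] being "%xmm8", &psrld(@Tx[0],31) appends "\tpsrld\t\$31,%xmm8".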

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
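
# Note the structure of the Xupdate_* subs: $body returns the scalar
# round body as a list of code-emitting snippets, and eval(shift(@insns))
# drips them out one or two at a time between the SIMD instructions --
# software-pipelining the IALU rounds against the SIMD message schedule.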

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
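
# The bodies above lean on two boolean identities,
#	Ch(b,c,d)  = (b&c)|(~b&d)      = ((c^d)&b)^d
#	Maj(b,c,d) = (b&c)|(b&d)|(c&d) = ((b^c)&(c^d))^c
# which let each round precompute part of the next round's F value.
# A throwaway exhaustive check (illustrative only, never called):
sub _check_f_identities {
    for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
	die "Ch"  if (((($c^$d)&$b)^$d) != (($b&$c)|((1-$b)&$d)));
	die "Maj" if (((($b^$c)&($c^$d))^$c) != (($b&$c)|($b&$d)|($c&$d)));
    }}}
}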
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
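
# The bodyx_* variants lean on BMI1/BMI2: andn computes ~b&d in one
# instruction, and rorx is a non-destructive rotate that neither reads
# nor writes flags, which removes the mov+rol pairs of the scalar code
# and is what gets the stated critical paths down to 2-3 cycles.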

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.section .rodata align=64
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
.previous
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}
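
# For example (assuming the regexp above matched), "sha1rnds4 \$3,%xmm0,%xmm1"
# comes out as ".byte 15,58,204,200,3", i.e. opcode 0f 3a cc with ModR/M
# 0xc8 (mod=11, reg=xmm1, r/m=xmm0) and immediate 3 -- so the module
# still assembles with toolchains that predate the SHA extensions.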

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";