#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder
# why gcc [even armed with inline assembler] fails to generate code as
# fast. The only thing which is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, while
# in the latter on 64-bit ones. All I had to do was get one flavor
# right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], approaches 4 instructions per CPU
# clock cycle and runs in 1003 cycles, then 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be
# to try to offload the X[16] updates to the SSE unit, but that would
# require a "deeper" loop unroll, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# that is only *if* it's actually possible to noticeably improve the
# overall ILP, instruction-level parallelism, on the given CPU
# implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions, but are implemented in
# microcode.
#
# May 2012.
#
# Optimization, including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths; see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is faster
# and should be used instead.] For reference, the corresponding
# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with specifics of their architecture [which is a topic
# for separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm registers as operands.
# The side effect is an increased stack frame, 448 additional bytes for
# SHA256 and 1152 for SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever best applies, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below a certain limit makes no difference/sense; to conserve
#	space the SHA256 XOP code path is therefore omitted;

112# $output is the last argument if it looks like a file (it has an extension)
113# $flavour is the first argument if it doesn't look like a file
114$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
115$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
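#
# Illustrative invocations (the build system normally supplies these
# arguments; flavours follow perlasm conventions such as elf, macosx,
# mingw64, nasm):
#
#	perl sha512-x86_64.pl elf sha512-x86_64.s	# SHA-512 flavor
#	perl sha512-x86_64.pl elf sha256-x86_64.s	# SHA-256 flavor, per
#							# the /512/ test below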
116
117$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
118
119$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
120( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
121( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
122die "can't locate x86_64-xlate.pl";
123
124if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
125		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
126	$avx = ($1>=2.19) + ($1>=2.22);
127}
128
129if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
130	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
131	$avx = ($1>=2.09) + ($1>=2.10);
132}
133
134if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
135	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
136	$avx = ($1>=10) + ($1>=11);
137}
138
139if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
140	$avx = ($2>=3.0) + ($2>3.0);
141}
142
143$shaext=1;	### set to zero if compiling for 1.0.1
144$avx=1		if (!$shaext && $avx);
145
146open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
147    or die "can't call $xlate: $!";
148*STDOUT=*OUT;
149
150if ($output =~ /512/) {
151	$func="sha512_block_data_order";
152	$TABLE="K512";
153	$SZ=8;
154	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
155					"%r8", "%r9", "%r10","%r11");
156	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
157	@Sigma0=(28,34,39);
158	@Sigma1=(14,18,41);
159	@sigma0=(1,  8, 7);
160	@sigma1=(19,61, 6);
161	$rounds=80;
162} else {
163	$func="sha256_block_data_order";
164	$TABLE="K256";
165	$SZ=4;
166	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
167					"%r8d","%r9d","%r10d","%r11d");
168	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
169	@Sigma0=( 2,13,22);
170	@Sigma1=( 6,11,25);
171	@sigma0=( 7,18, 3);
172	@sigma1=(17,19,10);
173	$rounds=64;
174}
175
176$ctx="%rdi";	# 1st arg, zapped by $a3
177$inp="%rsi";	# 2nd arg
178$Tbl="%rbp";
179
180$_ctx="16*$SZ+0*8(%rsp)";
181$_inp="16*$SZ+1*8(%rsp)";
182$_end="16*$SZ+2*8(%rsp)";
183$_rsp="`16*$SZ+3*8`(%rsp)";
184$framesz="16*$SZ+4*8";
185
186
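# Reference model of one compression round (FIPS 180-4), which
# ROUND_00_15/ROUND_16_XX below schedule across registers; illustrative
# pseudocode only, not generated output:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	h  = Sigma0(a) + Maj(a,b,c) + T1
#	(a,b,c,d,e,f,g,h) = (h,a,b,c,d+T1,e,f,g)
#
# Ch(e,f,g) is computed as ((f^g)&e)^g and Maj(a,b,c) as
# Ch(a^b,c,b) = ((a^b)&(b^c))^b, so the a^b value of one round doubles
# as b^c of the next (carried in $a3).  Sigma1(e), i.e.
# ror(e,A)^ror(e,B)^ror(e,C) with (A,B,C)=@Sigma1, is evaluated as
# ror(ror(ror(e,C-B)^e,B-A)^e,A), and Sigma0(a) likewise.
#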
187sub ROUND_00_15()
188{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
189  my $STRIDE=$SZ;
190     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
191
192$code.=<<___;
193	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
194	mov	$f,$a2
195
196	xor	$e,$a0
197	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
198	xor	$g,$a2			# f^g
199
200	mov	$T1,`$SZ*($i&0xf)`(%rsp)
201	xor	$a,$a1
202	and	$e,$a2			# (f^g)&e
203
204	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
205	add	$h,$T1			# T1+=h
206	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
207
208	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
209	xor	$e,$a0
210	add	$a2,$T1			# T1+=Ch(e,f,g)
211
212	mov	$a,$a2
213	add	($Tbl),$T1		# T1+=K[round]
214	xor	$a,$a1
215
216	xor	$b,$a2			# a^b, b^c in next round
217	ror	\$$Sigma1[0],$a0	# Sigma1(e)
218	mov	$b,$h
219
220	and	$a2,$a3
221	ror	\$$Sigma0[0],$a1	# Sigma0(a)
222	add	$a0,$T1			# T1+=Sigma1(e)
223
224	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
225	add	$T1,$d			# d+=T1
226	add	$T1,$h			# h+=T1
227
228	lea	$STRIDE($Tbl),$Tbl	# round++
229___
230$code.=<<___ if ($i<15);
231	add	$a1,$h			# h+=Sigma0(a)
232___
233	($a2,$a3) = ($a3,$a2);
234}
235
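# Message schedule expansion (FIPS 180-4), kept as a 16-entry ring
# buffer on the stack; the recurrence implemented below, in
# illustrative form:
#
#	X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15])
#
# where sigma0(x) = ror(x,$sigma0[0]) ^ ror(x,$sigma0[1]) ^ (x>>$sigma0[2])
# and sigma1(x) is defined likewise from @sigma1.
#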
236sub ROUND_16_XX()
237{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
238
239$code.=<<___;
240	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
241	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
242
243	mov	$a0,$T1
244	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
245	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
246	mov	$a2,$a1
247	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
248
249	xor	$T1,$a0
250	shr	\$$sigma0[2],$T1
251	ror	\$$sigma0[0],$a0
252	xor	$a1,$a2
253	shr	\$$sigma1[2],$a1
254
255	ror	\$$sigma1[0],$a2
256	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
257	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
258	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
259
260	add	`$SZ*($i&0xf)`(%rsp),$T1
261	mov	$e,$a0
262	add	$a2,$T1
263	mov	$a,$a1
264___
265	&ROUND_00_15(@_);
266}
267
268$code=<<___;
269.text
270
271.extern	OPENSSL_ia32cap_P
272.globl	$func
273.type	$func,\@function,3
274.align	16
275$func:
276.cfi_startproc
277___
278$code.=<<___ if ($SZ==4 || $avx);
279	lea	OPENSSL_ia32cap_P(%rip),%r11
280	mov	0(%r11),%r9d
281	mov	4(%r11),%r10d
282	mov	8(%r11),%r11d
283___
284$code.=<<___ if ($SZ==4 && $shaext);
285	test	\$`1<<29`,%r11d		# check for SHA
286	jnz	_shaext_shortcut
287___
288$code.=<<___ if ($avx && $SZ==8);
289	test	\$`1<<11`,%r10d		# check for XOP
290	jnz	.Lxop_shortcut
291___
292$code.=<<___ if ($avx>1);
293	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
294	cmp	\$`1<<8|1<<5|1<<3`,%r11d
295	je	.Lavx2_shortcut
296___
297$code.=<<___ if ($avx);
298	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
299	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
300	or	%r9d,%r10d
301	cmp	\$`1<<28|1<<9|1<<30`,%r10d
302	je	.Lavx_shortcut
303___
304$code.=<<___ if ($SZ==4);
305	test	\$`1<<9`,%r10d
306	jnz	.Lssse3_shortcut
307___
308$code.=<<___;
309	mov	%rsp,%rax		# copy %rsp
310.cfi_def_cfa_register	%rax
311	push	%rbx
312.cfi_push	%rbx
313	push	%rbp
314.cfi_push	%rbp
315	push	%r12
316.cfi_push	%r12
317	push	%r13
318.cfi_push	%r13
319	push	%r14
320.cfi_push	%r14
321	push	%r15
322.cfi_push	%r15
323	shl	\$4,%rdx		# num*16
324	sub	\$$framesz,%rsp
325	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
326	and	\$-64,%rsp		# align stack frame
327	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
329	mov	%rdx,$_end		# save end pointer, "3rd" arg
330	mov	%rax,$_rsp		# save copy of %rsp
331.cfi_cfa_expression	$_rsp,deref,+8
332.Lprologue:
333
334	mov	$SZ*0($ctx),$A
335	mov	$SZ*1($ctx),$B
336	mov	$SZ*2($ctx),$C
337	mov	$SZ*3($ctx),$D
338	mov	$SZ*4($ctx),$E
339	mov	$SZ*5($ctx),$F
340	mov	$SZ*6($ctx),$G
341	mov	$SZ*7($ctx),$H
342	jmp	.Lloop
343
344.align	16
345.Lloop:
346	mov	$B,$a3
347	lea	$TABLE(%rip),$Tbl
348	xor	$C,$a3			# magic
349___
350	for($i=0;$i<16;$i++) {
351		$code.="	mov	$SZ*$i($inp),$T1\n";
352		$code.="	mov	@ROT[4],$a0\n";
353		$code.="	mov	@ROT[0],$a1\n";
354		$code.="	bswap	$T1\n";
355		&ROUND_00_15($i,@ROT);
356		unshift(@ROT,pop(@ROT));
357	}
358$code.=<<___;
359	jmp	.Lrounds_16_xx
360.align	16
361.Lrounds_16_xx:
362___
363	for(;$i<32;$i++) {
364		&ROUND_16_XX($i,@ROT);
365		unshift(@ROT,pop(@ROT));
366	}
367
368$code.=<<___;
369	cmpb	\$0,`$SZ-1`($Tbl)
370	jnz	.Lrounds_16_xx
371
372	mov	$_ctx,$ctx
373	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
374	lea	16*$SZ($inp),$inp
375
376	add	$SZ*0($ctx),$A
377	add	$SZ*1($ctx),$B
378	add	$SZ*2($ctx),$C
379	add	$SZ*3($ctx),$D
380	add	$SZ*4($ctx),$E
381	add	$SZ*5($ctx),$F
382	add	$SZ*6($ctx),$G
383	add	$SZ*7($ctx),$H
384
385	cmp	$_end,$inp
386
387	mov	$A,$SZ*0($ctx)
388	mov	$B,$SZ*1($ctx)
389	mov	$C,$SZ*2($ctx)
390	mov	$D,$SZ*3($ctx)
391	mov	$E,$SZ*4($ctx)
392	mov	$F,$SZ*5($ctx)
393	mov	$G,$SZ*6($ctx)
394	mov	$H,$SZ*7($ctx)
395	jb	.Lloop
396
397	mov	$_rsp,%rsi
398.cfi_def_cfa	%rsi,8
399	mov	-48(%rsi),%r15
400.cfi_restore	%r15
401	mov	-40(%rsi),%r14
402.cfi_restore	%r14
403	mov	-32(%rsi),%r13
404.cfi_restore	%r13
405	mov	-24(%rsi),%r12
406.cfi_restore	%r12
407	mov	-16(%rsi),%rbp
408.cfi_restore	%rbp
409	mov	-8(%rsi),%rbx
410.cfi_restore	%rbx
411	lea	(%rsi),%rsp
412.cfi_def_cfa_register	%rsp
413.Lepilogue:
414	ret
415.cfi_endproc
416.size	$func,.-$func
417___
418
419if ($SZ==4) {
420$code.=<<___;
421.align	64
422.type	$TABLE,\@object
423$TABLE:
424	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
425	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
426	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
427	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
428	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
429	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
430	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
431	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
432	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
433	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
434	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
435	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
436	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
437	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
438	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
439	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
440	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
441	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
442	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
443	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
444	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
445	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
446	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
447	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
448	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
449	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
450	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
451	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
452	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
453	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
454	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
455	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
456
457	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
458	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
459	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
460	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
461	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
462	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
463	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
464___
465} else {
466$code.=<<___;
467.align	64
468.type	$TABLE,\@object
469$TABLE:
470	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
471	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
472	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
473	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
474	.quad	0x3956c25bf348b538,0x59f111f1b605d019
475	.quad	0x3956c25bf348b538,0x59f111f1b605d019
476	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
477	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
478	.quad	0xd807aa98a3030242,0x12835b0145706fbe
479	.quad	0xd807aa98a3030242,0x12835b0145706fbe
480	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
481	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
482	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
483	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
484	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
485	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
486	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
487	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
488	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
489	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
490	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
491	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
492	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
493	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
494	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
495	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
496	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
497	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
498	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
499	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
500	.quad	0x06ca6351e003826f,0x142929670a0e6e70
501	.quad	0x06ca6351e003826f,0x142929670a0e6e70
502	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
503	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
504	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
505	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
506	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
507	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
508	.quad	0x81c2c92e47edaee6,0x92722c851482353b
509	.quad	0x81c2c92e47edaee6,0x92722c851482353b
510	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
511	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
512	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
513	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
514	.quad	0xd192e819d6ef5218,0xd69906245565a910
515	.quad	0xd192e819d6ef5218,0xd69906245565a910
516	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
517	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
518	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
519	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
520	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
521	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
522	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
523	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
524	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
525	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
526	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
527	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
528	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
529	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
530	.quad	0x90befffa23631e28,0xa4506cebde82bde9
531	.quad	0x90befffa23631e28,0xa4506cebde82bde9
532	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
533	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
534	.quad	0xca273eceea26619c,0xd186b8c721c0c207
535	.quad	0xca273eceea26619c,0xd186b8c721c0c207
536	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
537	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
538	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
539	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
540	.quad	0x113f9804bef90dae,0x1b710b35131c471b
541	.quad	0x113f9804bef90dae,0x1b710b35131c471b
542	.quad	0x28db77f523047d84,0x32caab7b40c72493
543	.quad	0x28db77f523047d84,0x32caab7b40c72493
544	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
545	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
546	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
547	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
548	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
549	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
550
551	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
552	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
553	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
554___
555}
556
557######################################################################
558# SIMD code paths
559#
560if ($SZ==4 && $shaext) {{{
561######################################################################
562# Intel SHA Extensions implementation of SHA256 update function.
563#
564my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
565
566my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
567my @MSG=map("%xmm$_",(3..6));
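
# A short sketch of the SHA-NI flow below, assuming Intel SHA Extensions
# semantics: each sha256rnds2 performs two rounds and takes its pair of
# W+K words from the low 64 bits of %xmm0 ($Wi), so every 16 bytes of
# W+K serve four rounds -- pshufd 0x0e moves the upper half of $Wi down
# for the second sha256rnds2 -- while sha256msg1/sha256msg2 expand the
# message schedule incrementally.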
568
569$code.=<<___;
570.type	sha256_block_data_order_shaext,\@function,3
571.align	64
572sha256_block_data_order_shaext:
573_shaext_shortcut:
574.cfi_startproc
575___
576$code.=<<___ if ($win64);
577	lea	`-8-5*16`(%rsp),%rsp
578	movaps	%xmm6,-8-5*16(%rax)
579	movaps	%xmm7,-8-4*16(%rax)
580	movaps	%xmm8,-8-3*16(%rax)
581	movaps	%xmm9,-8-2*16(%rax)
582	movaps	%xmm10,-8-1*16(%rax)
583.Lprologue_shaext:
584___
585$code.=<<___;
586	lea		K256+0x80(%rip),$Tbl
587	movdqu		($ctx),$ABEF		# DCBA
588	movdqu		16($ctx),$CDGH		# HGFE
589	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
590
591	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
592	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
593	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
594	movdqa		$TMP,$BSWAP		# offload
595	palignr		\$8,$CDGH,$ABEF		# ABEF
596	punpcklqdq	$Wi,$CDGH		# CDGH
597	jmp		.Loop_shaext
598
599.align	16
600.Loop_shaext:
601	movdqu		($inp),@MSG[0]
602	movdqu		0x10($inp),@MSG[1]
603	movdqu		0x20($inp),@MSG[2]
604	pshufb		$TMP,@MSG[0]
605	movdqu		0x30($inp),@MSG[3]
606
607	movdqa		0*32-0x80($Tbl),$Wi
608	paddd		@MSG[0],$Wi
609	pshufb		$TMP,@MSG[1]
610	movdqa		$CDGH,$CDGH_SAVE	# offload
611	sha256rnds2	$ABEF,$CDGH		# 0-3
612	pshufd		\$0x0e,$Wi,$Wi
613	nop
614	movdqa		$ABEF,$ABEF_SAVE	# offload
615	sha256rnds2	$CDGH,$ABEF
616
617	movdqa		1*32-0x80($Tbl),$Wi
618	paddd		@MSG[1],$Wi
619	pshufb		$TMP,@MSG[2]
620	sha256rnds2	$ABEF,$CDGH		# 4-7
621	pshufd		\$0x0e,$Wi,$Wi
622	lea		0x40($inp),$inp
623	sha256msg1	@MSG[1],@MSG[0]
624	sha256rnds2	$CDGH,$ABEF
625
626	movdqa		2*32-0x80($Tbl),$Wi
627	paddd		@MSG[2],$Wi
628	pshufb		$TMP,@MSG[3]
629	sha256rnds2	$ABEF,$CDGH		# 8-11
630	pshufd		\$0x0e,$Wi,$Wi
631	movdqa		@MSG[3],$TMP
632	palignr		\$4,@MSG[2],$TMP
633	nop
634	paddd		$TMP,@MSG[0]
635	sha256msg1	@MSG[2],@MSG[1]
636	sha256rnds2	$CDGH,$ABEF
637
638	movdqa		3*32-0x80($Tbl),$Wi
639	paddd		@MSG[3],$Wi
640	sha256msg2	@MSG[3],@MSG[0]
641	sha256rnds2	$ABEF,$CDGH		# 12-15
642	pshufd		\$0x0e,$Wi,$Wi
643	movdqa		@MSG[0],$TMP
644	palignr		\$4,@MSG[3],$TMP
645	nop
646	paddd		$TMP,@MSG[1]
647	sha256msg1	@MSG[3],@MSG[2]
648	sha256rnds2	$CDGH,$ABEF
649___
650for($i=4;$i<16-3;$i++) {
651$code.=<<___;
652	movdqa		$i*32-0x80($Tbl),$Wi
653	paddd		@MSG[0],$Wi
654	sha256msg2	@MSG[0],@MSG[1]
655	sha256rnds2	$ABEF,$CDGH		# 16-19...
656	pshufd		\$0x0e,$Wi,$Wi
657	movdqa		@MSG[1],$TMP
658	palignr		\$4,@MSG[0],$TMP
659	nop
660	paddd		$TMP,@MSG[2]
661	sha256msg1	@MSG[0],@MSG[3]
662	sha256rnds2	$CDGH,$ABEF
663___
664	push(@MSG,shift(@MSG));
665}
666$code.=<<___;
667	movdqa		13*32-0x80($Tbl),$Wi
668	paddd		@MSG[0],$Wi
669	sha256msg2	@MSG[0],@MSG[1]
670	sha256rnds2	$ABEF,$CDGH		# 52-55
671	pshufd		\$0x0e,$Wi,$Wi
672	movdqa		@MSG[1],$TMP
673	palignr		\$4,@MSG[0],$TMP
674	sha256rnds2	$CDGH,$ABEF
675	paddd		$TMP,@MSG[2]
676
677	movdqa		14*32-0x80($Tbl),$Wi
678	paddd		@MSG[1],$Wi
679	sha256rnds2	$ABEF,$CDGH		# 56-59
680	pshufd		\$0x0e,$Wi,$Wi
681	sha256msg2	@MSG[1],@MSG[2]
682	movdqa		$BSWAP,$TMP
683	sha256rnds2	$CDGH,$ABEF
684
685	movdqa		15*32-0x80($Tbl),$Wi
686	paddd		@MSG[2],$Wi
687	nop
688	sha256rnds2	$ABEF,$CDGH		# 60-63
689	pshufd		\$0x0e,$Wi,$Wi
690	dec		$num
691	nop
692	sha256rnds2	$CDGH,$ABEF
693
694	paddd		$CDGH_SAVE,$CDGH
695	paddd		$ABEF_SAVE,$ABEF
696	jnz		.Loop_shaext
697
698	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
699	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
700	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
701	punpckhqdq	$CDGH,$ABEF		# DCBA
702	palignr		\$8,$TMP,$CDGH		# HGFE
703
704	movdqu	$ABEF,($ctx)
705	movdqu	$CDGH,16($ctx)
706___
707$code.=<<___ if ($win64);
708	movaps	-8-5*16(%rax),%xmm6
709	movaps	-8-4*16(%rax),%xmm7
710	movaps	-8-3*16(%rax),%xmm8
711	movaps	-8-2*16(%rax),%xmm9
712	movaps	-8-1*16(%rax),%xmm10
713	mov	%rax,%rsp
714.Lepilogue_shaext:
715___
716$code.=<<___;
717	ret
718.cfi_endproc
719.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
720___
721}}}
722{{{
723
724my $a4=$T1;
725my ($a,$b,$c,$d,$e,$f,$g,$h);
726
727sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
728{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
729  my $arg = pop;
730    $arg = "\$$arg" if ($arg*1 eq $arg);
731    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
732}
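# For example, with $a0 being %r13d (SHA-256), &ror($a0,14) appends the
# line "ror $14,%r13d" to $code: a numeric argument is turned into an
# immediate and operands are emitted in reversed (AT&T) order.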
733
734sub body_00_15 () {
735	(
736	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
737
738	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
739	'&mov	($a,$a1)',
740	'&mov	($a4,$f)',
741
742	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
743	'&xor	($a0,$e)',
744	'&xor	($a4,$g)',			# f^g
745
746	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
747	'&xor	($a1,$a)',
748	'&and	($a4,$e)',			# (f^g)&e
749
750	'&xor	($a0,$e)',
751	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
752	'&mov	($a2,$a)',
753
754	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
755	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
756	'&xor	($a2,$b)',			# a^b, b^c in next round
757
758	'&add	($h,$a4)',			# h+=Ch(e,f,g)
759	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
760	'&and	($a3,$a2)',			# (b^c)&(a^b)
761
762	'&xor	($a1,$a)',
763	'&add	($h,$a0)',			# h+=Sigma1(e)
764	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
765
766	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
767	'&add	($d,$h)',			# d+=h
768	'&add	($h,$a3)',			# h+=Maj(a,b,c)
769
770	'&mov	($a0,$d)',
771	'&add	($a1,$h);'.			# h+=Sigma0(a)
772	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
773	);
774}
775
776######################################################################
777# SSSE3 code path
778#
779if ($SZ==4) {	# SHA256 only
780my @X = map("%xmm$_",(0..3));
781my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
782
783$code.=<<___;
784.type	${func}_ssse3,\@function,3
785.align	64
786${func}_ssse3:
787.cfi_startproc
788.Lssse3_shortcut:
789	mov	%rsp,%rax		# copy %rsp
790.cfi_def_cfa_register	%rax
791	push	%rbx
792.cfi_push	%rbx
793	push	%rbp
794.cfi_push	%rbp
795	push	%r12
796.cfi_push	%r12
797	push	%r13
798.cfi_push	%r13
799	push	%r14
800.cfi_push	%r14
801	push	%r15
802.cfi_push	%r15
803	shl	\$4,%rdx		# num*16
804	sub	\$`$framesz+$win64*16*4`,%rsp
805	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
806	and	\$-64,%rsp		# align stack frame
807	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
809	mov	%rdx,$_end		# save end pointer, "3rd" arg
810	mov	%rax,$_rsp		# save copy of %rsp
811.cfi_cfa_expression	$_rsp,deref,+8
812___
813$code.=<<___ if ($win64);
814	movaps	%xmm6,16*$SZ+32(%rsp)
815	movaps	%xmm7,16*$SZ+48(%rsp)
816	movaps	%xmm8,16*$SZ+64(%rsp)
817	movaps	%xmm9,16*$SZ+80(%rsp)
818___
819$code.=<<___;
820.Lprologue_ssse3:
821
822	mov	$SZ*0($ctx),$A
823	mov	$SZ*1($ctx),$B
824	mov	$SZ*2($ctx),$C
825	mov	$SZ*3($ctx),$D
826	mov	$SZ*4($ctx),$E
827	mov	$SZ*5($ctx),$F
828	mov	$SZ*6($ctx),$G
829	mov	$SZ*7($ctx),$H
830___
831
832$code.=<<___;
833	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
834	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
835	jmp	.Lloop_ssse3
836.align	16
837.Lloop_ssse3:
838	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
839	movdqu	0x00($inp),@X[0]
840	movdqu	0x10($inp),@X[1]
841	movdqu	0x20($inp),@X[2]
842	pshufb	$t3,@X[0]
843	movdqu	0x30($inp),@X[3]
844	lea	$TABLE(%rip),$Tbl
845	pshufb	$t3,@X[1]
846	movdqa	0x00($Tbl),$t0
847	movdqa	0x20($Tbl),$t1
848	pshufb	$t3,@X[2]
849	paddd	@X[0],$t0
850	movdqa	0x40($Tbl),$t2
851	pshufb	$t3,@X[3]
852	movdqa	0x60($Tbl),$t3
853	paddd	@X[1],$t1
854	paddd	@X[2],$t2
855	paddd	@X[3],$t3
856	movdqa	$t0,0x00(%rsp)
857	mov	$A,$a1
858	movdqa	$t1,0x10(%rsp)
859	mov	$B,$a3
860	movdqa	$t2,0x20(%rsp)
861	xor	$C,$a3			# magic
862	movdqa	$t3,0x30(%rsp)
863	mov	$E,$a0
864	jmp	.Lssse3_00_47
865
866.align	16
867.Lssse3_00_47:
868	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
869___
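
# SSSE3 has no vector rotate, so sigma0() of four consecutive message
# words X[1..4] is assembled below from psrld/pslld/pxor, while
# sigma1() can only be applied two words at a time: first to X[14..15]
# (extracted with pshufd), then to the freshly computed X[16..17] --
# hence the two shift/xor passes per 16-byte update.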
870sub Xupdate_256_SSSE3 () {
871	(
872	'&movdqa	($t0,@X[1]);',
873	'&movdqa	($t3,@X[3])',
874	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
875	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
876	'&movdqa	($t1,$t0)',
877	'&movdqa	($t2,$t0);',
878	'&psrld		($t0,$sigma0[2])',
879	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
880	'&psrld		($t2,$sigma0[0])',
881	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
882	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
883	'&pxor		($t0,$t2)',
884	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
885	'&pxor		($t0,$t1)',
886	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
887	'&pxor		($t0,$t2);',
888	 '&movdqa	($t2,$t3)',
889	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
890	 '&psrld	($t3,$sigma1[2])',
891	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
892	 '&psrlq	($t2,$sigma1[0])',
893	 '&pxor		($t3,$t2);',
894	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
895	 '&pxor		($t3,$t2)',
896	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
897	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
898	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
899	 '&movdqa	($t2,$t3);',
900	 '&psrld	($t3,$sigma1[2])',
901	 '&psrlq	($t2,$sigma1[0])',
902	 '&pxor		($t3,$t2);',
903	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
904	 '&pxor		($t3,$t2);',
905	'&movdqa	($t2,16*2*$j."($Tbl)")',
906	 '&pshufb	($t3,$t5)',
907	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
908	);
909}
910
911sub SSSE3_256_00_47 () {
912my $j = shift;
913my $body = shift;
914my @X = @_;
915my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
916
917    if (0) {
918	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
919	    eval;
920	    eval(shift(@insns));
921	    eval(shift(@insns));
922	    eval(shift(@insns));
923	}
924    } else {			# squeeze extra 4% on Westmere and 19% on Atom
925	  eval(shift(@insns));	#@
926	&movdqa		($t0,@X[1]);
927	  eval(shift(@insns));
928	  eval(shift(@insns));
929	&movdqa		($t3,@X[3]);
930	  eval(shift(@insns));	#@
931	  eval(shift(@insns));
932	  eval(shift(@insns));
933	  eval(shift(@insns));	#@
934	  eval(shift(@insns));
935	&palignr	($t0,@X[0],$SZ);	# X[1..4]
936	  eval(shift(@insns));
937	  eval(shift(@insns));
938	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
939	  eval(shift(@insns));
940	  eval(shift(@insns));
941	  eval(shift(@insns));
942	  eval(shift(@insns));	#@
943	&movdqa		($t1,$t0);
944	  eval(shift(@insns));
945	  eval(shift(@insns));
946	&movdqa		($t2,$t0);
947	  eval(shift(@insns));	#@
948	  eval(shift(@insns));
949	&psrld		($t0,$sigma0[2]);
950	  eval(shift(@insns));
951	  eval(shift(@insns));
952	  eval(shift(@insns));
953	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
954	  eval(shift(@insns));	#@
955	  eval(shift(@insns));
956	&psrld		($t2,$sigma0[0]);
957	  eval(shift(@insns));
958	  eval(shift(@insns));
959	 &pshufd	($t3,@X[3],0b11111010);	# X[4..15]
960	  eval(shift(@insns));
961	  eval(shift(@insns));	#@
962	&pslld		($t1,8*$SZ-$sigma0[1]);
963	  eval(shift(@insns));
964	  eval(shift(@insns));
965	&pxor		($t0,$t2);
966	  eval(shift(@insns));	#@
967	  eval(shift(@insns));
968	  eval(shift(@insns));
969	  eval(shift(@insns));	#@
970	&psrld		($t2,$sigma0[1]-$sigma0[0]);
971	  eval(shift(@insns));
972	&pxor		($t0,$t1);
973	  eval(shift(@insns));
974	  eval(shift(@insns));
975	&pslld		($t1,$sigma0[1]-$sigma0[0]);
976	  eval(shift(@insns));
977	  eval(shift(@insns));
978	&pxor		($t0,$t2);
979	  eval(shift(@insns));
980	  eval(shift(@insns));	#@
981	 &movdqa	($t2,$t3);
982	  eval(shift(@insns));
983	  eval(shift(@insns));
984	&pxor		($t0,$t1);		# sigma0(X[1..4])
985	  eval(shift(@insns));	#@
986	  eval(shift(@insns));
987	  eval(shift(@insns));
988	 &psrld		($t3,$sigma1[2]);
989	  eval(shift(@insns));
990	  eval(shift(@insns));
991	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
992	  eval(shift(@insns));	#@
993	  eval(shift(@insns));
994	 &psrlq		($t2,$sigma1[0]);
995	  eval(shift(@insns));
996	  eval(shift(@insns));
997	  eval(shift(@insns));
998	 &pxor		($t3,$t2);
999	  eval(shift(@insns));	#@
1000	  eval(shift(@insns));
1001	  eval(shift(@insns));
1002	  eval(shift(@insns));	#@
1003	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));
1006	 &pxor		($t3,$t2);
1007	  eval(shift(@insns));	#@
1008	  eval(shift(@insns));
1009	  eval(shift(@insns));
1010	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
1011	 &pshufd	($t3,$t3,0b10000000);
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));
1014	  eval(shift(@insns));
1015	 &psrldq	($t3,8);
1016	  eval(shift(@insns));
1017	  eval(shift(@insns));	#@
1018	  eval(shift(@insns));
1019	  eval(shift(@insns));
1020	  eval(shift(@insns));	#@
1021	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1022	  eval(shift(@insns));
1023	  eval(shift(@insns));
1024	  eval(shift(@insns));
1025	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1026	  eval(shift(@insns));
1027	  eval(shift(@insns));	#@
1028	  eval(shift(@insns));
1029	 &movdqa	($t2,$t3);
1030	  eval(shift(@insns));
1031	  eval(shift(@insns));
1032	 &psrld		($t3,$sigma1[2]);
1033	  eval(shift(@insns));
1034	  eval(shift(@insns));	#@
1035	 &psrlq		($t2,$sigma1[0]);
1036	  eval(shift(@insns));
1037	  eval(shift(@insns));
1038	 &pxor		($t3,$t2);
1039	  eval(shift(@insns));	#@
1040	  eval(shift(@insns));
1041	  eval(shift(@insns));
1042	  eval(shift(@insns));	#@
1043	  eval(shift(@insns));
1044	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1045	  eval(shift(@insns));
1046	  eval(shift(@insns));
1047	  eval(shift(@insns));
1048	 &pxor		($t3,$t2);
1049	  eval(shift(@insns));
1050	  eval(shift(@insns));
1051	  eval(shift(@insns));	#@
1052	 #&pshufb	($t3,$t5);
1053	 &pshufd	($t3,$t3,0b00001000);
1054	  eval(shift(@insns));
1055	  eval(shift(@insns));
1056	&movdqa		($t2,16*2*$j."($Tbl)");
1057	  eval(shift(@insns));	#@
1058	  eval(shift(@insns));
1059	 &pslldq	($t3,8);
1060	  eval(shift(@insns));
1061	  eval(shift(@insns));
1062	  eval(shift(@insns));
1063	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1064	  eval(shift(@insns));	#@
1065	  eval(shift(@insns));
1066	  eval(shift(@insns));
1067    }
1068	&paddd		($t2,@X[0]);
1069	  foreach (@insns) { eval; }		# remaining instructions
1070	&movdqa		(16*$j."(%rsp)",$t2);
1071}
1072
1073    for ($i=0,$j=0; $j<4; $j++) {
1074	&SSSE3_256_00_47($j,\&body_00_15,@X);
1075	push(@X,shift(@X));			# rotate(@X)
1076    }
1077	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1078	&jne	(".Lssse3_00_47");
1079
1080    for ($i=0; $i<16; ) {
1081	foreach(body_00_15()) { eval; }
1082    }
1083$code.=<<___;
1084	mov	$_ctx,$ctx
1085	mov	$a1,$A
1086
1087	add	$SZ*0($ctx),$A
1088	lea	16*$SZ($inp),$inp
1089	add	$SZ*1($ctx),$B
1090	add	$SZ*2($ctx),$C
1091	add	$SZ*3($ctx),$D
1092	add	$SZ*4($ctx),$E
1093	add	$SZ*5($ctx),$F
1094	add	$SZ*6($ctx),$G
1095	add	$SZ*7($ctx),$H
1096
1097	cmp	$_end,$inp
1098
1099	mov	$A,$SZ*0($ctx)
1100	mov	$B,$SZ*1($ctx)
1101	mov	$C,$SZ*2($ctx)
1102	mov	$D,$SZ*3($ctx)
1103	mov	$E,$SZ*4($ctx)
1104	mov	$F,$SZ*5($ctx)
1105	mov	$G,$SZ*6($ctx)
1106	mov	$H,$SZ*7($ctx)
1107	jb	.Lloop_ssse3
1108
1109	mov	$_rsp,%rsi
1110.cfi_def_cfa	%rsi,8
1111___
1112$code.=<<___ if ($win64);
1113	movaps	16*$SZ+32(%rsp),%xmm6
1114	movaps	16*$SZ+48(%rsp),%xmm7
1115	movaps	16*$SZ+64(%rsp),%xmm8
1116	movaps	16*$SZ+80(%rsp),%xmm9
1117___
1118$code.=<<___;
1119	mov	-48(%rsi),%r15
1120.cfi_restore	%r15
1121	mov	-40(%rsi),%r14
1122.cfi_restore	%r14
1123	mov	-32(%rsi),%r13
1124.cfi_restore	%r13
1125	mov	-24(%rsi),%r12
1126.cfi_restore	%r12
1127	mov	-16(%rsi),%rbp
1128.cfi_restore	%rbp
1129	mov	-8(%rsi),%rbx
1130.cfi_restore	%rbx
1131	lea	(%rsi),%rsp
1132.cfi_def_cfa_register	%rsp
1133.Lepilogue_ssse3:
1134	ret
1135.cfi_endproc
1136.size	${func}_ssse3,.-${func}_ssse3
1137___
1138}
1139
1140if ($avx) {{
1141######################################################################
1142# XOP code path
1143#
1144if ($SZ==8) {	# SHA512 only
1145$code.=<<___;
1146.type	${func}_xop,\@function,3
1147.align	64
1148${func}_xop:
1149.cfi_startproc
1150.Lxop_shortcut:
1151	mov	%rsp,%rax		# copy %rsp
1152.cfi_def_cfa_register	%rax
1153	push	%rbx
1154.cfi_push	%rbx
1155	push	%rbp
1156.cfi_push	%rbp
1157	push	%r12
1158.cfi_push	%r12
1159	push	%r13
1160.cfi_push	%r13
1161	push	%r14
1162.cfi_push	%r14
1163	push	%r15
1164.cfi_push	%r15
1165	shl	\$4,%rdx		# num*16
1166	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1167	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1168	and	\$-64,%rsp		# align stack frame
1169	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1171	mov	%rdx,$_end		# save end pointer, "3rd" arg
1172	mov	%rax,$_rsp		# save copy of %rsp
1173.cfi_cfa_expression	$_rsp,deref,+8
1174___
1175$code.=<<___ if ($win64);
1176	movaps	%xmm6,16*$SZ+32(%rsp)
1177	movaps	%xmm7,16*$SZ+48(%rsp)
1178	movaps	%xmm8,16*$SZ+64(%rsp)
1179	movaps	%xmm9,16*$SZ+80(%rsp)
1180___
1181$code.=<<___ if ($win64 && $SZ>4);
1182	movaps	%xmm10,16*$SZ+96(%rsp)
1183	movaps	%xmm11,16*$SZ+112(%rsp)
1184___
1185$code.=<<___;
1186.Lprologue_xop:
1187
1188	vzeroupper
1189	mov	$SZ*0($ctx),$A
1190	mov	$SZ*1($ctx),$B
1191	mov	$SZ*2($ctx),$C
1192	mov	$SZ*3($ctx),$D
1193	mov	$SZ*4($ctx),$E
1194	mov	$SZ*5($ctx),$F
1195	mov	$SZ*6($ctx),$G
1196	mov	$SZ*7($ctx),$H
1197	jmp	.Lloop_xop
1198___
1199					if ($SZ==4) {	# SHA256
1200    my @X = map("%xmm$_",(0..3));
1201    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1202
1203$code.=<<___;
1204.align	16
1205.Lloop_xop:
1206	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1207	vmovdqu	0x00($inp),@X[0]
1208	vmovdqu	0x10($inp),@X[1]
1209	vmovdqu	0x20($inp),@X[2]
1210	vmovdqu	0x30($inp),@X[3]
1211	vpshufb	$t3,@X[0],@X[0]
1212	lea	$TABLE(%rip),$Tbl
1213	vpshufb	$t3,@X[1],@X[1]
1214	vpshufb	$t3,@X[2],@X[2]
1215	vpaddd	0x00($Tbl),@X[0],$t0
1216	vpshufb	$t3,@X[3],@X[3]
1217	vpaddd	0x20($Tbl),@X[1],$t1
1218	vpaddd	0x40($Tbl),@X[2],$t2
1219	vpaddd	0x60($Tbl),@X[3],$t3
1220	vmovdqa	$t0,0x00(%rsp)
1221	mov	$A,$a1
1222	vmovdqa	$t1,0x10(%rsp)
1223	mov	$B,$a3
1224	vmovdqa	$t2,0x20(%rsp)
1225	xor	$C,$a3			# magic
1226	vmovdqa	$t3,0x30(%rsp)
1227	mov	$E,$a0
1228	jmp	.Lxop_00_47
1229
1230.align	16
1231.Lxop_00_47:
1232	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1233___
1234sub XOP_256_00_47 () {
1235my $j = shift;
1236my $body = shift;
1237my @X = @_;
1238my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1239
1240	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1241	  eval(shift(@insns));
1242	  eval(shift(@insns));
1243	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1244	  eval(shift(@insns));
1245	  eval(shift(@insns));
1246	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1247	  eval(shift(@insns));
1248	  eval(shift(@insns));
1249	&vpsrld		($t0,$t0,$sigma0[2]);
1250	  eval(shift(@insns));
1251	  eval(shift(@insns));
1252	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1253	  eval(shift(@insns));
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	  eval(shift(@insns));
1257	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1258	  eval(shift(@insns));
1259	  eval(shift(@insns));
1260	&vpxor		($t0,$t0,$t1);
1261	  eval(shift(@insns));
1262	  eval(shift(@insns));
1263	  eval(shift(@insns));
1264	  eval(shift(@insns));
1265	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1266	  eval(shift(@insns));
1267	  eval(shift(@insns));
1268	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1269	  eval(shift(@insns));
1270	  eval(shift(@insns));
1271	 &vpsrld	($t2,@X[3],$sigma1[2]);
1272	  eval(shift(@insns));
1273	  eval(shift(@insns));
1274	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1275	  eval(shift(@insns));
1276	  eval(shift(@insns));
1277	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1278	  eval(shift(@insns));
1279	  eval(shift(@insns));
1280	 &vpxor		($t3,$t3,$t2);
1281	  eval(shift(@insns));
1282	  eval(shift(@insns));
1283	  eval(shift(@insns));
1284	  eval(shift(@insns));
1285	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1286	  eval(shift(@insns));
1287	  eval(shift(@insns));
1288	  eval(shift(@insns));
1289	  eval(shift(@insns));
1290	&vpsrldq	($t3,$t3,8);
1291	  eval(shift(@insns));
1292	  eval(shift(@insns));
1293	  eval(shift(@insns));
1294	  eval(shift(@insns));
1295	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1296	  eval(shift(@insns));
1297	  eval(shift(@insns));
1298	  eval(shift(@insns));
1299	  eval(shift(@insns));
1300	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1301	  eval(shift(@insns));
1302	  eval(shift(@insns));
1303	 &vpsrld	($t2,@X[0],$sigma1[2]);
1304	  eval(shift(@insns));
1305	  eval(shift(@insns));
1306	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1307	  eval(shift(@insns));
1308	  eval(shift(@insns));
1309	 &vpxor		($t3,$t3,$t2);
1310	  eval(shift(@insns));
1311	  eval(shift(@insns));
1312	  eval(shift(@insns));
1313	  eval(shift(@insns));
1314	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1315	  eval(shift(@insns));
1316	  eval(shift(@insns));
1317	  eval(shift(@insns));
1318	  eval(shift(@insns));
1319	&vpslldq	($t3,$t3,8);		# 22 instructions
1320	  eval(shift(@insns));
1321	  eval(shift(@insns));
1322	  eval(shift(@insns));
1323	  eval(shift(@insns));
1324	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1325	  eval(shift(@insns));
1326	  eval(shift(@insns));
1327	  eval(shift(@insns));
1328	  eval(shift(@insns));
1329	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1330	  foreach (@insns) { eval; }		# remaining instructions
1331	&vmovdqa	(16*$j."(%rsp)",$t2);
1332}
1333
1334    for ($i=0,$j=0; $j<4; $j++) {
1335	&XOP_256_00_47($j,\&body_00_15,@X);
1336	push(@X,shift(@X));			# rotate(@X)
1337    }
1338	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1339	&jne	(".Lxop_00_47");
1340
1341    for ($i=0; $i<16; ) {
1342	foreach(body_00_15()) { eval; }
1343    }
1344
1345					} else {	# SHA512
1346    my @X = map("%xmm$_",(0..7));
1347    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1348
1349$code.=<<___;
1350.align	16
1351.Lloop_xop:
1352	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1353	vmovdqu	0x00($inp),@X[0]
1354	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1355	vmovdqu	0x10($inp),@X[1]
1356	vmovdqu	0x20($inp),@X[2]
1357	vpshufb	$t3,@X[0],@X[0]
1358	vmovdqu	0x30($inp),@X[3]
1359	vpshufb	$t3,@X[1],@X[1]
1360	vmovdqu	0x40($inp),@X[4]
1361	vpshufb	$t3,@X[2],@X[2]
1362	vmovdqu	0x50($inp),@X[5]
1363	vpshufb	$t3,@X[3],@X[3]
1364	vmovdqu	0x60($inp),@X[6]
1365	vpshufb	$t3,@X[4],@X[4]
1366	vmovdqu	0x70($inp),@X[7]
1367	vpshufb	$t3,@X[5],@X[5]
1368	vpaddq	-0x80($Tbl),@X[0],$t0
1369	vpshufb	$t3,@X[6],@X[6]
1370	vpaddq	-0x60($Tbl),@X[1],$t1
1371	vpshufb	$t3,@X[7],@X[7]
1372	vpaddq	-0x40($Tbl),@X[2],$t2
1373	vpaddq	-0x20($Tbl),@X[3],$t3
1374	vmovdqa	$t0,0x00(%rsp)
1375	vpaddq	0x00($Tbl),@X[4],$t0
1376	vmovdqa	$t1,0x10(%rsp)
1377	vpaddq	0x20($Tbl),@X[5],$t1
1378	vmovdqa	$t2,0x20(%rsp)
1379	vpaddq	0x40($Tbl),@X[6],$t2
1380	vmovdqa	$t3,0x30(%rsp)
1381	vpaddq	0x60($Tbl),@X[7],$t3
1382	vmovdqa	$t0,0x40(%rsp)
1383	mov	$A,$a1
1384	vmovdqa	$t1,0x50(%rsp)
1385	mov	$B,$a3
1386	vmovdqa	$t2,0x60(%rsp)
1387	xor	$C,$a3			# magic
1388	vmovdqa	$t3,0x70(%rsp)
1389	mov	$E,$a0
1390	jmp	.Lxop_00_47
1391
1392.align	16
1393.Lxop_00_47:
1394	add	\$`16*2*$SZ`,$Tbl
1395___
1396sub XOP_512_00_47 () {
1397my $j = shift;
1398my $body = shift;
1399my @X = @_;
1400my @insns = (&$body,&$body);			# 52 instructions
1401
1402	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1403	  eval(shift(@insns));
1404	  eval(shift(@insns));
1405	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1406	  eval(shift(@insns));
1407	  eval(shift(@insns));
1408	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1409	  eval(shift(@insns));
1410	  eval(shift(@insns));
1411	&vpsrlq		($t0,$t0,$sigma0[2]);
1412	  eval(shift(@insns));
1413	  eval(shift(@insns));
1414	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1415	  eval(shift(@insns));
1416	  eval(shift(@insns));
1417	  eval(shift(@insns));
1418	  eval(shift(@insns));
1419	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1420	  eval(shift(@insns));
1421	  eval(shift(@insns));
1422	&vpxor		($t0,$t0,$t1);
1423	  eval(shift(@insns));
1424	  eval(shift(@insns));
1425	  eval(shift(@insns));
1426	  eval(shift(@insns));
1427	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1428	  eval(shift(@insns));
1429	  eval(shift(@insns));
1430	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1431	  eval(shift(@insns));
1432	  eval(shift(@insns));
1433	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1434	  eval(shift(@insns));
1435	  eval(shift(@insns));
1436	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1437	  eval(shift(@insns));
1438	  eval(shift(@insns));
1439	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1440	  eval(shift(@insns));
1441	  eval(shift(@insns));
1442	 &vpxor		($t3,$t3,$t2);
1443	  eval(shift(@insns));
1444	  eval(shift(@insns));
1445	  eval(shift(@insns));
1446	  eval(shift(@insns));
1447	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1448	  eval(shift(@insns));
1449	  eval(shift(@insns));
1450	  eval(shift(@insns));
1451	  eval(shift(@insns));
1452	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1453	  eval(shift(@insns));
1454	  eval(shift(@insns));
1455	  eval(shift(@insns));
1456	  eval(shift(@insns));
1457	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1458	  foreach (@insns) { eval; }		# remaining instructions
1459	&vmovdqa	(16*$j."(%rsp)",$t2);
1460}
1461
1462    for ($i=0,$j=0; $j<8; $j++) {
1463	&XOP_512_00_47($j,\&body_00_15,@X);
1464	push(@X,shift(@X));			# rotate(@X)
1465    }
1466	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1467	&jne	(".Lxop_00_47");
1468
1469    for ($i=0; $i<16; ) {
1470	foreach(body_00_15()) { eval; }
1471    }
1472}
1473$code.=<<___;
1474	mov	$_ctx,$ctx
1475	mov	$a1,$A
1476
1477	add	$SZ*0($ctx),$A
1478	lea	16*$SZ($inp),$inp
1479	add	$SZ*1($ctx),$B
1480	add	$SZ*2($ctx),$C
1481	add	$SZ*3($ctx),$D
1482	add	$SZ*4($ctx),$E
1483	add	$SZ*5($ctx),$F
1484	add	$SZ*6($ctx),$G
1485	add	$SZ*7($ctx),$H
1486
1487	cmp	$_end,$inp
1488
1489	mov	$A,$SZ*0($ctx)
1490	mov	$B,$SZ*1($ctx)
1491	mov	$C,$SZ*2($ctx)
1492	mov	$D,$SZ*3($ctx)
1493	mov	$E,$SZ*4($ctx)
1494	mov	$F,$SZ*5($ctx)
1495	mov	$G,$SZ*6($ctx)
1496	mov	$H,$SZ*7($ctx)
1497	jb	.Lloop_xop
1498
1499	mov	$_rsp,%rsi
1500.cfi_def_cfa	%rsi,8
1501	vzeroupper
1502___
1503$code.=<<___ if ($win64);
1504	movaps	16*$SZ+32(%rsp),%xmm6
1505	movaps	16*$SZ+48(%rsp),%xmm7
1506	movaps	16*$SZ+64(%rsp),%xmm8
1507	movaps	16*$SZ+80(%rsp),%xmm9
1508___
1509$code.=<<___ if ($win64 && $SZ>4);
1510	movaps	16*$SZ+96(%rsp),%xmm10
1511	movaps	16*$SZ+112(%rsp),%xmm11
1512___
1513$code.=<<___;
1514	mov	-48(%rsi),%r15
1515.cfi_restore	%r15
1516	mov	-40(%rsi),%r14
1517.cfi_restore	%r14
1518	mov	-32(%rsi),%r13
1519.cfi_restore	%r13
1520	mov	-24(%rsi),%r12
1521.cfi_restore	%r12
1522	mov	-16(%rsi),%rbp
1523.cfi_restore	%rbp
1524	mov	-8(%rsi),%rbx
1525.cfi_restore	%rbx
1526	lea	(%rsi),%rsp
1527.cfi_def_cfa_register	%rsp
1528.Lepilogue_xop:
1529	ret
1530.cfi_endproc
1531.size	${func}_xop,.-${func}_xop
1532___
1533}
1534######################################################################
1535# AVX+shrd code path
1536#
1537local *ror = sub { &shrd(@_[0],@_) };
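# With identical source and destination, "shrd $n,%reg,%reg" computes
# the same result as "ror $n,%reg"; aliasing &ror to &shrd here is what
# the (**) footnote in the performance table above refers to.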
1538
1539$code.=<<___;
1540.type	${func}_avx,\@function,3
1541.align	64
1542${func}_avx:
1543.cfi_startproc
1544.Lavx_shortcut:
1545	mov	%rsp,%rax		# copy %rsp
1546.cfi_def_cfa_register	%rax
1547	push	%rbx
1548.cfi_push	%rbx
1549	push	%rbp
1550.cfi_push	%rbp
1551	push	%r12
1552.cfi_push	%r12
1553	push	%r13
1554.cfi_push	%r13
1555	push	%r14
1556.cfi_push	%r14
1557	push	%r15
1558.cfi_push	%r15
1559	shl	\$4,%rdx		# num*16
1560	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1561	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1562	and	\$-64,%rsp		# align stack frame
1563	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1565	mov	%rdx,$_end		# save end pointer, "3rd" arg
1566	mov	%rax,$_rsp		# save copy of %rsp
1567.cfi_cfa_expression	$_rsp,deref,+8
1568___
1569$code.=<<___ if ($win64);
1570	movaps	%xmm6,16*$SZ+32(%rsp)
1571	movaps	%xmm7,16*$SZ+48(%rsp)
1572	movaps	%xmm8,16*$SZ+64(%rsp)
1573	movaps	%xmm9,16*$SZ+80(%rsp)
1574___
1575$code.=<<___ if ($win64 && $SZ>4);
1576	movaps	%xmm10,16*$SZ+96(%rsp)
1577	movaps	%xmm11,16*$SZ+112(%rsp)
1578___
1579$code.=<<___;
1580.Lprologue_avx:
1581
1582	vzeroupper
1583	mov	$SZ*0($ctx),$A
1584	mov	$SZ*1($ctx),$B
1585	mov	$SZ*2($ctx),$C
1586	mov	$SZ*3($ctx),$D
1587	mov	$SZ*4($ctx),$E
1588	mov	$SZ*5($ctx),$F
1589	mov	$SZ*6($ctx),$G
1590	mov	$SZ*7($ctx),$H
1591___
1592					if ($SZ==4) {	# SHA256
1593    my @X = map("%xmm$_",(0..3));
1594    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1595
1596$code.=<<___;
1597	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1598	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1599	jmp	.Lloop_avx
1600.align	16
1601.Lloop_avx:
1602	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1603	vmovdqu	0x00($inp),@X[0]
1604	vmovdqu	0x10($inp),@X[1]
1605	vmovdqu	0x20($inp),@X[2]
1606	vmovdqu	0x30($inp),@X[3]
1607	vpshufb	$t3,@X[0],@X[0]
1608	lea	$TABLE(%rip),$Tbl
1609	vpshufb	$t3,@X[1],@X[1]
1610	vpshufb	$t3,@X[2],@X[2]
1611	vpaddd	0x00($Tbl),@X[0],$t0
1612	vpshufb	$t3,@X[3],@X[3]
1613	vpaddd	0x20($Tbl),@X[1],$t1
1614	vpaddd	0x40($Tbl),@X[2],$t2
1615	vpaddd	0x60($Tbl),@X[3],$t3
1616	vmovdqa	$t0,0x00(%rsp)
1617	mov	$A,$a1
1618	vmovdqa	$t1,0x10(%rsp)
1619	mov	$B,$a3
1620	vmovdqa	$t2,0x20(%rsp)
1621	xor	$C,$a3			# magic
1622	vmovdqa	$t3,0x30(%rsp)
1623	mov	$E,$a0
1624	jmp	.Lavx_00_47
1625
1626.align	16
1627.Lavx_00_47:
1628	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1629___
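
# The AVX SHA-256 schedule mirrors the SSSE3 one above -- sigma0() of
# four words at once, sigma1() in two passes -- but in non-destructive
# three-operand form, with the final lane shuffles done by vpshufb
# against the masks preloaded into $t4/$t5.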
1630sub Xupdate_256_AVX () {
1631	(
1632	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1633	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1634	'&vpsrld	($t2,$t0,$sigma0[0]);',
1635	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1636	'&vpsrld	($t3,$t0,$sigma0[2])',
1637	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1638	'&vpxor		($t0,$t3,$t2)',
1639	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1640	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1641	'&vpxor		($t0,$t0,$t1)',
1642	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1643	'&vpxor		($t0,$t0,$t2)',
1644	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1645	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1646	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1647	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1648	 '&vpxor	($t2,$t2,$t3);',
1649	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1650	 '&vpxor	($t2,$t2,$t3)',
1651	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1652	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1653	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1654	 '&vpsrld	($t2,$t3,$sigma1[2])',
1655	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1656	 '&vpxor	($t2,$t2,$t3);',
1657	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1658	 '&vpxor	($t2,$t2,$t3)',
1659	 '&vpshufb	($t2,$t2,$t5)',
1660	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1661	);
1662}
1663
1664sub AVX_256_00_47 () {
1665my $j = shift;
1666my $body = shift;
1667my @X = @_;
1668my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1669
1670	foreach (Xupdate_256_AVX()) {		# 29 instructions
1671	    eval;
1672	    eval(shift(@insns));
1673	    eval(shift(@insns));
1674	    eval(shift(@insns));
1675	}
1676	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1677	  foreach (@insns) { eval; }		# remaining instructions
1678	&vmovdqa	(16*$j."(%rsp)",$t2);
1679}
1680
1681    for ($i=0,$j=0; $j<4; $j++) {
1682	&AVX_256_00_47($j,\&body_00_15,@X);
1683	push(@X,shift(@X));			# rotate(@X)
1684    }
1685	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1686	&jne	(".Lavx_00_47");
1687
1688    for ($i=0; $i<16; ) {
1689	foreach(body_00_15()) { eval; }
1690    }
1691
1692					} else {	# SHA512
1693    my @X = map("%xmm$_",(0..7));
1694    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1695
1696$code.=<<___;
1697	jmp	.Lloop_avx
1698.align	16
1699.Lloop_avx:
1700	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1701	vmovdqu	0x00($inp),@X[0]
1702	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1703	vmovdqu	0x10($inp),@X[1]
1704	vmovdqu	0x20($inp),@X[2]
1705	vpshufb	$t3,@X[0],@X[0]
1706	vmovdqu	0x30($inp),@X[3]
1707	vpshufb	$t3,@X[1],@X[1]
1708	vmovdqu	0x40($inp),@X[4]
1709	vpshufb	$t3,@X[2],@X[2]
1710	vmovdqu	0x50($inp),@X[5]
1711	vpshufb	$t3,@X[3],@X[3]
1712	vmovdqu	0x60($inp),@X[6]
1713	vpshufb	$t3,@X[4],@X[4]
1714	vmovdqu	0x70($inp),@X[7]
1715	vpshufb	$t3,@X[5],@X[5]
1716	vpaddq	-0x80($Tbl),@X[0],$t0
1717	vpshufb	$t3,@X[6],@X[6]
1718	vpaddq	-0x60($Tbl),@X[1],$t1
1719	vpshufb	$t3,@X[7],@X[7]
1720	vpaddq	-0x40($Tbl),@X[2],$t2
1721	vpaddq	-0x20($Tbl),@X[3],$t3
1722	vmovdqa	$t0,0x00(%rsp)
1723	vpaddq	0x00($Tbl),@X[4],$t0
1724	vmovdqa	$t1,0x10(%rsp)
1725	vpaddq	0x20($Tbl),@X[5],$t1
1726	vmovdqa	$t2,0x20(%rsp)
1727	vpaddq	0x40($Tbl),@X[6],$t2
1728	vmovdqa	$t3,0x30(%rsp)
1729	vpaddq	0x60($Tbl),@X[7],$t3
1730	vmovdqa	$t0,0x40(%rsp)
1731	mov	$A,$a1
1732	vmovdqa	$t1,0x50(%rsp)
1733	mov	$B,$a3
1734	vmovdqa	$t2,0x60(%rsp)
1735	xor	$C,$a3			# magic
1736	vmovdqa	$t3,0x70(%rsp)
1737	mov	$E,$a0
1738	jmp	.Lavx_00_47
1739
1740.align	16
1741.Lavx_00_47:
1742	add	\$`16*2*$SZ`,$Tbl
1743___
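
# AVX(1) has no 64-bit vector rotate either, so the SHA-512 schedule
# below handles two 64-bit words per step, building sigma0(X[1..2]) and
# sigma1(X[14..15]) out of vpsrlq/vpsllq/vpxor, following the same
# recurrence as the scalar ROUND_16_XX above.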
1744sub Xupdate_512_AVX () {
1745	(
1746	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1747	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1748	'&vpsrlq	($t2,$t0,$sigma0[0])',
1749	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1750	'&vpsrlq	($t3,$t0,$sigma0[2])',
1751	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1752	 '&vpxor	($t0,$t3,$t2)',
1753	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1754	 '&vpxor	($t0,$t0,$t1)',
1755	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1756	 '&vpxor	($t0,$t0,$t2)',
1757	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1758	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1759	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1760	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1761	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1762	 '&vpxor	($t3,$t3,$t2)',
1763	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1764	 '&vpxor	($t3,$t3,$t1)',
1765	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1766	 '&vpxor	($t3,$t3,$t2)',
1767	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1768	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1769	);
1770}
1771
1772sub AVX_512_00_47 () {
1773my $j = shift;
1774my $body = shift;
1775my @X = @_;
1776my @insns = (&$body,&$body);			# 52 instructions
1777
1778	foreach (Xupdate_512_AVX()) {		# 23 instructions
1779	    eval;
1780	    eval(shift(@insns));
1781	    eval(shift(@insns));
1782	}
1783	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1784	  foreach (@insns) { eval; }		# remaining instructions
1785	&vmovdqa	(16*$j."(%rsp)",$t2);
1786}
1787
1788    for ($i=0,$j=0; $j<8; $j++) {
1789	&AVX_512_00_47($j,\&body_00_15,@X);
1790	push(@X,shift(@X));			# rotate(@X)
1791    }
1792	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1793	&jne	(".Lavx_00_47");
1794
1795    for ($i=0; $i<16; ) {
1796	foreach(body_00_15()) { eval; }
1797    }
1798}
1799$code.=<<___;
1800	mov	$_ctx,$ctx
1801	mov	$a1,$A
1802
1803	add	$SZ*0($ctx),$A
1804	lea	16*$SZ($inp),$inp
1805	add	$SZ*1($ctx),$B
1806	add	$SZ*2($ctx),$C
1807	add	$SZ*3($ctx),$D
1808	add	$SZ*4($ctx),$E
1809	add	$SZ*5($ctx),$F
1810	add	$SZ*6($ctx),$G
1811	add	$SZ*7($ctx),$H
1812
1813	cmp	$_end,$inp
1814
1815	mov	$A,$SZ*0($ctx)
1816	mov	$B,$SZ*1($ctx)
1817	mov	$C,$SZ*2($ctx)
1818	mov	$D,$SZ*3($ctx)
1819	mov	$E,$SZ*4($ctx)
1820	mov	$F,$SZ*5($ctx)
1821	mov	$G,$SZ*6($ctx)
1822	mov	$H,$SZ*7($ctx)
1823	jb	.Lloop_avx
1824
1825	mov	$_rsp,%rsi
1826.cfi_def_cfa	%rsi,8
1827	vzeroupper
1828___
1829$code.=<<___ if ($win64);
1830	movaps	16*$SZ+32(%rsp),%xmm6
1831	movaps	16*$SZ+48(%rsp),%xmm7
1832	movaps	16*$SZ+64(%rsp),%xmm8
1833	movaps	16*$SZ+80(%rsp),%xmm9
1834___
1835$code.=<<___ if ($win64 && $SZ>4);
1836	movaps	16*$SZ+96(%rsp),%xmm10
1837	movaps	16*$SZ+112(%rsp),%xmm11
1838___
1839$code.=<<___;
1840	mov	-48(%rsi),%r15
1841.cfi_restore	%r15
1842	mov	-40(%rsi),%r14
1843.cfi_restore	%r14
1844	mov	-32(%rsi),%r13
1845.cfi_restore	%r13
1846	mov	-24(%rsi),%r12
1847.cfi_restore	%r12
1848	mov	-16(%rsi),%rbp
1849.cfi_restore	%rbp
1850	mov	-8(%rsi),%rbx
1851.cfi_restore	%rbx
1852	lea	(%rsi),%rsp
1853.cfi_def_cfa_register	%rsp
1854.Lepilogue_avx:
1855	ret
1856.cfi_endproc
1857.size	${func}_avx,.-${func}_avx
1858___
1859
1860if ($avx>1) {{
1861######################################################################
1862# AVX2+BMI code path
1863#
1864my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1865my $PUSH8=8*2*$SZ;
1866use integer;
1867
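# The AVX2 rounds below lean on BMI1/BMI2: rorx computes the Sigma
# rotates into a separate destination without touching flags, and andn
# yields ~e&g directly, so Ch(e,f,g) is accumulated as (e&f)+(~e&g);
# the two terms share no set bits, so the sum equals the xor, which
# lets lea fold the additions.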
1868sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1870	(
1871	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1872
1873	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1874	'&and	($a4,$e)',		# f&e
1875	'&rorx	($a0,$e,$Sigma1[2])',
1876	'&rorx	($a2,$e,$Sigma1[1])',
1877
1878	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1879	'&lea	($h,"($h,$a4)")',
1880	'&andn	($a4,$e,$g)',		# ~e&g
1881	'&xor	($a0,$a2)',
1882
1883	'&rorx	($a1,$e,$Sigma1[0])',
1884	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1885	'&xor	($a0,$a1)',		# Sigma1(e)
1886	'&mov	($a2,$a)',
1887
1888	'&rorx	($a4,$a,$Sigma0[2])',
1889	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1890	'&xor	($a2,$b)',		# a^b, b^c in next round
1891	'&rorx	($a1,$a,$Sigma0[1])',
1892
1893	'&rorx	($a0,$a,$Sigma0[0])',
1894	'&lea	($d,"($d,$h)")',	# d+=h
1895	'&and	($a3,$a2)',		# (b^c)&(a^b)
1896	'&xor	($a1,$a4)',
1897
1898	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1899	'&xor	($a1,$a0)',		# Sigma0(a)
1900	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1901	'&mov	($a4,$e)',		# copy of f in future
1902
1903	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1904	);
	# and at the finish one still has to do $a+=$a1
1906}
1907
1908$code.=<<___;
1909.type	${func}_avx2,\@function,3
1910.align	64
1911${func}_avx2:
1912.cfi_startproc
1913.Lavx2_shortcut:
1914	mov	%rsp,%rax		# copy %rsp
1915.cfi_def_cfa_register	%rax
1916	push	%rbx
1917.cfi_push	%rbx
1918	push	%rbp
1919.cfi_push	%rbp
1920	push	%r12
1921.cfi_push	%r12
1922	push	%r13
1923.cfi_push	%r13
1924	push	%r14
1925.cfi_push	%r14
1926	push	%r15
1927.cfi_push	%r15
1928	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1929	shl	\$4,%rdx		# num*16
1930	and	\$-256*$SZ,%rsp		# align stack frame
1931	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1932	add	\$`2*$SZ*($rounds-8)`,%rsp
1933	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1935	mov	%rdx,$_end		# save end pointer, "3rd" arg
1936	mov	%rax,$_rsp		# save copy of %rsp
1937.cfi_cfa_expression	$_rsp,deref,+8
1938___
1939$code.=<<___ if ($win64);
1940	movaps	%xmm6,16*$SZ+32(%rsp)
1941	movaps	%xmm7,16*$SZ+48(%rsp)
1942	movaps	%xmm8,16*$SZ+64(%rsp)
1943	movaps	%xmm9,16*$SZ+80(%rsp)
1944___
1945$code.=<<___ if ($win64 && $SZ>4);
1946	movaps	%xmm10,16*$SZ+96(%rsp)
1947	movaps	%xmm11,16*$SZ+112(%rsp)
1948___
1949$code.=<<___;
1950.Lprologue_avx2:
1951
1952	vzeroupper
1953	sub	\$-16*$SZ,$inp		# inp++, size optimization
1954	mov	$SZ*0($ctx),$A
1955	mov	$inp,%r12		# borrow $T1
1956	mov	$SZ*1($ctx),$B
1957	cmp	%rdx,$inp		# $_end
1958	mov	$SZ*2($ctx),$C
1959	cmove	%rsp,%r12		# next block or random data
1960	mov	$SZ*3($ctx),$D
1961	mov	$SZ*4($ctx),$E
1962	mov	$SZ*5($ctx),$F
1963	mov	$SZ*6($ctx),$G
1964	mov	$SZ*7($ctx),$H
1965___
1966					if ($SZ==4) {	# SHA256
1967    my @X = map("%ymm$_",(0..3));
1968    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1969
1970$code.=<<___;
1971	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1972	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1973	jmp	.Loop_avx2
1974.align	16
1975.Loop_avx2:
1976	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1977	vmovdqu	-16*$SZ+0($inp),%xmm0
1978	vmovdqu	-16*$SZ+16($inp),%xmm1
1979	vmovdqu	-16*$SZ+32($inp),%xmm2
1980	vmovdqu	-16*$SZ+48($inp),%xmm3
1981	#mov		$inp,$_inp	# offload $inp
1982	vinserti128	\$1,(%r12),@X[0],@X[0]
1983	vinserti128	\$1,16(%r12),@X[1],@X[1]
1984	vpshufb		$t3,@X[0],@X[0]
1985	vinserti128	\$1,32(%r12),@X[2],@X[2]
1986	vpshufb		$t3,@X[1],@X[1]
1987	vinserti128	\$1,48(%r12),@X[3],@X[3]
1988
1989	lea	$TABLE(%rip),$Tbl
1990	vpshufb	$t3,@X[2],@X[2]
1991	vpaddd	0x00($Tbl),@X[0],$t0
1992	vpshufb	$t3,@X[3],@X[3]
1993	vpaddd	0x20($Tbl),@X[1],$t1
1994	vpaddd	0x40($Tbl),@X[2],$t2
1995	vpaddd	0x60($Tbl),@X[3],$t3
1996	vmovdqa	$t0,0x00(%rsp)
1997	xor	$a1,$a1
1998	vmovdqa	$t1,0x20(%rsp)
1999___
2000$code.=<<___ if (!$win64);
2001# temporarily use %rdi as frame pointer
2002	mov	$_rsp,%rdi
2003.cfi_def_cfa	%rdi,8
2004___
2005$code.=<<___;
2006	lea	-$PUSH8(%rsp),%rsp
2007___
2008$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack keeps moving...
# so a second copy of the frame pointer is saved at -8(%rsp),
# which is in the red zone
2012	mov	%rdi,-8(%rsp)
2013.cfi_cfa_expression	%rsp-8,deref,+8
2014___
2015$code.=<<___;
2016	mov	$B,$a3
2017	vmovdqa	$t2,0x00(%rsp)
2018	xor	$C,$a3			# magic
2019	vmovdqa	$t3,0x20(%rsp)
2020	mov	$F,$a4
2021	sub	\$-16*2*$SZ,$Tbl	# size optimization
2022	jmp	.Lavx2_00_47
2023
2024.align	16
2025.Lavx2_00_47:
2026___
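# The X[i]+K[i] slots are addressed at fixed offsets from %rsp; instead of
# advancing a separate pointer, AVX2_256_00_47 lowers %rsp itself by $PUSH8
# bytes on every other call, descending through the area reserved in the
# prologue.  Nothing is overwritten: the complete schedule stays on the stack
# so that .Lower_avx2 can later replay the second block from it.  The moving
# %rsp is also why a secondary frame pointer has to be maintained at -8(%rsp)
# for the unwind info (see the .cfi_cfa_expression dance below).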
2027
2028sub AVX2_256_00_47 () {
2029my $j = shift;
2030my $body = shift;
2031my @X = @_;
2032my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
2033my $base = "+2*$PUSH8(%rsp)";
2034
2035	if (($j%2)==0) {
2036	&lea	("%rsp","-$PUSH8(%rsp)");
2037$code.=<<___ if (!$win64);
2038.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy the secondary frame pointer to -8(%rsp) of the new frame
2040	pushq	$PUSH8-8(%rsp)
2041.cfi_cfa_expression	%rsp,deref,+8
2042	lea	8(%rsp),%rsp
2043.cfi_cfa_expression	%rsp-8,deref,+8
2044___
2045	}
2046
2047	foreach (Xupdate_256_AVX()) {		# 29 instructions
2048	    eval;
2049	    eval(shift(@insns));
2050	    eval(shift(@insns));
2051	    eval(shift(@insns));
2052	}
2053	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
2054	  foreach (@insns) { eval; }		# remaining instructions
2055	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2056}
2057
2058    for ($i=0,$j=0; $j<4; $j++) {
2059	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
2060	push(@X,shift(@X));			# rotate(@X)
2061    }
2062	&lea	($Tbl,16*2*$SZ."($Tbl)");
2063	&cmpb	(($SZ-1)."($Tbl)",0);
2064	&jne	(".Lavx2_00_47");
2065
2066    for ($i=0; $i<16; ) {
2067	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2068	foreach(bodyx_00_15()) { eval; }
2069    }
2070					} else {	# SHA512
2071    my @X = map("%ymm$_",(0..7));
2072    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2073
2074$code.=<<___;
2075	jmp	.Loop_avx2
2076.align	16
2077.Loop_avx2:
2078	vmovdqu	-16*$SZ($inp),%xmm0
2079	vmovdqu	-16*$SZ+16($inp),%xmm1
2080	vmovdqu	-16*$SZ+32($inp),%xmm2
2081	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
2082	vmovdqu	-16*$SZ+48($inp),%xmm3
2083	vmovdqu	-16*$SZ+64($inp),%xmm4
2084	vmovdqu	-16*$SZ+80($inp),%xmm5
2085	vmovdqu	-16*$SZ+96($inp),%xmm6
2086	vmovdqu	-16*$SZ+112($inp),%xmm7
2087	#mov	$inp,$_inp	# offload $inp
2088	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
2089	vinserti128	\$1,(%r12),@X[0],@X[0]
2090	vinserti128	\$1,16(%r12),@X[1],@X[1]
2091	 vpshufb	$t2,@X[0],@X[0]
2092	vinserti128	\$1,32(%r12),@X[2],@X[2]
2093	 vpshufb	$t2,@X[1],@X[1]
2094	vinserti128	\$1,48(%r12),@X[3],@X[3]
2095	 vpshufb	$t2,@X[2],@X[2]
2096	vinserti128	\$1,64(%r12),@X[4],@X[4]
2097	 vpshufb	$t2,@X[3],@X[3]
2098	vinserti128	\$1,80(%r12),@X[5],@X[5]
2099	 vpshufb	$t2,@X[4],@X[4]
2100	vinserti128	\$1,96(%r12),@X[6],@X[6]
2101	 vpshufb	$t2,@X[5],@X[5]
2102	vinserti128	\$1,112(%r12),@X[7],@X[7]
2103
2104	vpaddq	-0x80($Tbl),@X[0],$t0
2105	vpshufb	$t2,@X[6],@X[6]
2106	vpaddq	-0x60($Tbl),@X[1],$t1
2107	vpshufb	$t2,@X[7],@X[7]
2108	vpaddq	-0x40($Tbl),@X[2],$t2
2109	vpaddq	-0x20($Tbl),@X[3],$t3
2110	vmovdqa	$t0,0x00(%rsp)
2111	vpaddq	0x00($Tbl),@X[4],$t0
2112	vmovdqa	$t1,0x20(%rsp)
2113	vpaddq	0x20($Tbl),@X[5],$t1
2114	vmovdqa	$t2,0x40(%rsp)
2115	vpaddq	0x40($Tbl),@X[6],$t2
2116	vmovdqa	$t3,0x60(%rsp)
2117___
2118$code.=<<___ if (!$win64);
2119# temporarily use %rdi as frame pointer
2120	mov	$_rsp,%rdi
2121.cfi_def_cfa	%rdi,8
2122___
2123$code.=<<___;
2124	lea	-$PUSH8(%rsp),%rsp
2125___
2126$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack keeps moving...
# so a second copy of the frame pointer is saved at -8(%rsp),
# which is in the red zone
2130	mov	%rdi,-8(%rsp)
2131.cfi_cfa_expression	%rsp-8,deref,+8
2132___
2133$code.=<<___;
2134	vpaddq	0x60($Tbl),@X[7],$t3
2135	vmovdqa	$t0,0x00(%rsp)
2136	xor	$a1,$a1
2137	vmovdqa	$t1,0x20(%rsp)
2138	mov	$B,$a3
2139	vmovdqa	$t2,0x40(%rsp)
2140	xor	$C,$a3			# magic
2141	vmovdqa	$t3,0x60(%rsp)
2142	mov	$F,$a4
2143	add	\$16*2*$SZ,$Tbl
2144	jmp	.Lavx2_00_47
2145
2146.align	16
2147.Lavx2_00_47:
2148___
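# Same moving-%rsp scheme as the 256-bit variant above; with $PUSH8 twice as
# large, %rsp only needs to drop once every four calls.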
2149
2150sub AVX2_512_00_47 () {
2151my $j = shift;
2152my $body = shift;
2153my @X = @_;
2154my @insns = (&$body,&$body);			# 48 instructions
2155my $base = "+2*$PUSH8(%rsp)";
2156
2157	if (($j%4)==0) {
2158	&lea	("%rsp","-$PUSH8(%rsp)");
2159$code.=<<___ if (!$win64);
2160.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy the secondary frame pointer to -8(%rsp) of the new frame
2162	pushq	$PUSH8-8(%rsp)
2163.cfi_cfa_expression	%rsp,deref,+8
2164	lea	8(%rsp),%rsp
2165.cfi_cfa_expression	%rsp-8,deref,+8
2166___
2167	}
2168
2169	foreach (Xupdate_512_AVX()) {		# 23 instructions
2170	    eval;
2171	    if ($_ !~ /\;$/) {
2172		eval(shift(@insns));
2173		eval(shift(@insns));
2174		eval(shift(@insns));
2175	    }
2176	}
2177	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2178	  foreach (@insns) { eval; }		# remaining instructions
2179	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2180}
2181
2182    for ($i=0,$j=0; $j<8; $j++) {
2183	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2184	push(@X,shift(@X));			# rotate(@X)
2185    }
2186	&lea	($Tbl,16*2*$SZ."($Tbl)");
2187	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2188	&jne	(".Lavx2_00_47");
2189
2190    for ($i=0; $i<16; ) {
2191	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2192	foreach(bodyx_00_15()) { eval; }
2193    }
2194}
2195$code.=<<___;
2196	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2197	add	$a1,$A
2198	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2199	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2200
2201	add	$SZ*0($ctx),$A
2202	add	$SZ*1($ctx),$B
2203	add	$SZ*2($ctx),$C
2204	add	$SZ*3($ctx),$D
2205	add	$SZ*4($ctx),$E
2206	add	$SZ*5($ctx),$F
2207	add	$SZ*6($ctx),$G
2208	add	$SZ*7($ctx),$H
2209
2210	mov	$A,$SZ*0($ctx)
2211	mov	$B,$SZ*1($ctx)
2212	mov	$C,$SZ*2($ctx)
2213	mov	$D,$SZ*3($ctx)
2214	mov	$E,$SZ*4($ctx)
2215	mov	$F,$SZ*5($ctx)
2216	mov	$G,$SZ*6($ctx)
2217	mov	$H,$SZ*7($ctx)
2218
2219	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2220	je	.Ldone_avx2
2221
2222	xor	$a1,$a1
2223	mov	$B,$a3
2224	xor	$C,$a3			# magic
2225	mov	$F,$a4
2226	jmp	.Lower_avx2
2227.align	16
2228.Lower_avx2:
2229___
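# .Lower_avx2 replays the rounds for the second block: the X[i]+K[i] values
# computed into the high 128-bit lanes are still on the stack, so the round
# bodies read them at +16($Tbl) while $Tbl is walked back down the frame,
# $PUSH8 bytes at a time, until it meets %rsp.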
2230    for ($i=0; $i<8; ) {
2231	my $base="+16($Tbl)";
2232	foreach(bodyx_00_15()) { eval; }
2233    }
2234$code.=<<___;
2235	lea	-$PUSH8($Tbl),$Tbl
2236	cmp	%rsp,$Tbl
2237	jae	.Lower_avx2
2238
2239	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2240	add	$a1,$A
2241	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2242	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2243# restore frame pointer to original location at $_rsp
2244.cfi_cfa_expression	$_rsp,deref,+8
2245
2246	add	$SZ*0($ctx),$A
2247	add	$SZ*1($ctx),$B
2248	add	$SZ*2($ctx),$C
2249	add	$SZ*3($ctx),$D
2250	add	$SZ*4($ctx),$E
2251	add	$SZ*5($ctx),$F
2252	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2253	add	$SZ*6($ctx),$G
2254	mov	$inp,%r12
2255	add	$SZ*7($ctx),$H
2256	cmp	$_end,$inp
2257
2258	mov	$A,$SZ*0($ctx)
2259	cmove	%rsp,%r12		# next block or stale data
2260	mov	$B,$SZ*1($ctx)
2261	mov	$C,$SZ*2($ctx)
2262	mov	$D,$SZ*3($ctx)
2263	mov	$E,$SZ*4($ctx)
2264	mov	$F,$SZ*5($ctx)
2265	mov	$G,$SZ*6($ctx)
2266	mov	$H,$SZ*7($ctx)
2267
2268	jbe	.Loop_avx2
2269	lea	(%rsp),$Tbl
2270# temporarily use $Tbl as index to $_rsp
2271# this avoids the need to save a secondary frame pointer at -8(%rsp)
2272.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8
2273
2274.Ldone_avx2:
2275	mov	`16*$SZ+3*8`($Tbl),%rsi
2276.cfi_def_cfa	%rsi,8
2277	vzeroupper
2278___
2279$code.=<<___ if ($win64);
2280	movaps	16*$SZ+32($Tbl),%xmm6
2281	movaps	16*$SZ+48($Tbl),%xmm7
2282	movaps	16*$SZ+64($Tbl),%xmm8
2283	movaps	16*$SZ+80($Tbl),%xmm9
2284___
2285$code.=<<___ if ($win64 && $SZ>4);
2286	movaps	16*$SZ+96($Tbl),%xmm10
2287	movaps	16*$SZ+112($Tbl),%xmm11
2288___
2289$code.=<<___;
2290	mov	-48(%rsi),%r15
2291.cfi_restore	%r15
2292	mov	-40(%rsi),%r14
2293.cfi_restore	%r14
2294	mov	-32(%rsi),%r13
2295.cfi_restore	%r13
2296	mov	-24(%rsi),%r12
2297.cfi_restore	%r12
2298	mov	-16(%rsi),%rbp
2299.cfi_restore	%rbp
2300	mov	-8(%rsi),%rbx
2301.cfi_restore	%rbx
2302	lea	(%rsi),%rsp
2303.cfi_def_cfa_register	%rsp
2304.Lepilogue_avx2:
2305	ret
2306.cfi_endproc
2307.size	${func}_avx2,.-${func}_avx2
2308___
2309}}
2310}}}}}
2311
2312# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2313#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2314if ($win64) {
2315$rec="%rcx";
2316$frame="%rdx";
2317$context="%r8";
2318$disp="%r9";
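# se_handler decides whether context->Rip is in the prologue, the body or
# past the epilogue by comparing it against the two label RVAs stored in
# HandlerData[] (emitted in the .xdata section below).  For a fault inside
# the body it fetches the original %rsp saved at $_rsp, i.e. 16*$SZ+3*8
# above the routine's post-prologue stack pointer, copies the non-volatile
# GPRs pushed in the prologue (and, for the SIMD paths, the xmm6+ save area)
# back into the context record, and hands the rest to RtlVirtualUnwind.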
2319
2320$code.=<<___;
2321.extern	__imp_RtlVirtualUnwind
2322.type	se_handler,\@abi-omnipotent
2323.align	16
2324se_handler:
2325	push	%rsi
2326	push	%rdi
2327	push	%rbx
2328	push	%rbp
2329	push	%r12
2330	push	%r13
2331	push	%r14
2332	push	%r15
2333	pushfq
2334	sub	\$64,%rsp
2335
2336	mov	120($context),%rax	# pull context->Rax
2337	mov	248($context),%rbx	# pull context->Rip
2338
2339	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
2341
2342	mov	0(%r11),%r10d		# HandlerData[0]
2343	lea	(%rsi,%r10),%r10	# prologue label
2344	cmp	%r10,%rbx		# context->Rip<prologue label
2345	jb	.Lin_prologue
2346
2347	mov	152($context),%rax	# pull context->Rsp
2348
2349	mov	4(%r11),%r10d		# HandlerData[1]
2350	lea	(%rsi,%r10),%r10	# epilogue label
2351	cmp	%r10,%rbx		# context->Rip>=epilogue label
2352	jae	.Lin_prologue
2353___
2354$code.=<<___ if ($avx>1);
2355	lea	.Lavx2_shortcut(%rip),%r10
2356	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2357	jb	.Lnot_in_avx2
2358
2359	and	\$-256*$SZ,%rax
2360	add	\$`2*$SZ*($rounds-8)`,%rax
2361.Lnot_in_avx2:
2362___
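# The AVX2 code keeps moving %rsp, but the saved %rsp copy sits at a fixed
# offset from the 256*$SZ-aligned frame, so the handler realigns
# context->Rsp and re-applies the prologue's adjustment before pulling $_rsp.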
2363$code.=<<___;
2364	mov	%rax,%rsi		# put aside Rsp
2365	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2366
2367	mov	-8(%rax),%rbx
2368	mov	-16(%rax),%rbp
2369	mov	-24(%rax),%r12
2370	mov	-32(%rax),%r13
2371	mov	-40(%rax),%r14
2372	mov	-48(%rax),%r15
2373	mov	%rbx,144($context)	# restore context->Rbx
2374	mov	%rbp,160($context)	# restore context->Rbp
2375	mov	%r12,216($context)	# restore context->R12
2376	mov	%r13,224($context)	# restore context->R13
2377	mov	%r14,232($context)	# restore context->R14
2378	mov	%r15,240($context)	# restore context->R15
2379
2380	lea	.Lepilogue(%rip),%r10
2381	cmp	%r10,%rbx
2382	jb	.Lin_prologue		# non-AVX code
2383
	lea	16*$SZ+4*8(%rsi),%rsi	# save area for Xmm6 and up
2385	lea	512($context),%rdi	# &context.Xmm6
2386	mov	\$`$SZ==4?8:12`,%ecx
2387	.long	0xa548f3fc		# cld; rep movsq
2388
2389.Lin_prologue:
2390	mov	8(%rax),%rdi
2391	mov	16(%rax),%rsi
2392	mov	%rax,152($context)	# restore context->Rsp
2393	mov	%rsi,168($context)	# restore context->Rsi
2394	mov	%rdi,176($context)	# restore context->Rdi
2395
2396	mov	40($disp),%rdi		# disp->ContextRecord
2397	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
2399	.long	0xa548f3fc		# cld; rep movsq
2400
2401	mov	$disp,%rsi
2402	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2403	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2404	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2405	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2406	mov	40(%rsi),%r10		# disp->ContextRecord
2407	lea	56(%rsi),%r11		# &disp->HandlerData
2408	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2409	mov	%r10,32(%rsp)		# arg5
2410	mov	%r11,40(%rsp)		# arg6
2411	mov	%r12,48(%rsp)		# arg7
2412	mov	%rcx,56(%rsp)		# arg8, (NULL)
2413	call	*__imp_RtlVirtualUnwind(%rip)
2414
2415	mov	\$1,%eax		# ExceptionContinueSearch
2416	add	\$64,%rsp
2417	popfq
2418	pop	%r15
2419	pop	%r14
2420	pop	%r13
2421	pop	%r12
2422	pop	%rbp
2423	pop	%rbx
2424	pop	%rdi
2425	pop	%rsi
2426	ret
2427.size	se_handler,.-se_handler
2428___
2429
2430$code.=<<___ if ($SZ==4 && $shaext);
2431.type	shaext_handler,\@abi-omnipotent
2432.align	16
2433shaext_handler:
2434	push	%rsi
2435	push	%rdi
2436	push	%rbx
2437	push	%rbp
2438	push	%r12
2439	push	%r13
2440	push	%r14
2441	push	%r15
2442	pushfq
2443	sub	\$64,%rsp
2444
2445	mov	120($context),%rax	# pull context->Rax
2446	mov	248($context),%rbx	# pull context->Rip
2447
2448	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
2450	jb	.Lin_prologue
2451
2452	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
2454	jae	.Lin_prologue
2455
2456	lea	-8-5*16(%rax),%rsi
2457	lea	512($context),%rdi	# &context.Xmm6
2458	mov	\$10,%ecx
2459	.long	0xa548f3fc		# cld; rep movsq
2460
2461	jmp	.Lin_prologue
2462.size	shaext_handler,.-shaext_handler
2463___
2464
2465$code.=<<___;
2466.section	.pdata
2467.align	4
2468	.rva	.LSEH_begin_$func
2469	.rva	.LSEH_end_$func
2470	.rva	.LSEH_info_$func
2471___
2472$code.=<<___ if ($SZ==4 && $shaext);
2473	.rva	.LSEH_begin_${func}_shaext
2474	.rva	.LSEH_end_${func}_shaext
2475	.rva	.LSEH_info_${func}_shaext
2476___
2477$code.=<<___ if ($SZ==4);
2478	.rva	.LSEH_begin_${func}_ssse3
2479	.rva	.LSEH_end_${func}_ssse3
2480	.rva	.LSEH_info_${func}_ssse3
2481___
2482$code.=<<___ if ($avx && $SZ==8);
2483	.rva	.LSEH_begin_${func}_xop
2484	.rva	.LSEH_end_${func}_xop
2485	.rva	.LSEH_info_${func}_xop
2486___
2487$code.=<<___ if ($avx);
2488	.rva	.LSEH_begin_${func}_avx
2489	.rva	.LSEH_end_${func}_avx
2490	.rva	.LSEH_info_${func}_avx
2491___
2492$code.=<<___ if ($avx>1);
2493	.rva	.LSEH_begin_${func}_avx2
2494	.rva	.LSEH_end_${func}_avx2
2495	.rva	.LSEH_info_${func}_avx2
2496___
2497$code.=<<___;
2498.section	.xdata
2499.align	8
2500.LSEH_info_$func:
2501	.byte	9,0,0,0
2502	.rva	se_handler
2503	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2504___
2505$code.=<<___ if ($SZ==4 && $shaext);
2506.LSEH_info_${func}_shaext:
2507	.byte	9,0,0,0
2508	.rva	shaext_handler
2509___
2510$code.=<<___ if ($SZ==4);
2511.LSEH_info_${func}_ssse3:
2512	.byte	9,0,0,0
2513	.rva	se_handler
2514	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2515___
2516$code.=<<___ if ($avx && $SZ==8);
2517.LSEH_info_${func}_xop:
2518	.byte	9,0,0,0
2519	.rva	se_handler
2520	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2521___
2522$code.=<<___ if ($avx);
2523.LSEH_info_${func}_avx:
2524	.byte	9,0,0,0
2525	.rva	se_handler
2526	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2527___
2528$code.=<<___ if ($avx>1);
2529.LSEH_info_${func}_avx2:
2530	.byte	9,0,0,0
2531	.rva	se_handler
2532	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2533___
2534}
2535
2536sub sha256op38 {
2537    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
2551}
2552
2553foreach (split("\n",$code)) {
2554	s/\`([^\`]*)\`/eval $1/geo;
2555
2556	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2557
2558	print $_,"\n";
2559}
2560close STDOUT or die "error closing STDOUT: $!";
2561