xref: /openssl/crypto/sha/asm/sha512-x86_64.pl (revision 7ed6de99)
1#! /usr/bin/env perl
2# Copyright 2005-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the License.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 - >40%. No magical
20# tricks, just straight implementation... I really wonder why gcc
21# [being armed with inline assembler] fails to generate as fast code.
22# The only thing which is cool about this module is that it's very
23# same instruction sequence used for both SHA-256 and SHA-512. In
24# former case the instructions operate on 32-bit operands, while in
25# latter - on 64-bit ones. All I had to do is to get one flavor right,
26# the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to IA-64 implementation, which maintains
33# X[16] in register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35# issue Opteron pipeline and X[16] maintained in memory. So that *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, only *if* it's
40# actually possible to noticeably improve overall ILP, instruction
41# level parallelism, on a given CPU implementation in this case.
42#
43# Special note on Intel EM64T. While Opteron CPU exhibits perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47# sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54# unfortunately -2% SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths, see below for improvement coefficients. SSSE3
60# code path was not attempted for SHA512, because improvement is not
61# estimated to be high enough, noticeably less than 9%, to justify
62# the effort, not on pre-AVX processors. [Obviously with exclusion
63# for VIA Nano, but it has SHA512 instruction that is faster and
64# should be used instead.] For reference, corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with specifics of their architecture [which is topic for
68# separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded to
73# 256-bit %ymm registers, with data from first block to least
74# significant 128-bit halves and data from second to most significant.
75# The data is then processed with same SIMD instruction sequence as
76# for AVX, but with %ymm as operands. Side effect is increased stack
77# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
78# code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
88#
89# AMD K8	14.9	-	    -		    9.57    -
90# P4		17.3	-	    -		    30.8    -
91# Core 2	15.6	13.8(+13%)  -		    9.97    -
92# Westmere	14.8	12.3(+19%)  -		    9.58    -
93# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
95# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
96# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
100# Atom		23.0	18.9(+22%)  -		    14.7    -
101# Silvermont	27.4	20.6(+33%)  -               17.5    -
102# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
103# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
104#
105# (*)	whichever best applicable, including SHAEXT;
106# (**)	switch from ror to shrd stands for fair share of improvement;
107# (***)	execution time is fully determined by remaining integer-only
108#	part, body_00_15; reducing the amount of SIMD instructions
109#	below certain limit makes no difference/sense; to conserve
110#	space SHA256 XOP code path is therefore omitted;
111
112# $output is the last argument if it looks like a file (it has an extension)
113# $flavour is the first argument if it doesn't look like a file
114$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
115$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
116
117$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
118
119$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
120( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
121( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
122die "can't locate x86_64-xlate.pl";
123
124if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
125		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
126	$avx = ($1>=2.19) + ($1>=2.22);
127}
128
129if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
130	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
131	$avx = ($1>=2.09) + ($1>=2.10);
132}
133
134if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
135	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
136	$avx = ($1>=10) + ($1>=11);
137}
138
139if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
140	$avx = ($2>=3.0) + ($2>3.0);
141}
142
143$shaext=1;	### set to zero if compiling for 1.0.1
144$avx=1		if (!$shaext && $avx);
145
146open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
147    or die "can't call $xlate: $!";
148*STDOUT=*OUT;
149
150if ($output =~ /512/) {
151	$func="sha512_block_data_order";
152	$TABLE="K512";
153	$SZ=8;
154	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
155					"%r8", "%r9", "%r10","%r11");
156	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
157	@Sigma0=(28,34,39);
158	@Sigma1=(14,18,41);
159	@sigma0=(1,  8, 7);
160	@sigma1=(19,61, 6);
161	$rounds=80;
162} else {
163	$func="sha256_block_data_order";
164	$TABLE="K256";
165	$SZ=4;
166	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
167					"%r8d","%r9d","%r10d","%r11d");
168	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
169	@Sigma0=( 2,13,22);
170	@Sigma1=( 6,11,25);
171	@sigma0=( 7,18, 3);
172	@sigma1=(17,19,10);
173	$rounds=64;
174}
175
176$ctx="%rdi";	# 1st arg, zapped by $a3
177$inp="%rsi";	# 2nd arg
178$Tbl="%rbp";
179
180$_ctx="16*$SZ+0*8(%rsp)";
181$_inp="16*$SZ+1*8(%rsp)";
182$_end="16*$SZ+2*8(%rsp)";
183$_rsp="`16*$SZ+3*8`(%rsp)";
184$framesz="16*$SZ+4*8";
185
186
187sub ROUND_00_15()
188{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
189  my $STRIDE=$SZ;
190     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
191
192$code.=<<___;
193	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
194	mov	$f,$a2
195
196	xor	$e,$a0
197	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
198	xor	$g,$a2			# f^g
199
200	mov	$T1,`$SZ*($i&0xf)`(%rsp)
201	xor	$a,$a1
202	and	$e,$a2			# (f^g)&e
203
204	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
205	add	$h,$T1			# T1+=h
206	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
207
208	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
209	xor	$e,$a0
210	add	$a2,$T1			# T1+=Ch(e,f,g)
211
212	mov	$a,$a2
213	add	($Tbl),$T1		# T1+=K[round]
214	xor	$a,$a1
215
216	xor	$b,$a2			# a^b, b^c in next round
217	ror	\$$Sigma1[0],$a0	# Sigma1(e)
218	mov	$b,$h
219
220	and	$a2,$a3
221	ror	\$$Sigma0[0],$a1	# Sigma0(a)
222	add	$a0,$T1			# T1+=Sigma1(e)
223
224	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
225	add	$T1,$d			# d+=T1
226	add	$T1,$h			# h+=T1
227
228	lea	$STRIDE($Tbl),$Tbl	# round++
229___
230$code.=<<___ if ($i<15);
231	add	$a1,$h			# h+=Sigma0(a)
232___
233	($a2,$a3) = ($a3,$a2);
234}
235
236sub ROUND_16_XX()
237{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
238
239$code.=<<___;
240	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
241	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
242
243	mov	$a0,$T1
244	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
245	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
246	mov	$a2,$a1
247	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
248
249	xor	$T1,$a0
250	shr	\$$sigma0[2],$T1
251	ror	\$$sigma0[0],$a0
252	xor	$a1,$a2
253	shr	\$$sigma1[2],$a1
254
255	ror	\$$sigma1[0],$a2
256	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
257	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
258	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
259
260	add	`$SZ*($i&0xf)`(%rsp),$T1
261	mov	$e,$a0
262	add	$a2,$T1
263	mov	$a,$a1
264___
265	&ROUND_00_15(@_);
266}
267
268$code=<<___;
269.text
270
271.extern	OPENSSL_ia32cap_P
272.globl	$func
273.type	$func,\@function,3
274.align	16
275$func:
276.cfi_startproc
277___
278$code.=<<___ if ($SZ==4 || $avx);
279	lea	OPENSSL_ia32cap_P(%rip),%r11
280	mov	0(%r11),%r9d
281	mov	4(%r11),%r10d
282	mov	8(%r11),%r11d
283___
284$code.=<<___ if ($SZ==4 && $shaext);
285	test	\$`1<<29`,%r11d		# check for SHA
286	jnz	_shaext_shortcut
287___
288$code.=<<___ if ($avx && $SZ==8);
289	test	\$`1<<11`,%r10d		# check for XOP
290	jnz	.Lxop_shortcut
291___
292$code.=<<___ if ($avx>1);
293	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
294	cmp	\$`1<<8|1<<5|1<<3`,%r11d
295	je	.Lavx2_shortcut
296___
297$code.=<<___ if ($avx);
298	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
299	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
300	or	%r9d,%r10d
301	cmp	\$`1<<28|1<<9|1<<30`,%r10d
302	je	.Lavx_shortcut
303___
304$code.=<<___ if ($SZ==4);
305	test	\$`1<<9`,%r10d
306	jnz	.Lssse3_shortcut
307___
308$code.=<<___;
309	mov	%rsp,%rax		# copy %rsp
310.cfi_def_cfa_register	%rax
311	push	%rbx
312.cfi_push	%rbx
313	push	%rbp
314.cfi_push	%rbp
315	push	%r12
316.cfi_push	%r12
317	push	%r13
318.cfi_push	%r13
319	push	%r14
320.cfi_push	%r14
321	push	%r15
322.cfi_push	%r15
323	shl	\$4,%rdx		# num*16
324	sub	\$$framesz,%rsp
325	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
326	and	\$-64,%rsp		# align stack frame
327	mov	$ctx,$_ctx		# save ctx, 1st arg
328	mov	$inp,$_inp		# save inp, 2nd arh
329	mov	%rdx,$_end		# save end pointer, "3rd" arg
330	mov	%rax,$_rsp		# save copy of %rsp
331.cfi_cfa_expression	$_rsp,deref,+8
332.Lprologue:
333
334	mov	$SZ*0($ctx),$A
335	mov	$SZ*1($ctx),$B
336	mov	$SZ*2($ctx),$C
337	mov	$SZ*3($ctx),$D
338	mov	$SZ*4($ctx),$E
339	mov	$SZ*5($ctx),$F
340	mov	$SZ*6($ctx),$G
341	mov	$SZ*7($ctx),$H
342	jmp	.Lloop
343
344.align	16
345.Lloop:
346	mov	$B,$a3
347	lea	$TABLE(%rip),$Tbl
348	xor	$C,$a3			# magic
349___
350	for($i=0;$i<16;$i++) {
351		$code.="	mov	$SZ*$i($inp),$T1\n";
352		$code.="	mov	@ROT[4],$a0\n";
353		$code.="	mov	@ROT[0],$a1\n";
354		$code.="	bswap	$T1\n";
355		&ROUND_00_15($i,@ROT);
356		unshift(@ROT,pop(@ROT));
357	}
358$code.=<<___;
359	jmp	.Lrounds_16_xx
360.align	16
361.Lrounds_16_xx:
362___
363	for(;$i<32;$i++) {
364		&ROUND_16_XX($i,@ROT);
365		unshift(@ROT,pop(@ROT));
366	}
367
368$code.=<<___;
369	cmpb	\$0,`$SZ-1`($Tbl)
370	jnz	.Lrounds_16_xx
371
372	mov	$_ctx,$ctx
373	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
374	lea	16*$SZ($inp),$inp
375
376	add	$SZ*0($ctx),$A
377	add	$SZ*1($ctx),$B
378	add	$SZ*2($ctx),$C
379	add	$SZ*3($ctx),$D
380	add	$SZ*4($ctx),$E
381	add	$SZ*5($ctx),$F
382	add	$SZ*6($ctx),$G
383	add	$SZ*7($ctx),$H
384
385	cmp	$_end,$inp
386
387	mov	$A,$SZ*0($ctx)
388	mov	$B,$SZ*1($ctx)
389	mov	$C,$SZ*2($ctx)
390	mov	$D,$SZ*3($ctx)
391	mov	$E,$SZ*4($ctx)
392	mov	$F,$SZ*5($ctx)
393	mov	$G,$SZ*6($ctx)
394	mov	$H,$SZ*7($ctx)
395	jb	.Lloop
396
397	mov	$_rsp,%rsi
398.cfi_def_cfa	%rsi,8
399	mov	-48(%rsi),%r15
400.cfi_restore	%r15
401	mov	-40(%rsi),%r14
402.cfi_restore	%r14
403	mov	-32(%rsi),%r13
404.cfi_restore	%r13
405	mov	-24(%rsi),%r12
406.cfi_restore	%r12
407	mov	-16(%rsi),%rbp
408.cfi_restore	%rbp
409	mov	-8(%rsi),%rbx
410.cfi_restore	%rbx
411	lea	(%rsi),%rsp
412.cfi_def_cfa_register	%rsp
413.Lepilogue:
414	ret
415.cfi_endproc
416.size	$func,.-$func
417___
418
419if ($SZ==4) {
420$code.=<<___;
421.section .rodata align=64
422.align	64
423.type	$TABLE,\@object
424$TABLE:
425	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
426	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
427	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
428	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
429	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
430	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
431	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
432	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
433	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
434	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
435	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
436	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
437	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
438	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
439	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
440	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
441	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
442	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
443	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
444	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
445	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
446	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
447	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
448	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
449	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
450	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
451	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
452	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
453	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
454	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
455	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
456	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
457
458	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
459	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
460	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
461	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
462	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
463	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
464	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
465.previous
466___
467} else {
468$code.=<<___;
469.section .rodata align=64
470.align	64
471.type	$TABLE,\@object
472$TABLE:
473	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
474	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
475	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
476	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
477	.quad	0x3956c25bf348b538,0x59f111f1b605d019
478	.quad	0x3956c25bf348b538,0x59f111f1b605d019
479	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
480	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
481	.quad	0xd807aa98a3030242,0x12835b0145706fbe
482	.quad	0xd807aa98a3030242,0x12835b0145706fbe
483	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
484	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
485	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
486	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
487	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
488	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
489	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
490	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
491	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
492	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
493	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
494	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
495	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
496	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
497	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
498	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
499	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
500	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
501	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
502	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
503	.quad	0x06ca6351e003826f,0x142929670a0e6e70
504	.quad	0x06ca6351e003826f,0x142929670a0e6e70
505	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
506	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
507	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
508	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
509	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
510	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
511	.quad	0x81c2c92e47edaee6,0x92722c851482353b
512	.quad	0x81c2c92e47edaee6,0x92722c851482353b
513	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
514	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
515	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
516	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
517	.quad	0xd192e819d6ef5218,0xd69906245565a910
518	.quad	0xd192e819d6ef5218,0xd69906245565a910
519	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
520	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
521	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
522	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
523	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
524	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
525	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
526	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
527	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
528	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
529	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
530	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
531	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
532	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
533	.quad	0x90befffa23631e28,0xa4506cebde82bde9
534	.quad	0x90befffa23631e28,0xa4506cebde82bde9
535	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
536	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
537	.quad	0xca273eceea26619c,0xd186b8c721c0c207
538	.quad	0xca273eceea26619c,0xd186b8c721c0c207
539	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
540	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
541	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
542	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
543	.quad	0x113f9804bef90dae,0x1b710b35131c471b
544	.quad	0x113f9804bef90dae,0x1b710b35131c471b
545	.quad	0x28db77f523047d84,0x32caab7b40c72493
546	.quad	0x28db77f523047d84,0x32caab7b40c72493
547	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
548	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
549	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
550	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
551	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
552	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
553
554	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
555	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
556	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
557.previous
558___
559}
560
561######################################################################
562# SIMD code paths
563#
564if ($SZ==4 && $shaext) {{{
565######################################################################
566# Intel SHA Extensions implementation of SHA256 update function.
567#
568my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
569
570my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
571my @MSG=map("%xmm$_",(3..6));
572
573$code.=<<___;
574.type	sha256_block_data_order_shaext,\@function,3
575.align	64
576sha256_block_data_order_shaext:
577_shaext_shortcut:
578.cfi_startproc
579___
580$code.=<<___ if ($win64);
581	lea	`-8-5*16`(%rsp),%rsp
582	movaps	%xmm6,-8-5*16(%rax)
583	movaps	%xmm7,-8-4*16(%rax)
584	movaps	%xmm8,-8-3*16(%rax)
585	movaps	%xmm9,-8-2*16(%rax)
586	movaps	%xmm10,-8-1*16(%rax)
587.Lprologue_shaext:
588___
589$code.=<<___;
590	lea		K256+0x80(%rip),$Tbl
591	movdqu		($ctx),$ABEF		# DCBA
592	movdqu		16($ctx),$CDGH		# HGFE
593	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
594
595	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
596	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
597	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
598	movdqa		$TMP,$BSWAP		# offload
599	palignr		\$8,$CDGH,$ABEF		# ABEF
600	punpcklqdq	$Wi,$CDGH		# CDGH
601	jmp		.Loop_shaext
602
603.align	16
604.Loop_shaext:
605	movdqu		($inp),@MSG[0]
606	movdqu		0x10($inp),@MSG[1]
607	movdqu		0x20($inp),@MSG[2]
608	pshufb		$TMP,@MSG[0]
609	movdqu		0x30($inp),@MSG[3]
610
611	movdqa		0*32-0x80($Tbl),$Wi
612	paddd		@MSG[0],$Wi
613	pshufb		$TMP,@MSG[1]
614	movdqa		$CDGH,$CDGH_SAVE	# offload
615	sha256rnds2	$ABEF,$CDGH		# 0-3
616	pshufd		\$0x0e,$Wi,$Wi
617	nop
618	movdqa		$ABEF,$ABEF_SAVE	# offload
619	sha256rnds2	$CDGH,$ABEF
620
621	movdqa		1*32-0x80($Tbl),$Wi
622	paddd		@MSG[1],$Wi
623	pshufb		$TMP,@MSG[2]
624	sha256rnds2	$ABEF,$CDGH		# 4-7
625	pshufd		\$0x0e,$Wi,$Wi
626	lea		0x40($inp),$inp
627	sha256msg1	@MSG[1],@MSG[0]
628	sha256rnds2	$CDGH,$ABEF
629
630	movdqa		2*32-0x80($Tbl),$Wi
631	paddd		@MSG[2],$Wi
632	pshufb		$TMP,@MSG[3]
633	sha256rnds2	$ABEF,$CDGH		# 8-11
634	pshufd		\$0x0e,$Wi,$Wi
635	movdqa		@MSG[3],$TMP
636	palignr		\$4,@MSG[2],$TMP
637	nop
638	paddd		$TMP,@MSG[0]
639	sha256msg1	@MSG[2],@MSG[1]
640	sha256rnds2	$CDGH,$ABEF
641
642	movdqa		3*32-0x80($Tbl),$Wi
643	paddd		@MSG[3],$Wi
644	sha256msg2	@MSG[3],@MSG[0]
645	sha256rnds2	$ABEF,$CDGH		# 12-15
646	pshufd		\$0x0e,$Wi,$Wi
647	movdqa		@MSG[0],$TMP
648	palignr		\$4,@MSG[3],$TMP
649	nop
650	paddd		$TMP,@MSG[1]
651	sha256msg1	@MSG[3],@MSG[2]
652	sha256rnds2	$CDGH,$ABEF
653___
654for($i=4;$i<16-3;$i++) {
655$code.=<<___;
656	movdqa		$i*32-0x80($Tbl),$Wi
657	paddd		@MSG[0],$Wi
658	sha256msg2	@MSG[0],@MSG[1]
659	sha256rnds2	$ABEF,$CDGH		# 16-19...
660	pshufd		\$0x0e,$Wi,$Wi
661	movdqa		@MSG[1],$TMP
662	palignr		\$4,@MSG[0],$TMP
663	nop
664	paddd		$TMP,@MSG[2]
665	sha256msg1	@MSG[0],@MSG[3]
666	sha256rnds2	$CDGH,$ABEF
667___
668	push(@MSG,shift(@MSG));
669}
670$code.=<<___;
671	movdqa		13*32-0x80($Tbl),$Wi
672	paddd		@MSG[0],$Wi
673	sha256msg2	@MSG[0],@MSG[1]
674	sha256rnds2	$ABEF,$CDGH		# 52-55
675	pshufd		\$0x0e,$Wi,$Wi
676	movdqa		@MSG[1],$TMP
677	palignr		\$4,@MSG[0],$TMP
678	sha256rnds2	$CDGH,$ABEF
679	paddd		$TMP,@MSG[2]
680
681	movdqa		14*32-0x80($Tbl),$Wi
682	paddd		@MSG[1],$Wi
683	sha256rnds2	$ABEF,$CDGH		# 56-59
684	pshufd		\$0x0e,$Wi,$Wi
685	sha256msg2	@MSG[1],@MSG[2]
686	movdqa		$BSWAP,$TMP
687	sha256rnds2	$CDGH,$ABEF
688
689	movdqa		15*32-0x80($Tbl),$Wi
690	paddd		@MSG[2],$Wi
691	nop
692	sha256rnds2	$ABEF,$CDGH		# 60-63
693	pshufd		\$0x0e,$Wi,$Wi
694	dec		$num
695	nop
696	sha256rnds2	$CDGH,$ABEF
697
698	paddd		$CDGH_SAVE,$CDGH
699	paddd		$ABEF_SAVE,$ABEF
700	jnz		.Loop_shaext
701
702	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
703	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
704	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
705	punpckhqdq	$CDGH,$ABEF		# DCBA
706	palignr		\$8,$TMP,$CDGH		# HGFE
707
708	movdqu	$ABEF,($ctx)
709	movdqu	$CDGH,16($ctx)
710___
711$code.=<<___ if ($win64);
712	movaps	-8-5*16(%rax),%xmm6
713	movaps	-8-4*16(%rax),%xmm7
714	movaps	-8-3*16(%rax),%xmm8
715	movaps	-8-2*16(%rax),%xmm9
716	movaps	-8-1*16(%rax),%xmm10
717	mov	%rax,%rsp
718.Lepilogue_shaext:
719___
720$code.=<<___;
721	ret
722.cfi_endproc
723.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
724___
725}}}
726{{{
727
728my $a4=$T1;
729my ($a,$b,$c,$d,$e,$f,$g,$h);
730
731sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
732{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
733  my $arg = pop;
734    $arg = "\$$arg" if ($arg*1 eq $arg);
735    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
736}
737
738sub body_00_15 () {
739	(
740	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
741
742	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
743	'&mov	($a,$a1)',
744	'&mov	($a4,$f)',
745
746	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
747	'&xor	($a0,$e)',
748	'&xor	($a4,$g)',			# f^g
749
750	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
751	'&xor	($a1,$a)',
752	'&and	($a4,$e)',			# (f^g)&e
753
754	'&xor	($a0,$e)',
755	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
756	'&mov	($a2,$a)',
757
758	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
759	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
760	'&xor	($a2,$b)',			# a^b, b^c in next round
761
762	'&add	($h,$a4)',			# h+=Ch(e,f,g)
763	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
764	'&and	($a3,$a2)',			# (b^c)&(a^b)
765
766	'&xor	($a1,$a)',
767	'&add	($h,$a0)',			# h+=Sigma1(e)
768	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
769
770	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
771	'&add	($d,$h)',			# d+=h
772	'&add	($h,$a3)',			# h+=Maj(a,b,c)
773
774	'&mov	($a0,$d)',
775	'&add	($a1,$h);'.			# h+=Sigma0(a)
776	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
777	);
778}
779
780######################################################################
781# SSSE3 code path
782#
783if ($SZ==4) {	# SHA256 only
784my @X = map("%xmm$_",(0..3));
785my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
786
787$code.=<<___;
788.type	${func}_ssse3,\@function,3
789.align	64
790${func}_ssse3:
791.cfi_startproc
792.Lssse3_shortcut:
793	mov	%rsp,%rax		# copy %rsp
794.cfi_def_cfa_register	%rax
795	push	%rbx
796.cfi_push	%rbx
797	push	%rbp
798.cfi_push	%rbp
799	push	%r12
800.cfi_push	%r12
801	push	%r13
802.cfi_push	%r13
803	push	%r14
804.cfi_push	%r14
805	push	%r15
806.cfi_push	%r15
807	shl	\$4,%rdx		# num*16
808	sub	\$`$framesz+$win64*16*4`,%rsp
809	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
810	and	\$-64,%rsp		# align stack frame
811	mov	$ctx,$_ctx		# save ctx, 1st arg
812	mov	$inp,$_inp		# save inp, 2nd arh
813	mov	%rdx,$_end		# save end pointer, "3rd" arg
814	mov	%rax,$_rsp		# save copy of %rsp
815.cfi_cfa_expression	$_rsp,deref,+8
816___
817$code.=<<___ if ($win64);
818	movaps	%xmm6,16*$SZ+32(%rsp)
819	movaps	%xmm7,16*$SZ+48(%rsp)
820	movaps	%xmm8,16*$SZ+64(%rsp)
821	movaps	%xmm9,16*$SZ+80(%rsp)
822___
823$code.=<<___;
824.Lprologue_ssse3:
825
826	mov	$SZ*0($ctx),$A
827	mov	$SZ*1($ctx),$B
828	mov	$SZ*2($ctx),$C
829	mov	$SZ*3($ctx),$D
830	mov	$SZ*4($ctx),$E
831	mov	$SZ*5($ctx),$F
832	mov	$SZ*6($ctx),$G
833	mov	$SZ*7($ctx),$H
834___
835
836$code.=<<___;
837	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
838	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
839	jmp	.Lloop_ssse3
840.align	16
841.Lloop_ssse3:
842	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
843	movdqu	0x00($inp),@X[0]
844	movdqu	0x10($inp),@X[1]
845	movdqu	0x20($inp),@X[2]
846	pshufb	$t3,@X[0]
847	movdqu	0x30($inp),@X[3]
848	lea	$TABLE(%rip),$Tbl
849	pshufb	$t3,@X[1]
850	movdqa	0x00($Tbl),$t0
851	movdqa	0x20($Tbl),$t1
852	pshufb	$t3,@X[2]
853	paddd	@X[0],$t0
854	movdqa	0x40($Tbl),$t2
855	pshufb	$t3,@X[3]
856	movdqa	0x60($Tbl),$t3
857	paddd	@X[1],$t1
858	paddd	@X[2],$t2
859	paddd	@X[3],$t3
860	movdqa	$t0,0x00(%rsp)
861	mov	$A,$a1
862	movdqa	$t1,0x10(%rsp)
863	mov	$B,$a3
864	movdqa	$t2,0x20(%rsp)
865	xor	$C,$a3			# magic
866	movdqa	$t3,0x30(%rsp)
867	mov	$E,$a0
868	jmp	.Lssse3_00_47
869
870.align	16
871.Lssse3_00_47:
872	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
873___
874sub Xupdate_256_SSSE3 () {
875	(
876	'&movdqa	($t0,@X[1]);',
877	'&movdqa	($t3,@X[3])',
878	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
879	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
880	'&movdqa	($t1,$t0)',
881	'&movdqa	($t2,$t0);',
882	'&psrld		($t0,$sigma0[2])',
883	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
884	'&psrld		($t2,$sigma0[0])',
885	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
886	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
887	'&pxor		($t0,$t2)',
888	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
889	'&pxor		($t0,$t1)',
890	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
891	'&pxor		($t0,$t2);',
892	 '&movdqa	($t2,$t3)',
893	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
894	 '&psrld	($t3,$sigma1[2])',
895	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
896	 '&psrlq	($t2,$sigma1[0])',
897	 '&pxor		($t3,$t2);',
898	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
899	 '&pxor		($t3,$t2)',
900	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
901	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
902	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
903	 '&movdqa	($t2,$t3);',
904	 '&psrld	($t3,$sigma1[2])',
905	 '&psrlq	($t2,$sigma1[0])',
906	 '&pxor		($t3,$t2);',
907	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
908	 '&pxor		($t3,$t2);',
909	'&movdqa	($t2,16*2*$j."($Tbl)")',
910	 '&pshufb	($t3,$t5)',
911	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
912	);
913}
914
915sub SSSE3_256_00_47 () {
916my $j = shift;
917my $body = shift;
918my @X = @_;
919my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
920
921    if (0) {
922	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
923	    eval;
924	    eval(shift(@insns));
925	    eval(shift(@insns));
926	    eval(shift(@insns));
927	}
928    } else {			# squeeze extra 4% on Westmere and 19% on Atom
929	  eval(shift(@insns));	#@
930	&movdqa		($t0,@X[1]);
931	  eval(shift(@insns));
932	  eval(shift(@insns));
933	&movdqa		($t3,@X[3]);
934	  eval(shift(@insns));	#@
935	  eval(shift(@insns));
936	  eval(shift(@insns));
937	  eval(shift(@insns));	#@
938	  eval(shift(@insns));
939	&palignr	($t0,@X[0],$SZ);	# X[1..4]
940	  eval(shift(@insns));
941	  eval(shift(@insns));
942	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
943	  eval(shift(@insns));
944	  eval(shift(@insns));
945	  eval(shift(@insns));
946	  eval(shift(@insns));	#@
947	&movdqa		($t1,$t0);
948	  eval(shift(@insns));
949	  eval(shift(@insns));
950	&movdqa		($t2,$t0);
951	  eval(shift(@insns));	#@
952	  eval(shift(@insns));
953	&psrld		($t0,$sigma0[2]);
954	  eval(shift(@insns));
955	  eval(shift(@insns));
956	  eval(shift(@insns));
957	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
958	  eval(shift(@insns));	#@
959	  eval(shift(@insns));
960	&psrld		($t2,$sigma0[0]);
961	  eval(shift(@insns));
962	  eval(shift(@insns));
963	 &pshufd	($t3,@X[3],0b11111010);	# X[4..15]
964	  eval(shift(@insns));
965	  eval(shift(@insns));	#@
966	&pslld		($t1,8*$SZ-$sigma0[1]);
967	  eval(shift(@insns));
968	  eval(shift(@insns));
969	&pxor		($t0,$t2);
970	  eval(shift(@insns));	#@
971	  eval(shift(@insns));
972	  eval(shift(@insns));
973	  eval(shift(@insns));	#@
974	&psrld		($t2,$sigma0[1]-$sigma0[0]);
975	  eval(shift(@insns));
976	&pxor		($t0,$t1);
977	  eval(shift(@insns));
978	  eval(shift(@insns));
979	&pslld		($t1,$sigma0[1]-$sigma0[0]);
980	  eval(shift(@insns));
981	  eval(shift(@insns));
982	&pxor		($t0,$t2);
983	  eval(shift(@insns));
984	  eval(shift(@insns));	#@
985	 &movdqa	($t2,$t3);
986	  eval(shift(@insns));
987	  eval(shift(@insns));
988	&pxor		($t0,$t1);		# sigma0(X[1..4])
989	  eval(shift(@insns));	#@
990	  eval(shift(@insns));
991	  eval(shift(@insns));
992	 &psrld		($t3,$sigma1[2]);
993	  eval(shift(@insns));
994	  eval(shift(@insns));
995	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
996	  eval(shift(@insns));	#@
997	  eval(shift(@insns));
998	 &psrlq		($t2,$sigma1[0]);
999	  eval(shift(@insns));
1000	  eval(shift(@insns));
1001	  eval(shift(@insns));
1002	 &pxor		($t3,$t2);
1003	  eval(shift(@insns));	#@
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));
1006	  eval(shift(@insns));	#@
1007	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1008	  eval(shift(@insns));
1009	  eval(shift(@insns));
1010	 &pxor		($t3,$t2);
1011	  eval(shift(@insns));	#@
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));
1014	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
1015	 &pshufd	($t3,$t3,0b10000000);
1016	  eval(shift(@insns));
1017	  eval(shift(@insns));
1018	  eval(shift(@insns));
1019	 &psrldq	($t3,8);
1020	  eval(shift(@insns));
1021	  eval(shift(@insns));	#@
1022	  eval(shift(@insns));
1023	  eval(shift(@insns));
1024	  eval(shift(@insns));	#@
1025	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1026	  eval(shift(@insns));
1027	  eval(shift(@insns));
1028	  eval(shift(@insns));
1029	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1030	  eval(shift(@insns));
1031	  eval(shift(@insns));	#@
1032	  eval(shift(@insns));
1033	 &movdqa	($t2,$t3);
1034	  eval(shift(@insns));
1035	  eval(shift(@insns));
1036	 &psrld		($t3,$sigma1[2]);
1037	  eval(shift(@insns));
1038	  eval(shift(@insns));	#@
1039	 &psrlq		($t2,$sigma1[0]);
1040	  eval(shift(@insns));
1041	  eval(shift(@insns));
1042	 &pxor		($t3,$t2);
1043	  eval(shift(@insns));	#@
1044	  eval(shift(@insns));
1045	  eval(shift(@insns));
1046	  eval(shift(@insns));	#@
1047	  eval(shift(@insns));
1048	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1049	  eval(shift(@insns));
1050	  eval(shift(@insns));
1051	  eval(shift(@insns));
1052	 &pxor		($t3,$t2);
1053	  eval(shift(@insns));
1054	  eval(shift(@insns));
1055	  eval(shift(@insns));	#@
1056	 #&pshufb	($t3,$t5);
1057	 &pshufd	($t3,$t3,0b00001000);
1058	  eval(shift(@insns));
1059	  eval(shift(@insns));
1060	&movdqa		($t2,16*2*$j."($Tbl)");
1061	  eval(shift(@insns));	#@
1062	  eval(shift(@insns));
1063	 &pslldq	($t3,8);
1064	  eval(shift(@insns));
1065	  eval(shift(@insns));
1066	  eval(shift(@insns));
1067	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1068	  eval(shift(@insns));	#@
1069	  eval(shift(@insns));
1070	  eval(shift(@insns));
1071    }
1072	&paddd		($t2,@X[0]);
1073	  foreach (@insns) { eval; }		# remaining instructions
1074	&movdqa		(16*$j."(%rsp)",$t2);
1075}
1076
1077    for ($i=0,$j=0; $j<4; $j++) {
1078	&SSSE3_256_00_47($j,\&body_00_15,@X);
1079	push(@X,shift(@X));			# rotate(@X)
1080    }
1081	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1082	&jne	(".Lssse3_00_47");
1083
1084    for ($i=0; $i<16; ) {
1085	foreach(body_00_15()) { eval; }
1086    }
1087$code.=<<___;
1088	mov	$_ctx,$ctx
1089	mov	$a1,$A
1090
1091	add	$SZ*0($ctx),$A
1092	lea	16*$SZ($inp),$inp
1093	add	$SZ*1($ctx),$B
1094	add	$SZ*2($ctx),$C
1095	add	$SZ*3($ctx),$D
1096	add	$SZ*4($ctx),$E
1097	add	$SZ*5($ctx),$F
1098	add	$SZ*6($ctx),$G
1099	add	$SZ*7($ctx),$H
1100
1101	cmp	$_end,$inp
1102
1103	mov	$A,$SZ*0($ctx)
1104	mov	$B,$SZ*1($ctx)
1105	mov	$C,$SZ*2($ctx)
1106	mov	$D,$SZ*3($ctx)
1107	mov	$E,$SZ*4($ctx)
1108	mov	$F,$SZ*5($ctx)
1109	mov	$G,$SZ*6($ctx)
1110	mov	$H,$SZ*7($ctx)
1111	jb	.Lloop_ssse3
1112
1113	mov	$_rsp,%rsi
1114.cfi_def_cfa	%rsi,8
1115___
1116$code.=<<___ if ($win64);
1117	movaps	16*$SZ+32(%rsp),%xmm6
1118	movaps	16*$SZ+48(%rsp),%xmm7
1119	movaps	16*$SZ+64(%rsp),%xmm8
1120	movaps	16*$SZ+80(%rsp),%xmm9
1121___
1122$code.=<<___;
1123	mov	-48(%rsi),%r15
1124.cfi_restore	%r15
1125	mov	-40(%rsi),%r14
1126.cfi_restore	%r14
1127	mov	-32(%rsi),%r13
1128.cfi_restore	%r13
1129	mov	-24(%rsi),%r12
1130.cfi_restore	%r12
1131	mov	-16(%rsi),%rbp
1132.cfi_restore	%rbp
1133	mov	-8(%rsi),%rbx
1134.cfi_restore	%rbx
1135	lea	(%rsi),%rsp
1136.cfi_def_cfa_register	%rsp
1137.Lepilogue_ssse3:
1138	ret
1139.cfi_endproc
1140.size	${func}_ssse3,.-${func}_ssse3
1141___
1142}
1143
1144if ($avx) {{
1145######################################################################
1146# XOP code path
1147#
1148if ($SZ==8) {	# SHA512 only
1149$code.=<<___;
1150.type	${func}_xop,\@function,3
1151.align	64
1152${func}_xop:
1153.cfi_startproc
1154.Lxop_shortcut:
1155	mov	%rsp,%rax		# copy %rsp
1156.cfi_def_cfa_register	%rax
1157	push	%rbx
1158.cfi_push	%rbx
1159	push	%rbp
1160.cfi_push	%rbp
1161	push	%r12
1162.cfi_push	%r12
1163	push	%r13
1164.cfi_push	%r13
1165	push	%r14
1166.cfi_push	%r14
1167	push	%r15
1168.cfi_push	%r15
1169	shl	\$4,%rdx		# num*16
1170	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1171	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1172	and	\$-64,%rsp		# align stack frame
1173	mov	$ctx,$_ctx		# save ctx, 1st arg
1174	mov	$inp,$_inp		# save inp, 2nd arh
1175	mov	%rdx,$_end		# save end pointer, "3rd" arg
1176	mov	%rax,$_rsp		# save copy of %rsp
1177.cfi_cfa_expression	$_rsp,deref,+8
1178___
1179$code.=<<___ if ($win64);
1180	movaps	%xmm6,16*$SZ+32(%rsp)
1181	movaps	%xmm7,16*$SZ+48(%rsp)
1182	movaps	%xmm8,16*$SZ+64(%rsp)
1183	movaps	%xmm9,16*$SZ+80(%rsp)
1184___
1185$code.=<<___ if ($win64 && $SZ>4);
1186	movaps	%xmm10,16*$SZ+96(%rsp)
1187	movaps	%xmm11,16*$SZ+112(%rsp)
1188___
1189$code.=<<___;
1190.Lprologue_xop:
1191
1192	vzeroupper
1193	mov	$SZ*0($ctx),$A
1194	mov	$SZ*1($ctx),$B
1195	mov	$SZ*2($ctx),$C
1196	mov	$SZ*3($ctx),$D
1197	mov	$SZ*4($ctx),$E
1198	mov	$SZ*5($ctx),$F
1199	mov	$SZ*6($ctx),$G
1200	mov	$SZ*7($ctx),$H
1201	jmp	.Lloop_xop
1202___
1203					if ($SZ==4) {	# SHA256
1204    my @X = map("%xmm$_",(0..3));
1205    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1206
1207$code.=<<___;
1208.align	16
1209.Lloop_xop:
1210	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1211	vmovdqu	0x00($inp),@X[0]
1212	vmovdqu	0x10($inp),@X[1]
1213	vmovdqu	0x20($inp),@X[2]
1214	vmovdqu	0x30($inp),@X[3]
1215	vpshufb	$t3,@X[0],@X[0]
1216	lea	$TABLE(%rip),$Tbl
1217	vpshufb	$t3,@X[1],@X[1]
1218	vpshufb	$t3,@X[2],@X[2]
1219	vpaddd	0x00($Tbl),@X[0],$t0
1220	vpshufb	$t3,@X[3],@X[3]
1221	vpaddd	0x20($Tbl),@X[1],$t1
1222	vpaddd	0x40($Tbl),@X[2],$t2
1223	vpaddd	0x60($Tbl),@X[3],$t3
1224	vmovdqa	$t0,0x00(%rsp)
1225	mov	$A,$a1
1226	vmovdqa	$t1,0x10(%rsp)
1227	mov	$B,$a3
1228	vmovdqa	$t2,0x20(%rsp)
1229	xor	$C,$a3			# magic
1230	vmovdqa	$t3,0x30(%rsp)
1231	mov	$E,$a0
1232	jmp	.Lxop_00_47
1233
1234.align	16
1235.Lxop_00_47:
1236	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1237___
1238sub XOP_256_00_47 () {
1239my $j = shift;
1240my $body = shift;
1241my @X = @_;
1242my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1243
1244	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1245	  eval(shift(@insns));
1246	  eval(shift(@insns));
1247	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1248	  eval(shift(@insns));
1249	  eval(shift(@insns));
1250	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1251	  eval(shift(@insns));
1252	  eval(shift(@insns));
1253	&vpsrld		($t0,$t0,$sigma0[2]);
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1257	  eval(shift(@insns));
1258	  eval(shift(@insns));
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1262	  eval(shift(@insns));
1263	  eval(shift(@insns));
1264	&vpxor		($t0,$t0,$t1);
1265	  eval(shift(@insns));
1266	  eval(shift(@insns));
1267	  eval(shift(@insns));
1268	  eval(shift(@insns));
1269	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1270	  eval(shift(@insns));
1271	  eval(shift(@insns));
1272	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1273	  eval(shift(@insns));
1274	  eval(shift(@insns));
1275	 &vpsrld	($t2,@X[3],$sigma1[2]);
1276	  eval(shift(@insns));
1277	  eval(shift(@insns));
1278	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1279	  eval(shift(@insns));
1280	  eval(shift(@insns));
1281	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1282	  eval(shift(@insns));
1283	  eval(shift(@insns));
1284	 &vpxor		($t3,$t3,$t2);
1285	  eval(shift(@insns));
1286	  eval(shift(@insns));
1287	  eval(shift(@insns));
1288	  eval(shift(@insns));
1289	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1290	  eval(shift(@insns));
1291	  eval(shift(@insns));
1292	  eval(shift(@insns));
1293	  eval(shift(@insns));
1294	&vpsrldq	($t3,$t3,8);
1295	  eval(shift(@insns));
1296	  eval(shift(@insns));
1297	  eval(shift(@insns));
1298	  eval(shift(@insns));
1299	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1300	  eval(shift(@insns));
1301	  eval(shift(@insns));
1302	  eval(shift(@insns));
1303	  eval(shift(@insns));
1304	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1305	  eval(shift(@insns));
1306	  eval(shift(@insns));
1307	 &vpsrld	($t2,@X[0],$sigma1[2]);
1308	  eval(shift(@insns));
1309	  eval(shift(@insns));
1310	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1311	  eval(shift(@insns));
1312	  eval(shift(@insns));
1313	 &vpxor		($t3,$t3,$t2);
1314	  eval(shift(@insns));
1315	  eval(shift(@insns));
1316	  eval(shift(@insns));
1317	  eval(shift(@insns));
1318	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1319	  eval(shift(@insns));
1320	  eval(shift(@insns));
1321	  eval(shift(@insns));
1322	  eval(shift(@insns));
1323	&vpslldq	($t3,$t3,8);		# 22 instructions
1324	  eval(shift(@insns));
1325	  eval(shift(@insns));
1326	  eval(shift(@insns));
1327	  eval(shift(@insns));
1328	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1329	  eval(shift(@insns));
1330	  eval(shift(@insns));
1331	  eval(shift(@insns));
1332	  eval(shift(@insns));
1333	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1334	  foreach (@insns) { eval; }		# remaining instructions
1335	&vmovdqa	(16*$j."(%rsp)",$t2);
1336}
1337
1338    for ($i=0,$j=0; $j<4; $j++) {
1339	&XOP_256_00_47($j,\&body_00_15,@X);
1340	push(@X,shift(@X));			# rotate(@X)
1341    }
1342	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1343	&jne	(".Lxop_00_47");
1344
1345    for ($i=0; $i<16; ) {
1346	foreach(body_00_15()) { eval; }
1347    }
1348
1349					} else {	# SHA512
1350    my @X = map("%xmm$_",(0..7));
1351    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1352
1353$code.=<<___;
1354.align	16
1355.Lloop_xop:
1356	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1357	vmovdqu	0x00($inp),@X[0]
1358	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1359	vmovdqu	0x10($inp),@X[1]
1360	vmovdqu	0x20($inp),@X[2]
1361	vpshufb	$t3,@X[0],@X[0]
1362	vmovdqu	0x30($inp),@X[3]
1363	vpshufb	$t3,@X[1],@X[1]
1364	vmovdqu	0x40($inp),@X[4]
1365	vpshufb	$t3,@X[2],@X[2]
1366	vmovdqu	0x50($inp),@X[5]
1367	vpshufb	$t3,@X[3],@X[3]
1368	vmovdqu	0x60($inp),@X[6]
1369	vpshufb	$t3,@X[4],@X[4]
1370	vmovdqu	0x70($inp),@X[7]
1371	vpshufb	$t3,@X[5],@X[5]
1372	vpaddq	-0x80($Tbl),@X[0],$t0
1373	vpshufb	$t3,@X[6],@X[6]
1374	vpaddq	-0x60($Tbl),@X[1],$t1
1375	vpshufb	$t3,@X[7],@X[7]
1376	vpaddq	-0x40($Tbl),@X[2],$t2
1377	vpaddq	-0x20($Tbl),@X[3],$t3
1378	vmovdqa	$t0,0x00(%rsp)
1379	vpaddq	0x00($Tbl),@X[4],$t0
1380	vmovdqa	$t1,0x10(%rsp)
1381	vpaddq	0x20($Tbl),@X[5],$t1
1382	vmovdqa	$t2,0x20(%rsp)
1383	vpaddq	0x40($Tbl),@X[6],$t2
1384	vmovdqa	$t3,0x30(%rsp)
1385	vpaddq	0x60($Tbl),@X[7],$t3
1386	vmovdqa	$t0,0x40(%rsp)
1387	mov	$A,$a1
1388	vmovdqa	$t1,0x50(%rsp)
1389	mov	$B,$a3
1390	vmovdqa	$t2,0x60(%rsp)
1391	xor	$C,$a3			# magic
1392	vmovdqa	$t3,0x70(%rsp)
1393	mov	$E,$a0
1394	jmp	.Lxop_00_47
1395
1396.align	16
1397.Lxop_00_47:
1398	add	\$`16*2*$SZ`,$Tbl
1399___
1400sub XOP_512_00_47 () {
1401my $j = shift;
1402my $body = shift;
1403my @X = @_;
1404my @insns = (&$body,&$body);			# 52 instructions
1405
1406	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1407	  eval(shift(@insns));
1408	  eval(shift(@insns));
1409	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1410	  eval(shift(@insns));
1411	  eval(shift(@insns));
1412	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1413	  eval(shift(@insns));
1414	  eval(shift(@insns));
1415	&vpsrlq		($t0,$t0,$sigma0[2]);
1416	  eval(shift(@insns));
1417	  eval(shift(@insns));
1418	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1419	  eval(shift(@insns));
1420	  eval(shift(@insns));
1421	  eval(shift(@insns));
1422	  eval(shift(@insns));
1423	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1424	  eval(shift(@insns));
1425	  eval(shift(@insns));
1426	&vpxor		($t0,$t0,$t1);
1427	  eval(shift(@insns));
1428	  eval(shift(@insns));
1429	  eval(shift(@insns));
1430	  eval(shift(@insns));
1431	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1432	  eval(shift(@insns));
1433	  eval(shift(@insns));
1434	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1435	  eval(shift(@insns));
1436	  eval(shift(@insns));
1437	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1438	  eval(shift(@insns));
1439	  eval(shift(@insns));
1440	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1441	  eval(shift(@insns));
1442	  eval(shift(@insns));
1443	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1444	  eval(shift(@insns));
1445	  eval(shift(@insns));
1446	 &vpxor		($t3,$t3,$t2);
1447	  eval(shift(@insns));
1448	  eval(shift(@insns));
1449	  eval(shift(@insns));
1450	  eval(shift(@insns));
1451	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1452	  eval(shift(@insns));
1453	  eval(shift(@insns));
1454	  eval(shift(@insns));
1455	  eval(shift(@insns));
1456	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1457	  eval(shift(@insns));
1458	  eval(shift(@insns));
1459	  eval(shift(@insns));
1460	  eval(shift(@insns));
1461	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1462	  foreach (@insns) { eval; }		# remaining instructions
1463	&vmovdqa	(16*$j."(%rsp)",$t2);
1464}
1465
1466    for ($i=0,$j=0; $j<8; $j++) {
1467	&XOP_512_00_47($j,\&body_00_15,@X);
1468	push(@X,shift(@X));			# rotate(@X)
1469    }
1470	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1471	&jne	(".Lxop_00_47");
1472
1473    for ($i=0; $i<16; ) {
1474	foreach(body_00_15()) { eval; }
1475    }
1476}
1477$code.=<<___;
1478	mov	$_ctx,$ctx
1479	mov	$a1,$A
1480
1481	add	$SZ*0($ctx),$A
1482	lea	16*$SZ($inp),$inp
1483	add	$SZ*1($ctx),$B
1484	add	$SZ*2($ctx),$C
1485	add	$SZ*3($ctx),$D
1486	add	$SZ*4($ctx),$E
1487	add	$SZ*5($ctx),$F
1488	add	$SZ*6($ctx),$G
1489	add	$SZ*7($ctx),$H
1490
1491	cmp	$_end,$inp
1492
1493	mov	$A,$SZ*0($ctx)
1494	mov	$B,$SZ*1($ctx)
1495	mov	$C,$SZ*2($ctx)
1496	mov	$D,$SZ*3($ctx)
1497	mov	$E,$SZ*4($ctx)
1498	mov	$F,$SZ*5($ctx)
1499	mov	$G,$SZ*6($ctx)
1500	mov	$H,$SZ*7($ctx)
1501	jb	.Lloop_xop
1502
1503	mov	$_rsp,%rsi
1504.cfi_def_cfa	%rsi,8
1505	vzeroupper
1506___
1507$code.=<<___ if ($win64);
1508	movaps	16*$SZ+32(%rsp),%xmm6
1509	movaps	16*$SZ+48(%rsp),%xmm7
1510	movaps	16*$SZ+64(%rsp),%xmm8
1511	movaps	16*$SZ+80(%rsp),%xmm9
1512___
1513$code.=<<___ if ($win64 && $SZ>4);
1514	movaps	16*$SZ+96(%rsp),%xmm10
1515	movaps	16*$SZ+112(%rsp),%xmm11
1516___
1517$code.=<<___;
1518	mov	-48(%rsi),%r15
1519.cfi_restore	%r15
1520	mov	-40(%rsi),%r14
1521.cfi_restore	%r14
1522	mov	-32(%rsi),%r13
1523.cfi_restore	%r13
1524	mov	-24(%rsi),%r12
1525.cfi_restore	%r12
1526	mov	-16(%rsi),%rbp
1527.cfi_restore	%rbp
1528	mov	-8(%rsi),%rbx
1529.cfi_restore	%rbx
1530	lea	(%rsi),%rsp
1531.cfi_def_cfa_register	%rsp
1532.Lepilogue_xop:
1533	ret
1534.cfi_endproc
1535.size	${func}_xop,.-${func}_xop
1536___
1537}
1538######################################################################
1539# AVX+shrd code path
1540#
1541local *ror = sub { &shrd(@_[0],@_) };
1542
1543$code.=<<___;
1544.type	${func}_avx,\@function,3
1545.align	64
1546${func}_avx:
1547.cfi_startproc
1548.Lavx_shortcut:
1549	mov	%rsp,%rax		# copy %rsp
1550.cfi_def_cfa_register	%rax
1551	push	%rbx
1552.cfi_push	%rbx
1553	push	%rbp
1554.cfi_push	%rbp
1555	push	%r12
1556.cfi_push	%r12
1557	push	%r13
1558.cfi_push	%r13
1559	push	%r14
1560.cfi_push	%r14
1561	push	%r15
1562.cfi_push	%r15
1563	shl	\$4,%rdx		# num*16
1564	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1565	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1566	and	\$-64,%rsp		# align stack frame
1567	mov	$ctx,$_ctx		# save ctx, 1st arg
1568	mov	$inp,$_inp		# save inp, 2nd arh
1569	mov	%rdx,$_end		# save end pointer, "3rd" arg
1570	mov	%rax,$_rsp		# save copy of %rsp
1571.cfi_cfa_expression	$_rsp,deref,+8
1572___
1573$code.=<<___ if ($win64);
1574	movaps	%xmm6,16*$SZ+32(%rsp)
1575	movaps	%xmm7,16*$SZ+48(%rsp)
1576	movaps	%xmm8,16*$SZ+64(%rsp)
1577	movaps	%xmm9,16*$SZ+80(%rsp)
1578___
1579$code.=<<___ if ($win64 && $SZ>4);
1580	movaps	%xmm10,16*$SZ+96(%rsp)
1581	movaps	%xmm11,16*$SZ+112(%rsp)
1582___
1583$code.=<<___;
1584.Lprologue_avx:
1585
1586	vzeroupper
1587	mov	$SZ*0($ctx),$A
1588	mov	$SZ*1($ctx),$B
1589	mov	$SZ*2($ctx),$C
1590	mov	$SZ*3($ctx),$D
1591	mov	$SZ*4($ctx),$E
1592	mov	$SZ*5($ctx),$F
1593	mov	$SZ*6($ctx),$G
1594	mov	$SZ*7($ctx),$H
1595___
1596					if ($SZ==4) {	# SHA256
1597    my @X = map("%xmm$_",(0..3));
1598    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1599
1600$code.=<<___;
1601	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1602	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1603	jmp	.Lloop_avx
1604.align	16
1605.Lloop_avx:
1606	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1607	vmovdqu	0x00($inp),@X[0]
1608	vmovdqu	0x10($inp),@X[1]
1609	vmovdqu	0x20($inp),@X[2]
1610	vmovdqu	0x30($inp),@X[3]
1611	vpshufb	$t3,@X[0],@X[0]
1612	lea	$TABLE(%rip),$Tbl
1613	vpshufb	$t3,@X[1],@X[1]
1614	vpshufb	$t3,@X[2],@X[2]
1615	vpaddd	0x00($Tbl),@X[0],$t0
1616	vpshufb	$t3,@X[3],@X[3]
1617	vpaddd	0x20($Tbl),@X[1],$t1
1618	vpaddd	0x40($Tbl),@X[2],$t2
1619	vpaddd	0x60($Tbl),@X[3],$t3
1620	vmovdqa	$t0,0x00(%rsp)
1621	mov	$A,$a1
1622	vmovdqa	$t1,0x10(%rsp)
1623	mov	$B,$a3
1624	vmovdqa	$t2,0x20(%rsp)
1625	xor	$C,$a3			# magic
1626	vmovdqa	$t3,0x30(%rsp)
1627	mov	$E,$a0
1628	jmp	.Lavx_00_47
1629
1630.align	16
1631.Lavx_00_47:
1632	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1633___
1634sub Xupdate_256_AVX () {
1635	(
1636	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1637	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1638	'&vpsrld	($t2,$t0,$sigma0[0]);',
1639	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1640	'&vpsrld	($t3,$t0,$sigma0[2])',
1641	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1642	'&vpxor		($t0,$t3,$t2)',
1643	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1644	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1645	'&vpxor		($t0,$t0,$t1)',
1646	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1647	'&vpxor		($t0,$t0,$t2)',
1648	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1649	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1650	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1651	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1652	 '&vpxor	($t2,$t2,$t3);',
1653	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1654	 '&vpxor	($t2,$t2,$t3)',
1655	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1656	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1657	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1658	 '&vpsrld	($t2,$t3,$sigma1[2])',
1659	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1660	 '&vpxor	($t2,$t2,$t3);',
1661	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1662	 '&vpxor	($t2,$t2,$t3)',
1663	 '&vpshufb	($t2,$t2,$t5)',
1664	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1665	);
1666}
1667
1668sub AVX_256_00_47 () {
1669my $j = shift;
1670my $body = shift;
1671my @X = @_;
1672my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1673
1674	foreach (Xupdate_256_AVX()) {		# 29 instructions
1675	    eval;
1676	    eval(shift(@insns));
1677	    eval(shift(@insns));
1678	    eval(shift(@insns));
1679	}
1680	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1681	  foreach (@insns) { eval; }		# remaining instructions
1682	&vmovdqa	(16*$j."(%rsp)",$t2);
1683}
1684
1685    for ($i=0,$j=0; $j<4; $j++) {
1686	&AVX_256_00_47($j,\&body_00_15,@X);
1687	push(@X,shift(@X));			# rotate(@X)
1688    }
1689	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1690	&jne	(".Lavx_00_47");
1691
1692    for ($i=0; $i<16; ) {
1693	foreach(body_00_15()) { eval; }
1694    }
1695
1696					} else {	# SHA512
1697    my @X = map("%xmm$_",(0..7));
1698    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1699
1700$code.=<<___;
1701	jmp	.Lloop_avx
1702.align	16
1703.Lloop_avx:
1704	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1705	vmovdqu	0x00($inp),@X[0]
1706	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1707	vmovdqu	0x10($inp),@X[1]
1708	vmovdqu	0x20($inp),@X[2]
1709	vpshufb	$t3,@X[0],@X[0]
1710	vmovdqu	0x30($inp),@X[3]
1711	vpshufb	$t3,@X[1],@X[1]
1712	vmovdqu	0x40($inp),@X[4]
1713	vpshufb	$t3,@X[2],@X[2]
1714	vmovdqu	0x50($inp),@X[5]
1715	vpshufb	$t3,@X[3],@X[3]
1716	vmovdqu	0x60($inp),@X[6]
1717	vpshufb	$t3,@X[4],@X[4]
1718	vmovdqu	0x70($inp),@X[7]
1719	vpshufb	$t3,@X[5],@X[5]
1720	vpaddq	-0x80($Tbl),@X[0],$t0
1721	vpshufb	$t3,@X[6],@X[6]
1722	vpaddq	-0x60($Tbl),@X[1],$t1
1723	vpshufb	$t3,@X[7],@X[7]
1724	vpaddq	-0x40($Tbl),@X[2],$t2
1725	vpaddq	-0x20($Tbl),@X[3],$t3
1726	vmovdqa	$t0,0x00(%rsp)
1727	vpaddq	0x00($Tbl),@X[4],$t0
1728	vmovdqa	$t1,0x10(%rsp)
1729	vpaddq	0x20($Tbl),@X[5],$t1
1730	vmovdqa	$t2,0x20(%rsp)
1731	vpaddq	0x40($Tbl),@X[6],$t2
1732	vmovdqa	$t3,0x30(%rsp)
1733	vpaddq	0x60($Tbl),@X[7],$t3
1734	vmovdqa	$t0,0x40(%rsp)
1735	mov	$A,$a1
1736	vmovdqa	$t1,0x50(%rsp)
1737	mov	$B,$a3
1738	vmovdqa	$t2,0x60(%rsp)
1739	xor	$C,$a3			# magic
1740	vmovdqa	$t3,0x70(%rsp)
1741	mov	$E,$a0
1742	jmp	.Lavx_00_47
1743
1744.align	16
1745.Lavx_00_47:
1746	add	\$`16*2*$SZ`,$Tbl
1747___
1748sub Xupdate_512_AVX () {
1749	(
1750	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1751	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1752	'&vpsrlq	($t2,$t0,$sigma0[0])',
1753	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1754	'&vpsrlq	($t3,$t0,$sigma0[2])',
1755	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1756	 '&vpxor	($t0,$t3,$t2)',
1757	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1758	 '&vpxor	($t0,$t0,$t1)',
1759	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1760	 '&vpxor	($t0,$t0,$t2)',
1761	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1762	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1763	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1764	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1765	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1766	 '&vpxor	($t3,$t3,$t2)',
1767	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1768	 '&vpxor	($t3,$t3,$t1)',
1769	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1770	 '&vpxor	($t3,$t3,$t2)',
1771	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1772	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1773	);
1774}
1775
1776sub AVX_512_00_47 () {
1777my $j = shift;
1778my $body = shift;
1779my @X = @_;
1780my @insns = (&$body,&$body);			# 52 instructions
1781
1782	foreach (Xupdate_512_AVX()) {		# 23 instructions
1783	    eval;
1784	    eval(shift(@insns));
1785	    eval(shift(@insns));
1786	}
1787	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1788	  foreach (@insns) { eval; }		# remaining instructions
1789	&vmovdqa	(16*$j."(%rsp)",$t2);
1790}
1791
1792    for ($i=0,$j=0; $j<8; $j++) {
1793	&AVX_512_00_47($j,\&body_00_15,@X);
1794	push(@X,shift(@X));			# rotate(@X)
1795    }
1796	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1797	&jne	(".Lavx_00_47");
1798
1799    for ($i=0; $i<16; ) {
1800	foreach(body_00_15()) { eval; }
1801    }
1802}
1803$code.=<<___;
1804	mov	$_ctx,$ctx
1805	mov	$a1,$A
1806
1807	add	$SZ*0($ctx),$A
1808	lea	16*$SZ($inp),$inp
1809	add	$SZ*1($ctx),$B
1810	add	$SZ*2($ctx),$C
1811	add	$SZ*3($ctx),$D
1812	add	$SZ*4($ctx),$E
1813	add	$SZ*5($ctx),$F
1814	add	$SZ*6($ctx),$G
1815	add	$SZ*7($ctx),$H
1816
1817	cmp	$_end,$inp
1818
1819	mov	$A,$SZ*0($ctx)
1820	mov	$B,$SZ*1($ctx)
1821	mov	$C,$SZ*2($ctx)
1822	mov	$D,$SZ*3($ctx)
1823	mov	$E,$SZ*4($ctx)
1824	mov	$F,$SZ*5($ctx)
1825	mov	$G,$SZ*6($ctx)
1826	mov	$H,$SZ*7($ctx)
1827	jb	.Lloop_avx
1828
1829	mov	$_rsp,%rsi
1830.cfi_def_cfa	%rsi,8
1831	vzeroupper
1832___
1833$code.=<<___ if ($win64);
1834	movaps	16*$SZ+32(%rsp),%xmm6
1835	movaps	16*$SZ+48(%rsp),%xmm7
1836	movaps	16*$SZ+64(%rsp),%xmm8
1837	movaps	16*$SZ+80(%rsp),%xmm9
1838___
1839$code.=<<___ if ($win64 && $SZ>4);
1840	movaps	16*$SZ+96(%rsp),%xmm10
1841	movaps	16*$SZ+112(%rsp),%xmm11
1842___
1843$code.=<<___;
1844	mov	-48(%rsi),%r15
1845.cfi_restore	%r15
1846	mov	-40(%rsi),%r14
1847.cfi_restore	%r14
1848	mov	-32(%rsi),%r13
1849.cfi_restore	%r13
1850	mov	-24(%rsi),%r12
1851.cfi_restore	%r12
1852	mov	-16(%rsi),%rbp
1853.cfi_restore	%rbp
1854	mov	-8(%rsi),%rbx
1855.cfi_restore	%rbx
1856	lea	(%rsi),%rsp
1857.cfi_def_cfa_register	%rsp
1858.Lepilogue_avx:
1859	ret
1860.cfi_endproc
1861.size	${func}_avx,.-${func}_avx
1862___
1863
1864if ($avx>1) {{
1865######################################################################
1866# AVX2+BMI code path
1867#
1868my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1869my $PUSH8=8*2*$SZ;
1870use integer;
1871
1872sub bodyx_00_15 () {
1873	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1874	(
1875	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1876
1877	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1878	'&and	($a4,$e)',		# f&e
1879	'&rorx	($a0,$e,$Sigma1[2])',
1880	'&rorx	($a2,$e,$Sigma1[1])',
1881
1882	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1883	'&lea	($h,"($h,$a4)")',
1884	'&andn	($a4,$e,$g)',		# ~e&g
1885	'&xor	($a0,$a2)',
1886
1887	'&rorx	($a1,$e,$Sigma1[0])',
1888	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1889	'&xor	($a0,$a1)',		# Sigma1(e)
1890	'&mov	($a2,$a)',
1891
1892	'&rorx	($a4,$a,$Sigma0[2])',
1893	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1894	'&xor	($a2,$b)',		# a^b, b^c in next round
1895	'&rorx	($a1,$a,$Sigma0[1])',
1896
1897	'&rorx	($a0,$a,$Sigma0[0])',
1898	'&lea	($d,"($d,$h)")',	# d+=h
1899	'&and	($a3,$a2)',		# (b^c)&(a^b)
1900	'&xor	($a1,$a4)',
1901
1902	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1903	'&xor	($a1,$a0)',		# Sigma0(a)
1904	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1905	'&mov	($a4,$e)',		# copy of f in future
1906
1907	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1908	);
1909	# and at the finish one has to $a+=$a1
1910}
1911
1912$code.=<<___;
1913.type	${func}_avx2,\@function,3
1914.align	64
1915${func}_avx2:
1916.cfi_startproc
1917.Lavx2_shortcut:
1918	mov	%rsp,%rax		# copy %rsp
1919.cfi_def_cfa_register	%rax
1920	push	%rbx
1921.cfi_push	%rbx
1922	push	%rbp
1923.cfi_push	%rbp
1924	push	%r12
1925.cfi_push	%r12
1926	push	%r13
1927.cfi_push	%r13
1928	push	%r14
1929.cfi_push	%r14
1930	push	%r15
1931.cfi_push	%r15
1932	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1933	shl	\$4,%rdx		# num*16
1934	and	\$-256*$SZ,%rsp		# align stack frame
1935	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1936	add	\$`2*$SZ*($rounds-8)`,%rsp
1937	mov	$ctx,$_ctx		# save ctx, 1st arg
1938	mov	$inp,$_inp		# save inp, 2nd arh
1939	mov	%rdx,$_end		# save end pointer, "3rd" arg
1940	mov	%rax,$_rsp		# save copy of %rsp
1941.cfi_cfa_expression	$_rsp,deref,+8
1942___
1943$code.=<<___ if ($win64);
1944	movaps	%xmm6,16*$SZ+32(%rsp)
1945	movaps	%xmm7,16*$SZ+48(%rsp)
1946	movaps	%xmm8,16*$SZ+64(%rsp)
1947	movaps	%xmm9,16*$SZ+80(%rsp)
1948___
1949$code.=<<___ if ($win64 && $SZ>4);
1950	movaps	%xmm10,16*$SZ+96(%rsp)
1951	movaps	%xmm11,16*$SZ+112(%rsp)
1952___
1953$code.=<<___;
1954.Lprologue_avx2:
1955
1956	vzeroupper
1957	sub	\$-16*$SZ,$inp		# inp++, size optimization
1958	mov	$SZ*0($ctx),$A
1959	mov	$inp,%r12		# borrow $T1
1960	mov	$SZ*1($ctx),$B
1961	cmp	%rdx,$inp		# $_end
1962	mov	$SZ*2($ctx),$C
1963	cmove	%rsp,%r12		# next block or random data
1964	mov	$SZ*3($ctx),$D
1965	mov	$SZ*4($ctx),$E
1966	mov	$SZ*5($ctx),$F
1967	mov	$SZ*6($ctx),$G
1968	mov	$SZ*7($ctx),$H
1969___
1970					if ($SZ==4) {	# SHA256
1971    my @X = map("%ymm$_",(0..3));
1972    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1973
1974$code.=<<___;
1975	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1976	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1977	jmp	.Loop_avx2
1978.align	16
1979.Loop_avx2:
1980	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1981	vmovdqu	-16*$SZ+0($inp),%xmm0
1982	vmovdqu	-16*$SZ+16($inp),%xmm1
1983	vmovdqu	-16*$SZ+32($inp),%xmm2
1984	vmovdqu	-16*$SZ+48($inp),%xmm3
1985	#mov		$inp,$_inp	# offload $inp
1986	vinserti128	\$1,(%r12),@X[0],@X[0]
1987	vinserti128	\$1,16(%r12),@X[1],@X[1]
1988	vpshufb		$t3,@X[0],@X[0]
1989	vinserti128	\$1,32(%r12),@X[2],@X[2]
1990	vpshufb		$t3,@X[1],@X[1]
1991	vinserti128	\$1,48(%r12),@X[3],@X[3]
1992
1993	lea	$TABLE(%rip),$Tbl
1994	vpshufb	$t3,@X[2],@X[2]
1995	vpaddd	0x00($Tbl),@X[0],$t0
1996	vpshufb	$t3,@X[3],@X[3]
1997	vpaddd	0x20($Tbl),@X[1],$t1
1998	vpaddd	0x40($Tbl),@X[2],$t2
1999	vpaddd	0x60($Tbl),@X[3],$t3
2000	vmovdqa	$t0,0x00(%rsp)
2001	xor	$a1,$a1
2002	vmovdqa	$t1,0x20(%rsp)
2003___
2004$code.=<<___ if (!$win64);
2005# temporarily use %rdi as frame pointer
2006	mov	$_rsp,%rdi
2007.cfi_def_cfa	%rdi,8
2008___
2009$code.=<<___;
2010	lea	-$PUSH8(%rsp),%rsp
2011___
2012$code.=<<___ if (!$win64);
2013# the frame info is at $_rsp, but the stack is moving...
2014# so a second frame pointer is saved at -8(%rsp)
2015# that is in the red zone
2016	mov	%rdi,-8(%rsp)
2017.cfi_cfa_expression	%rsp-8,deref,+8
2018___
2019$code.=<<___;
2020	mov	$B,$a3
2021	vmovdqa	$t2,0x00(%rsp)
2022	xor	$C,$a3			# magic
2023	vmovdqa	$t3,0x20(%rsp)
2024	mov	$F,$a4
2025	sub	\$-16*2*$SZ,$Tbl	# size optimization
2026	jmp	.Lavx2_00_47
2027
2028.align	16
2029.Lavx2_00_47:
2030___
2031
2032sub AVX2_256_00_47 () {
2033my $j = shift;
2034my $body = shift;
2035my @X = @_;
2036my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
2037my $base = "+2*$PUSH8(%rsp)";
2038
2039	if (($j%2)==0) {
2040	&lea	("%rsp","-$PUSH8(%rsp)");
2041$code.=<<___ if (!$win64);
2042.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
2043# copy secondary frame pointer to new location again at -8(%rsp)
2044	pushq	$PUSH8-8(%rsp)
2045.cfi_cfa_expression	%rsp,deref,+8
2046	lea	8(%rsp),%rsp
2047.cfi_cfa_expression	%rsp-8,deref,+8
2048___
2049	}
2050
2051	foreach (Xupdate_256_AVX()) {		# 29 instructions
2052	    eval;
2053	    eval(shift(@insns));
2054	    eval(shift(@insns));
2055	    eval(shift(@insns));
2056	}
2057	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
2058	  foreach (@insns) { eval; }		# remaining instructions
2059	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2060}
2061
2062    for ($i=0,$j=0; $j<4; $j++) {
2063	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
2064	push(@X,shift(@X));			# rotate(@X)
2065    }
2066	&lea	($Tbl,16*2*$SZ."($Tbl)");
2067	&cmpb	(($SZ-1)."($Tbl)",0);
2068	&jne	(".Lavx2_00_47");
2069
2070    for ($i=0; $i<16; ) {
2071	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2072	foreach(bodyx_00_15()) { eval; }
2073    }
2074					} else {	# SHA512
2075    my @X = map("%ymm$_",(0..7));
2076    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2077
2078$code.=<<___;
2079	jmp	.Loop_avx2
2080.align	16
2081.Loop_avx2:
2082	vmovdqu	-16*$SZ($inp),%xmm0
2083	vmovdqu	-16*$SZ+16($inp),%xmm1
2084	vmovdqu	-16*$SZ+32($inp),%xmm2
2085	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
2086	vmovdqu	-16*$SZ+48($inp),%xmm3
2087	vmovdqu	-16*$SZ+64($inp),%xmm4
2088	vmovdqu	-16*$SZ+80($inp),%xmm5
2089	vmovdqu	-16*$SZ+96($inp),%xmm6
2090	vmovdqu	-16*$SZ+112($inp),%xmm7
2091	#mov	$inp,$_inp	# offload $inp
2092	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
2093	vinserti128	\$1,(%r12),@X[0],@X[0]
2094	vinserti128	\$1,16(%r12),@X[1],@X[1]
2095	 vpshufb	$t2,@X[0],@X[0]
2096	vinserti128	\$1,32(%r12),@X[2],@X[2]
2097	 vpshufb	$t2,@X[1],@X[1]
2098	vinserti128	\$1,48(%r12),@X[3],@X[3]
2099	 vpshufb	$t2,@X[2],@X[2]
2100	vinserti128	\$1,64(%r12),@X[4],@X[4]
2101	 vpshufb	$t2,@X[3],@X[3]
2102	vinserti128	\$1,80(%r12),@X[5],@X[5]
2103	 vpshufb	$t2,@X[4],@X[4]
2104	vinserti128	\$1,96(%r12),@X[6],@X[6]
2105	 vpshufb	$t2,@X[5],@X[5]
2106	vinserti128	\$1,112(%r12),@X[7],@X[7]
2107
2108	vpaddq	-0x80($Tbl),@X[0],$t0
2109	vpshufb	$t2,@X[6],@X[6]
2110	vpaddq	-0x60($Tbl),@X[1],$t1
2111	vpshufb	$t2,@X[7],@X[7]
2112	vpaddq	-0x40($Tbl),@X[2],$t2
2113	vpaddq	-0x20($Tbl),@X[3],$t3
2114	vmovdqa	$t0,0x00(%rsp)
2115	vpaddq	0x00($Tbl),@X[4],$t0
2116	vmovdqa	$t1,0x20(%rsp)
2117	vpaddq	0x20($Tbl),@X[5],$t1
2118	vmovdqa	$t2,0x40(%rsp)
2119	vpaddq	0x40($Tbl),@X[6],$t2
2120	vmovdqa	$t3,0x60(%rsp)
2121___
2122$code.=<<___ if (!$win64);
2123# temporarily use %rdi as frame pointer
2124	mov	$_rsp,%rdi
2125.cfi_def_cfa	%rdi,8
2126___
2127$code.=<<___;
2128	lea	-$PUSH8(%rsp),%rsp
2129___
2130$code.=<<___ if (!$win64);
2131# the frame info is at $_rsp, but the stack is moving...
2132# so a second frame pointer is saved at -8(%rsp)
2133# that is in the red zone
2134	mov	%rdi,-8(%rsp)
2135.cfi_cfa_expression	%rsp-8,deref,+8
2136___
2137$code.=<<___;
2138	vpaddq	0x60($Tbl),@X[7],$t3
2139	vmovdqa	$t0,0x00(%rsp)
2140	xor	$a1,$a1
2141	vmovdqa	$t1,0x20(%rsp)
2142	mov	$B,$a3
2143	vmovdqa	$t2,0x40(%rsp)
2144	xor	$C,$a3			# magic
2145	vmovdqa	$t3,0x60(%rsp)
2146	mov	$F,$a4
2147	add	\$16*2*$SZ,$Tbl
2148	jmp	.Lavx2_00_47
2149
2150.align	16
2151.Lavx2_00_47:
2152___
2153
2154sub AVX2_512_00_47 () {
2155my $j = shift;
2156my $body = shift;
2157my @X = @_;
2158my @insns = (&$body,&$body);			# 48 instructions
2159my $base = "+2*$PUSH8(%rsp)";
2160
2161	if (($j%4)==0) {
2162	&lea	("%rsp","-$PUSH8(%rsp)");
2163$code.=<<___ if (!$win64);
2164.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
2165# copy secondary frame pointer to new location again at -8(%rsp)
2166	pushq	$PUSH8-8(%rsp)
2167.cfi_cfa_expression	%rsp,deref,+8
2168	lea	8(%rsp),%rsp
2169.cfi_cfa_expression	%rsp-8,deref,+8
2170___
2171	}
2172
2173	foreach (Xupdate_512_AVX()) {		# 23 instructions
2174	    eval;
2175	    if ($_ !~ /\;$/) {
2176		eval(shift(@insns));
2177		eval(shift(@insns));
2178		eval(shift(@insns));
2179	    }
2180	}
2181	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
2182	  foreach (@insns) { eval; }		# remaining instructions
2183	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
2184}
2185
2186    for ($i=0,$j=0; $j<8; $j++) {
2187	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
2188	push(@X,shift(@X));			# rotate(@X)
2189    }
2190	&lea	($Tbl,16*2*$SZ."($Tbl)");
2191	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
2192	&jne	(".Lavx2_00_47");
2193
2194    for ($i=0; $i<16; ) {
2195	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2196	foreach(bodyx_00_15()) { eval; }
2197    }
2198}
2199$code.=<<___;
2200	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2201	add	$a1,$A
2202	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2203	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
2204
2205	add	$SZ*0($ctx),$A
2206	add	$SZ*1($ctx),$B
2207	add	$SZ*2($ctx),$C
2208	add	$SZ*3($ctx),$D
2209	add	$SZ*4($ctx),$E
2210	add	$SZ*5($ctx),$F
2211	add	$SZ*6($ctx),$G
2212	add	$SZ*7($ctx),$H
2213
2214	mov	$A,$SZ*0($ctx)
2215	mov	$B,$SZ*1($ctx)
2216	mov	$C,$SZ*2($ctx)
2217	mov	$D,$SZ*3($ctx)
2218	mov	$E,$SZ*4($ctx)
2219	mov	$F,$SZ*5($ctx)
2220	mov	$G,$SZ*6($ctx)
2221	mov	$H,$SZ*7($ctx)
2222
2223	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
2224	je	.Ldone_avx2
2225
2226	xor	$a1,$a1
2227	mov	$B,$a3
2228	xor	$C,$a3			# magic
2229	mov	$F,$a4
2230	jmp	.Lower_avx2
2231.align	16
2232.Lower_avx2:
2233___
2234    for ($i=0; $i<8; ) {
2235	my $base="+16($Tbl)";
2236	foreach(bodyx_00_15()) { eval; }
2237    }
2238$code.=<<___;
2239	lea	-$PUSH8($Tbl),$Tbl
2240	cmp	%rsp,$Tbl
2241	jae	.Lower_avx2
2242
2243	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
2244	add	$a1,$A
2245	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
2246	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
2247# restore frame pointer to original location at $_rsp
2248.cfi_cfa_expression	$_rsp,deref,+8
2249
2250	add	$SZ*0($ctx),$A
2251	add	$SZ*1($ctx),$B
2252	add	$SZ*2($ctx),$C
2253	add	$SZ*3($ctx),$D
2254	add	$SZ*4($ctx),$E
2255	add	$SZ*5($ctx),$F
2256	lea	`2*16*$SZ`($inp),$inp	# inp+=2
2257	add	$SZ*6($ctx),$G
2258	mov	$inp,%r12
2259	add	$SZ*7($ctx),$H
2260	cmp	$_end,$inp
2261
2262	mov	$A,$SZ*0($ctx)
2263	cmove	%rsp,%r12		# next block or stale data
2264	mov	$B,$SZ*1($ctx)
2265	mov	$C,$SZ*2($ctx)
2266	mov	$D,$SZ*3($ctx)
2267	mov	$E,$SZ*4($ctx)
2268	mov	$F,$SZ*5($ctx)
2269	mov	$G,$SZ*6($ctx)
2270	mov	$H,$SZ*7($ctx)
2271
2272	jbe	.Loop_avx2
2273	lea	(%rsp),$Tbl
2274# temporarily use $Tbl as index to $_rsp
2275# this avoids the need to save a secondary frame pointer at -8(%rsp)
2276.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8
2277
2278.Ldone_avx2:
2279	mov	`16*$SZ+3*8`($Tbl),%rsi
2280.cfi_def_cfa	%rsi,8
2281	vzeroupper
2282___
2283$code.=<<___ if ($win64);
2284	movaps	16*$SZ+32($Tbl),%xmm6
2285	movaps	16*$SZ+48($Tbl),%xmm7
2286	movaps	16*$SZ+64($Tbl),%xmm8
2287	movaps	16*$SZ+80($Tbl),%xmm9
2288___
2289$code.=<<___ if ($win64 && $SZ>4);
2290	movaps	16*$SZ+96($Tbl),%xmm10
2291	movaps	16*$SZ+112($Tbl),%xmm11
2292___
2293$code.=<<___;
2294	mov	-48(%rsi),%r15
2295.cfi_restore	%r15
2296	mov	-40(%rsi),%r14
2297.cfi_restore	%r14
2298	mov	-32(%rsi),%r13
2299.cfi_restore	%r13
2300	mov	-24(%rsi),%r12
2301.cfi_restore	%r12
2302	mov	-16(%rsi),%rbp
2303.cfi_restore	%rbp
2304	mov	-8(%rsi),%rbx
2305.cfi_restore	%rbx
2306	lea	(%rsi),%rsp
2307.cfi_def_cfa_register	%rsp
2308.Lepilogue_avx2:
2309	ret
2310.cfi_endproc
2311.size	${func}_avx2,.-${func}_avx2
2312___
2313}}
2314}}}}}
2315
2316# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2317#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2318if ($win64) {
2319$rec="%rcx";
2320$frame="%rdx";
2321$context="%r8";
2322$disp="%r9";
2323
2324$code.=<<___;
2325.extern	__imp_RtlVirtualUnwind
2326.type	se_handler,\@abi-omnipotent
2327.align	16
2328se_handler:
2329	push	%rsi
2330	push	%rdi
2331	push	%rbx
2332	push	%rbp
2333	push	%r12
2334	push	%r13
2335	push	%r14
2336	push	%r15
2337	pushfq
2338	sub	\$64,%rsp
2339
2340	mov	120($context),%rax	# pull context->Rax
2341	mov	248($context),%rbx	# pull context->Rip
2342
2343	mov	8($disp),%rsi		# disp->ImageBase
2344	mov	56($disp),%r11		# disp->HanderlData
2345
2346	mov	0(%r11),%r10d		# HandlerData[0]
2347	lea	(%rsi,%r10),%r10	# prologue label
2348	cmp	%r10,%rbx		# context->Rip<prologue label
2349	jb	.Lin_prologue
2350
2351	mov	152($context),%rax	# pull context->Rsp
2352
2353	mov	4(%r11),%r10d		# HandlerData[1]
2354	lea	(%rsi,%r10),%r10	# epilogue label
2355	cmp	%r10,%rbx		# context->Rip>=epilogue label
2356	jae	.Lin_prologue
2357___
2358$code.=<<___ if ($avx>1);
2359	lea	.Lavx2_shortcut(%rip),%r10
2360	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
2361	jb	.Lnot_in_avx2
2362
2363	and	\$-256*$SZ,%rax
2364	add	\$`2*$SZ*($rounds-8)`,%rax
2365.Lnot_in_avx2:
2366___
2367$code.=<<___;
2368	mov	%rax,%rsi		# put aside Rsp
2369	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
2370
2371	mov	-8(%rax),%rbx
2372	mov	-16(%rax),%rbp
2373	mov	-24(%rax),%r12
2374	mov	-32(%rax),%r13
2375	mov	-40(%rax),%r14
2376	mov	-48(%rax),%r15
2377	mov	%rbx,144($context)	# restore context->Rbx
2378	mov	%rbp,160($context)	# restore context->Rbp
2379	mov	%r12,216($context)	# restore context->R12
2380	mov	%r13,224($context)	# restore context->R13
2381	mov	%r14,232($context)	# restore context->R14
2382	mov	%r15,240($context)	# restore context->R15
2383
2384	lea	.Lepilogue(%rip),%r10
2385	cmp	%r10,%rbx
2386	jb	.Lin_prologue		# non-AVX code
2387
2388	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
2389	lea	512($context),%rdi	# &context.Xmm6
2390	mov	\$`$SZ==4?8:12`,%ecx
2391	.long	0xa548f3fc		# cld; rep movsq
2392
2393.Lin_prologue:
2394	mov	8(%rax),%rdi
2395	mov	16(%rax),%rsi
2396	mov	%rax,152($context)	# restore context->Rsp
2397	mov	%rsi,168($context)	# restore context->Rsi
2398	mov	%rdi,176($context)	# restore context->Rdi
2399
2400	mov	40($disp),%rdi		# disp->ContextRecord
2401	mov	$context,%rsi		# context
2402	mov	\$154,%ecx		# sizeof(CONTEXT)
2403	.long	0xa548f3fc		# cld; rep movsq
2404
2405	mov	$disp,%rsi
2406	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2407	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2408	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2409	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2410	mov	40(%rsi),%r10		# disp->ContextRecord
2411	lea	56(%rsi),%r11		# &disp->HandlerData
2412	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2413	mov	%r10,32(%rsp)		# arg5
2414	mov	%r11,40(%rsp)		# arg6
2415	mov	%r12,48(%rsp)		# arg7
2416	mov	%rcx,56(%rsp)		# arg8, (NULL)
2417	call	*__imp_RtlVirtualUnwind(%rip)
2418
2419	mov	\$1,%eax		# ExceptionContinueSearch
2420	add	\$64,%rsp
2421	popfq
2422	pop	%r15
2423	pop	%r14
2424	pop	%r13
2425	pop	%r12
2426	pop	%rbp
2427	pop	%rbx
2428	pop	%rdi
2429	pop	%rsi
2430	ret
2431.size	se_handler,.-se_handler
2432___
2433
2434$code.=<<___ if ($SZ==4 && $shaext);
2435.type	shaext_handler,\@abi-omnipotent
2436.align	16
2437shaext_handler:
2438	push	%rsi
2439	push	%rdi
2440	push	%rbx
2441	push	%rbp
2442	push	%r12
2443	push	%r13
2444	push	%r14
2445	push	%r15
2446	pushfq
2447	sub	\$64,%rsp
2448
2449	mov	120($context),%rax	# pull context->Rax
2450	mov	248($context),%rbx	# pull context->Rip
2451
2452	lea	.Lprologue_shaext(%rip),%r10
2453	cmp	%r10,%rbx		# context->Rip<.Lprologue
2454	jb	.Lin_prologue
2455
2456	lea	.Lepilogue_shaext(%rip),%r10
2457	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
2458	jae	.Lin_prologue
2459
2460	lea	-8-5*16(%rax),%rsi
2461	lea	512($context),%rdi	# &context.Xmm6
2462	mov	\$10,%ecx
2463	.long	0xa548f3fc		# cld; rep movsq
2464
2465	jmp	.Lin_prologue
2466.size	shaext_handler,.-shaext_handler
2467___
2468
2469$code.=<<___;
2470.section	.pdata
2471.align	4
2472	.rva	.LSEH_begin_$func
2473	.rva	.LSEH_end_$func
2474	.rva	.LSEH_info_$func
2475___
2476$code.=<<___ if ($SZ==4 && $shaext);
2477	.rva	.LSEH_begin_${func}_shaext
2478	.rva	.LSEH_end_${func}_shaext
2479	.rva	.LSEH_info_${func}_shaext
2480___
2481$code.=<<___ if ($SZ==4);
2482	.rva	.LSEH_begin_${func}_ssse3
2483	.rva	.LSEH_end_${func}_ssse3
2484	.rva	.LSEH_info_${func}_ssse3
2485___
2486$code.=<<___ if ($avx && $SZ==8);
2487	.rva	.LSEH_begin_${func}_xop
2488	.rva	.LSEH_end_${func}_xop
2489	.rva	.LSEH_info_${func}_xop
2490___
2491$code.=<<___ if ($avx);
2492	.rva	.LSEH_begin_${func}_avx
2493	.rva	.LSEH_end_${func}_avx
2494	.rva	.LSEH_info_${func}_avx
2495___
2496$code.=<<___ if ($avx>1);
2497	.rva	.LSEH_begin_${func}_avx2
2498	.rva	.LSEH_end_${func}_avx2
2499	.rva	.LSEH_info_${func}_avx2
2500___
2501$code.=<<___;
2502.section	.xdata
2503.align	8
2504.LSEH_info_$func:
2505	.byte	9,0,0,0
2506	.rva	se_handler
2507	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2508___
2509$code.=<<___ if ($SZ==4 && $shaext);
2510.LSEH_info_${func}_shaext:
2511	.byte	9,0,0,0
2512	.rva	shaext_handler
2513___
2514$code.=<<___ if ($SZ==4);
2515.LSEH_info_${func}_ssse3:
2516	.byte	9,0,0,0
2517	.rva	se_handler
2518	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2519___
2520$code.=<<___ if ($avx && $SZ==8);
2521.LSEH_info_${func}_xop:
2522	.byte	9,0,0,0
2523	.rva	se_handler
2524	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
2525___
2526$code.=<<___ if ($avx);
2527.LSEH_info_${func}_avx:
2528	.byte	9,0,0,0
2529	.rva	se_handler
2530	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2531___
2532$code.=<<___ if ($avx>1);
2533.LSEH_info_${func}_avx2:
2534	.byte	9,0,0,0
2535	.rva	se_handler
2536	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2537___
2538}
2539
2540sub sha256op38 {
2541    my $instr = shift;
2542    my %opcodelet = (
2543		"sha256rnds2" => 0xcb,
2544  		"sha256msg1"  => 0xcc,
2545		"sha256msg2"  => 0xcd	);
2546
2547    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2548      my @opcode=(0x0f,0x38);
2549	push @opcode,$opcodelet{$instr};
2550	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2551	return ".byte\t".join(',',@opcode);
2552    } else {
2553	return $instr."\t".@_[0];
2554    }
2555}
2556
2557foreach (split("\n",$code)) {
2558	s/\`([^\`]*)\`/eval $1/geo;
2559
2560	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2561
2562	print $_,"\n";
2563}
2564close STDOUT or die "error closing STDOUT: $!";
2565