#! /usr/bin/env perl
# Copyright 2013-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# The multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data in a designated lane of a SIMD register. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 nor to Atom,
#	because of their lower AES-NI instruction throughput, nor is
#	there an AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data; the result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life applications are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

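# Lane layout, for illustration (n=4 case): the gather in ROUND_00_15
# below transposes the inputs with movd+punpckldq, so that dword lane k
# of an XMM register holds word j of buffer k,
#
#	X[j] = { buf0[j], buf1[j], buf2[j], buf3[j] }	# low to high lane
#
# and one vector round then advances all four hash states at once.
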
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

push(@INC,"${dir}","${dir}../../perlasm");
require "x86_64-support.pl";

$ptr_size=&pointer_size($flavour);

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
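# A minimal usage sketch (hypothetical caller, names are illustrative):
#
#     /* ctx holds eight interleaved state vectors A[8]..H[8], one dword
#      * per buffer; inp[] pairs each input pointer with its length in
#      * 64-byte blocks; num=1 processes inp[0..3], num=2 inp[0..7].     */
#     SHA256_MB_CTX ctx;       /* illustrative name for the struct above */
#     HASH_DESC     inp[8];    /* illustrative {void *ptr; int blocks;}  */
#     sha256_multi_block(&ctx, inp, 2);
#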
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";
$inp_elm_size=2*$ptr_size;

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

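# Xi_off above maps round index i (mod 16) to a slot of the message-
# schedule ring buffer; the -128 bias keeps displacements within disp8
# range around %rax, and the 32-byte AVX2 variant spills the upper half
# of the ring to the area addressed via %rbx.
#
# SSE2 has no vector rotate, so ROTR(x,n) in Sigma1/Sigma0 below is
# emulated as (x>>n)^(x<<(32-n)), with the left shifts applied
# incrementally (e.g. 7, then 21-7, then 26-21) so that one temporary
# register serves all three rotations.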
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

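# Rounds 16..63 extend the message schedule on the ring buffer per
# FIPS 180-4:
#	W[i+16] = sigma1(W[i+14]) + W[i+9] + sigma0(W[i+1]) + W[i]
# with W[i] already live in $Xi and the other three terms fetched via
# Xi_off.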
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma1(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx			# check AVX bit
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx			# 3*16 rounds, 16..63
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

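	# Per-lane completion handling: pcmpgtd turns still-positive
	# counters into all-ones masks, so paddd decrements only active
	# lanes, pand freezes the state of finished lanes before the
	# context is re-added, and cmovge redirects exhausted input
	# pointers at the K256 table, which is always safe to read.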
	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process a pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
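	# State now sits in the ABEF/CDGH register pairs that the
	# sha256rnds2 instruction operates on.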
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`$inp_elm_size*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	 vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	 vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	 vpandn	$g,$e,$t1
	 vpand	$f,$e,$axb			# borrow $axb
	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	 vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	 vpxor	$a,$b,$axb			# a^b, b^c in next round
	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	 vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	 vpand	$axb,$bxc,$bxc
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	 vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	 vpaddd	$Xi,$d,$d			# d+=Xi
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx			# check AVX2 bit
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx			# 3*16 rounds, 16..63
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx			# 3*16 rounds, 16..63
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
$code.=<<___;
.section .rodata align=256
.align	256
K256:
___
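# Emit each round constant eight times so that every dword lane of the
# widest (8-lane AVX2) vectors sees the same K[t].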
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.previous
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`($context),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

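# Encode the SHA-NI mnemonics by hand (0x0f,0x38 opcode map) so that the
# module still assembles with toolchains that predate the SHA extension;
# rex() above supplies the REX prefix when extended registers are used.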
sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

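# Post-process the generated code: evaluate backticked expressions,
# replace SHA-NI mnemonics with raw bytes, and rewrite instructions that
# only accept %xmm operands where the AVX2 path instantiated them with
# %ymm registers.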
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";