1#! /usr/bin/env perl
2# Copyright 2013-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# January 2013
18#
# This is an AESNI-CBC+SHA256 stitched implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes better use of
# processor resources and achieves better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte (less is better)
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
30#
31#		 AES-128/-192/-256+SHA256   this(**)	gain
32# Sandy Bridge	    5.05/6.05/7.05+11.6	    13.0	+28%/36%/43%
33# Ivy Bridge	    5.05/6.05/7.05+10.3	    11.6	+32%/41%/50%
34# Haswell	    4.43/5.29/6.19+7.80	    8.79	+39%/49%/59%
35# Skylake	    2.62/3.14/3.62+7.70	    8.10	+27%/34%/40%
36# Bulldozer	    5.77/6.89/8.00+13.7	    13.7	+42%/50%/58%
37# Ryzen(***)	    2.71/-/3.71+2.05	    2.74/-/3.73	+74%/-/54%
38# Goldmont(***)	    3.82/-/5.35+4.16	    4.73/-/5.94	+69%/-/60%
39#
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
#	Westmere is omitted from the loop, because the estimated gain
#	was not high enough to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;
46
47# $output is the last argument if it looks like a file (it has an extension)
48# $flavour is the first argument if it doesn't look like a file
49$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
50$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
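# (Illustrative note, not part of the original commentary: a typical
# invocation looks something like
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s
# where "elf" is the perlasm flavour handed to x86_64-xlate.pl and the
# trailing ".s" argument is what gets picked up as $output above; Windows
# builds use flavours such as "nasm" or "mingw64" instead.)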
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61	$avx = ($1>=2.19) + ($1>=2.22);
62}
63
64if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66	$avx = ($1>=2.09) + ($1>=2.10);
67}
68
69if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71	$avx = ($1>=10) + ($1>=12);
72}
73
74if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
75	$avx = ($2>=3.0) + ($2>3.0);
76}
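# (Illustrative summary of the probing above: with $avx==0 only the bare
# $func stub and the K256 table are emitted, $avx>=1 enables the XOP and
# AVX code paths below, and $avx>=2 additionally enables the AVX2 path
# guarded by "if ($avx>1)"; $shaext follows $avx unless overridden.)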
77
78$shaext=$avx;	### set to zero if compiling for 1.0.1
79$avx=1		if (!$shaext && $avx);
80
81open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
82    or die "can't call $xlate: $!";
83*STDOUT=*OUT;
84
85$func="aesni_cbc_sha256_enc";
86$TABLE="K256";
87$SZ=4;
88@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
89				"%r8d","%r9d","%r10d","%r11d");
90($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
91@Sigma0=( 2,13,22);
92@Sigma1=( 6,11,25);
93@sigma0=( 7,18, 3);
94@sigma1=(17,19,10);
95$rounds=64;
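# The four arrays above are the rotate/shift amounts of the SHA-256 Sigma
# and sigma functions.  A minimal plain-Perl reference (illustrative only,
# never called by the generator; the *_ref helpers are not part of the
# original code):
sub ror32_ref	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub Sigma0_ref	{ my $x=shift; ror32_ref($x,$Sigma0[0])^ror32_ref($x,$Sigma0[1])^ror32_ref($x,$Sigma0[2]); }
sub Sigma1_ref	{ my $x=shift; ror32_ref($x,$Sigma1[0])^ror32_ref($x,$Sigma1[1])^ror32_ref($x,$Sigma1[2]); }
sub sigma0_ref	{ my $x=shift; ror32_ref($x,$sigma0[0])^ror32_ref($x,$sigma0[1])^($x>>$sigma0[2]); }	# last entry is a shift
sub sigma1_ref	{ my $x=shift; ror32_ref($x,$sigma1[0])^ror32_ref($x,$sigma1[1])^($x>>$sigma1[2]); }	# last entry is a shift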
96
97########################################################################
98# void aesni_cbc_sha256_enc(const void *inp,
99#			void *out,
100#			size_t length,
101#			const AES_KEY *key,
102#			unsigned char *iv,
103#			SHA256_CTX *ctx,
104#			const void *in0);
105($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
106("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
107
108$Tbl="%rbp";
109
110$_inp="16*$SZ+0*8(%rsp)";
111$_out="16*$SZ+1*8(%rsp)";
112$_end="16*$SZ+2*8(%rsp)";
113$_key="16*$SZ+3*8(%rsp)";
114$_ivp="16*$SZ+4*8(%rsp)";
115$_ctx="16*$SZ+5*8(%rsp)";
116$_in0="16*$SZ+6*8(%rsp)";
117$_rsp="`16*$SZ+7*8`(%rsp)";
118$framesz=16*$SZ+8*8;
119
120$code=<<___;
121.text
122
123.extern	OPENSSL_ia32cap_P
124.globl	$func
125.type	$func,\@abi-omnipotent
126.align	16
127$func:
128.cfi_startproc
129___
130						if ($avx) {
131$code.=<<___;
132	lea	OPENSSL_ia32cap_P(%rip),%r11
133	mov	\$1,%eax
134	cmp	\$0,`$win64?"%rcx":"%rdi"`
135	je	.Lprobe
136	mov	0(%r11),%eax
137	mov	4(%r11),%r10
138___
139$code.=<<___ if ($shaext);
140	bt	\$61,%r10			# check for SHA
141	jc	${func}_shaext
142___
143$code.=<<___;
144	mov	%r10,%r11
145	shr	\$32,%r11
146
147	test	\$`1<<11`,%r10d			# check for XOP
148	jnz	${func}_xop
149___
150$code.=<<___ if ($avx>1);
151	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
152	cmp	\$`1<<8|1<<5|1<<3`,%r11d
153	je	${func}_avx2
154___
155$code.=<<___;
156	and	\$`1<<28`,%r10d			# check for AVX
157	jnz	${func}_avx
158	ud2
159___
160						}
161$code.=<<___;
162	xor	%eax,%eax
163	cmp	\$0,`$win64?"%rcx":"%rdi"`
164	je	.Lprobe
165	ud2
166.Lprobe:
167	ret
168.cfi_endproc
169.size	$func,.-$func
170
171.section .rodata align=64
172.align	64
173.type	$TABLE,\@object
174$TABLE:
175	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
176	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
177	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
178	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
179	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
180	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
181	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
183	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
184	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
185	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
186	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
187	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
189	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
190	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
191	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
192	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
193	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
194	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
195	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
196	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
197	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
198	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
199	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
200	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
201	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
202	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
203	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
204	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
205	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
206	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
207
208	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
209	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
210	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
211	.long	0,0,0,0,   0,0,0,0
212	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
213.align	64
214.previous
215___
216
217######################################################################
218# SIMD code paths
219#
220{{{
221($iv,$inout,$roundkey,$temp,
222 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
223
224$aesni_cbc_idx=0;
225@aesni_cbc_block = (
226##	&vmovdqu	($roundkey,"0x00-0x80($inp)");'
227##	&vmovdqu	($inout,($inp));
228##	&mov		($_inp,$inp);
229
230	'&vpxor		($inout,$inout,$roundkey);'.
231	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
232
233	'&vpxor		($inout,$inout,$iv);',
234
235	'&vaesenc	($inout,$inout,$roundkey);'.
236	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
237
238	'&vaesenc	($inout,$inout,$roundkey);'.
239	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
240
241	'&vaesenc	($inout,$inout,$roundkey);'.
242	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
243
244	'&vaesenc	($inout,$inout,$roundkey);'.
245	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
246
247	'&vaesenc	($inout,$inout,$roundkey);'.
248	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
249
250	'&vaesenc	($inout,$inout,$roundkey);'.
251	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
252
253	'&vaesenc	($inout,$inout,$roundkey);'.
254	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
255
256	'&vaesenc	($inout,$inout,$roundkey);'.
257	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
258
259	'&vaesenc	($inout,$inout,$roundkey);'.
260	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
261
262	'&vaesenclast	($temp,$inout,$roundkey);'.
263	' &vaesenc	($inout,$inout,$roundkey);'.
264	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
265
266	'&vpand		($iv,$temp,$mask10);'.
267	' &vaesenc	($inout,$inout,$roundkey);'.
268	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
269
270	'&vaesenclast	($temp,$inout,$roundkey);'.
271	' &vaesenc	($inout,$inout,$roundkey);'.
272	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
273
274	'&vpand		($temp,$temp,$mask12);'.
275	' &vaesenc	($inout,$inout,$roundkey);'.
276	 '&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
277
278	'&vpor		($iv,$iv,$temp);'.
279	' &vaesenclast	($temp,$inout,$roundkey);'.
280	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
281
282##	&mov		($inp,$_inp);
283##	&mov		($out,$_out);
284##	&vpand		($temp,$temp,$mask14);
285##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
288);
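# (Illustrative note: the sixteen snippets above amount to one 16-byte CBC
# block, i.e. the initial whitening/chaining xors, the vaesenc rounds and
# the reload of the first round key, with vaesenclast issued speculatively
# at the 10-, 12- and 14-round positions; $mask10/$mask12/$mask14 later
# select whichever result matches the actual key length, so the same
# instruction sequence serves AES-128/-192/-256.)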
289
290my $a4=$T1;
291my ($a,$b,$c,$d,$e,$f,$g,$h);
292
293sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
294{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
295  my $arg = pop;
296    $arg = "\$$arg" if ($arg*1 eq $arg);
297    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
298}
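# For instance (illustrative only), with $a0 bound to "%r13d" a call such as
#	&ror	($a0,$Sigma1[0]);
# falls through to the AUTOLOAD above and appends
#	ror	$6,%r13d
# to $code: a numeric last argument grows a '$' prefix and the argument
# list is reversed into AT&T source,destination order.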
299
300sub body_00_15 () {
301	(
302	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
303
304	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
305	'&mov	($a,$a1)',
306	'&mov	($a4,$f)',
307
308	'&xor	($a0,$e)',
309	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
310	'&xor	($a4,$g)',			# f^g
311
312	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
313	'&xor	($a1,$a)',
314	'&and	($a4,$e)',			# (f^g)&e
315
316	@aesni_cbc_block[$aesni_cbc_idx++].
317	'&xor	($a0,$e)',
318	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
319	'&mov	($a2,$a)',
320
321	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
322	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
323	'&xor	($a2,$b)',			# a^b, b^c in next round
324
325	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
326	'&add	($h,$a4)',			# h+=Ch(e,f,g)
327	'&and	($a3,$a2)',			# (b^c)&(a^b)
328
329	'&xor	($a1,$a)',
330	'&add	($h,$a0)',			# h+=Sigma1(e)
331	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
332
333	'&add	($d,$h)',			# d+=h
334	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
335	'&add	($h,$a3)',			# h+=Maj(a,b,c)
336
337	'&mov	($a0,$d)',
338	'&add	($a1,$h);'.			# h+=Sigma0(a)
339	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
340	);
341}
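# (Illustrative note: each body_00_15() call returns the couple dozen
# perlasm snippets of one scalar SHA-256 round with the next
# @aesni_cbc_block step spliced in; the code paths below consume it either
# interleaved with the SIMD message schedule or, for the final 16 rounds,
# simply as
#	$aesni_cbc_idx=0;
#	for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } }
# so every group of 16 SHA-256 rounds carries one complete CBC block.)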
342
343if ($avx) {{
344######################################################################
345# XOP code path
346#
347$code.=<<___;
348.type	${func}_xop,\@function,6
349.align	64
350${func}_xop:
351.cfi_startproc
352.Lxop_shortcut:
353	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
354	mov	%rsp,%rax		# copy %rsp
355.cfi_def_cfa_register	%rax
356	push	%rbx
357.cfi_push	%rbx
358	push	%rbp
359.cfi_push	%rbp
360	push	%r12
361.cfi_push	%r12
362	push	%r13
363.cfi_push	%r13
364	push	%r14
365.cfi_push	%r14
366	push	%r15
367.cfi_push	%r15
368	sub	\$`$framesz+$win64*16*10`,%rsp
369	and	\$-64,%rsp		# align stack frame
370
371	shl	\$6,$len
372	sub	$inp,$out		# re-bias
373	sub	$inp,$in0
374	add	$inp,$len		# end of input
375
376	#mov	$inp,$_inp		# saved later
377	mov	$out,$_out
378	mov	$len,$_end
379	#mov	$key,$_key		# remains resident in $inp register
380	mov	$ivp,$_ivp
381	mov	$ctx,$_ctx
382	mov	$in0,$_in0
383	mov	%rax,$_rsp
384.cfi_cfa_expression	$_rsp,deref,+8
385___
386$code.=<<___ if ($win64);
387	movaps	%xmm6,`$framesz+16*0`(%rsp)
388	movaps	%xmm7,`$framesz+16*1`(%rsp)
389	movaps	%xmm8,`$framesz+16*2`(%rsp)
390	movaps	%xmm9,`$framesz+16*3`(%rsp)
391	movaps	%xmm10,`$framesz+16*4`(%rsp)
392	movaps	%xmm11,`$framesz+16*5`(%rsp)
393	movaps	%xmm12,`$framesz+16*6`(%rsp)
394	movaps	%xmm13,`$framesz+16*7`(%rsp)
395	movaps	%xmm14,`$framesz+16*8`(%rsp)
396	movaps	%xmm15,`$framesz+16*9`(%rsp)
397___
398$code.=<<___;
399.Lprologue_xop:
400	vzeroall
401
402	mov	$inp,%r12		# borrow $a4
403	lea	0x80($key),$inp		# size optimization, reassign
404	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
405	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
406	mov	$ctx,%r15		# borrow $a2
407	mov	$in0,%rsi		# borrow $a3
408	vmovdqu	($ivp),$iv		# load IV
409	sub	\$9,%r14
410
411	mov	$SZ*0(%r15),$A
412	mov	$SZ*1(%r15),$B
413	mov	$SZ*2(%r15),$C
414	mov	$SZ*3(%r15),$D
415	mov	$SZ*4(%r15),$E
416	mov	$SZ*5(%r15),$F
417	mov	$SZ*6(%r15),$G
418	mov	$SZ*7(%r15),$H
419
420	vmovdqa	0x00(%r13,%r14,8),$mask14
421	vmovdqa	0x10(%r13,%r14,8),$mask12
422	vmovdqa	0x20(%r13,%r14,8),$mask10
423	vmovdqu	0x00-0x80($inp),$roundkey
424	jmp	.Lloop_xop
425___
426					if ($SZ==4) {	# SHA256
427    my @X = map("%xmm$_",(0..3));
428    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
429
430$code.=<<___;
431.align	16
432.Lloop_xop:
433	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
434	vmovdqu	0x00(%rsi,%r12),@X[0]
435	vmovdqu	0x10(%rsi,%r12),@X[1]
436	vmovdqu	0x20(%rsi,%r12),@X[2]
437	vmovdqu	0x30(%rsi,%r12),@X[3]
438	vpshufb	$t3,@X[0],@X[0]
439	lea	$TABLE(%rip),$Tbl
440	vpshufb	$t3,@X[1],@X[1]
441	vpshufb	$t3,@X[2],@X[2]
442	vpaddd	0x00($Tbl),@X[0],$t0
443	vpshufb	$t3,@X[3],@X[3]
444	vpaddd	0x20($Tbl),@X[1],$t1
445	vpaddd	0x40($Tbl),@X[2],$t2
446	vpaddd	0x60($Tbl),@X[3],$t3
447	vmovdqa	$t0,0x00(%rsp)
448	mov	$A,$a1
449	vmovdqa	$t1,0x10(%rsp)
450	mov	$B,$a3
451	vmovdqa	$t2,0x20(%rsp)
452	xor	$C,$a3			# magic
453	vmovdqa	$t3,0x30(%rsp)
454	mov	$E,$a0
455	jmp	.Lxop_00_47
456
457.align	16
458.Lxop_00_47:
459	sub	\$-16*2*$SZ,$Tbl	# size optimization
460	vmovdqu	(%r12),$inout		# $a4
461	mov	%r12,$_inp		# $a4
462___
463sub XOP_256_00_47 () {
464my $j = shift;
465my $body = shift;
466my @X = @_;
467my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
468
469	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
470	  eval(shift(@insns));
471	  eval(shift(@insns));
472	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
473	  eval(shift(@insns));
474	  eval(shift(@insns));
475	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
476	  eval(shift(@insns));
477	  eval(shift(@insns));
478	&vpsrld		($t0,$t0,$sigma0[2]);
479	  eval(shift(@insns));
480	  eval(shift(@insns));
481	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
482	  eval(shift(@insns));
483	  eval(shift(@insns));
484	  eval(shift(@insns));
485	  eval(shift(@insns));
486	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
487	  eval(shift(@insns));
488	  eval(shift(@insns));
489	&vpxor		($t0,$t0,$t1);
490	  eval(shift(@insns));
491	  eval(shift(@insns));
492	  eval(shift(@insns));
493	  eval(shift(@insns));
494	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
495	  eval(shift(@insns));
496	  eval(shift(@insns));
497	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
498	  eval(shift(@insns));
499	  eval(shift(@insns));
500	 &vpsrld	($t2,@X[3],$sigma1[2]);
501	  eval(shift(@insns));
502	  eval(shift(@insns));
503	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
504	  eval(shift(@insns));
505	  eval(shift(@insns));
506	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
507	  eval(shift(@insns));
508	  eval(shift(@insns));
509	 &vpxor		($t3,$t3,$t2);
510	  eval(shift(@insns));
511	  eval(shift(@insns));
512	  eval(shift(@insns));
513	  eval(shift(@insns));
514	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
515	  eval(shift(@insns));
516	  eval(shift(@insns));
517	  eval(shift(@insns));
518	  eval(shift(@insns));
519	&vpsrldq	($t3,$t3,8);
520	  eval(shift(@insns));
521	  eval(shift(@insns));
522	  eval(shift(@insns));
523	  eval(shift(@insns));
524	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
525	  eval(shift(@insns));
526	  eval(shift(@insns));
527	  eval(shift(@insns));
528	  eval(shift(@insns));
529	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
530	  eval(shift(@insns));
531	  eval(shift(@insns));
532	 &vpsrld	($t2,@X[0],$sigma1[2]);
533	  eval(shift(@insns));
534	  eval(shift(@insns));
535	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
536	  eval(shift(@insns));
537	  eval(shift(@insns));
538	 &vpxor		($t3,$t3,$t2);
539	  eval(shift(@insns));
540	  eval(shift(@insns));
541	  eval(shift(@insns));
542	  eval(shift(@insns));
543	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
544	  eval(shift(@insns));
545	  eval(shift(@insns));
546	  eval(shift(@insns));
547	  eval(shift(@insns));
548	&vpslldq	($t3,$t3,8);		# 22 instructions
549	  eval(shift(@insns));
550	  eval(shift(@insns));
551	  eval(shift(@insns));
552	  eval(shift(@insns));
553	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
554	  eval(shift(@insns));
555	  eval(shift(@insns));
556	  eval(shift(@insns));
557	  eval(shift(@insns));
558	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
559	  foreach (@insns) { eval; }		# remaining instructions
560	&vmovdqa	(16*$j."(%rsp)",$t2);
561}
562
563    $aesni_cbc_idx=0;
564    for ($i=0,$j=0; $j<4; $j++) {
565	&XOP_256_00_47($j,\&body_00_15,@X);
566	push(@X,shift(@X));			# rotate(@X)
567    }
568    	&mov		("%r12",$_inp);		# borrow $a4
569	&vpand		($temp,$temp,$mask14);
570	&mov		("%r15",$_out);		# borrow $a2
571	&vpor		($iv,$iv,$temp);
572	&vmovdqu	("(%r15,%r12)",$iv);	# write output
573	&lea		("%r12","16(%r12)");	# inp++
574
575	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
576	&jne	(".Lxop_00_47");
577
578	&vmovdqu	($inout,"(%r12)");
579	&mov		($_inp,"%r12");
580
581    $aesni_cbc_idx=0;
582    for ($i=0; $i<16; ) {
583	foreach(body_00_15()) { eval; }
584    }
585					}
586$code.=<<___;
587	mov	$_inp,%r12		# borrow $a4
588	mov	$_out,%r13		# borrow $a0
589	mov	$_ctx,%r15		# borrow $a2
590	mov	$_in0,%rsi		# borrow $a3
591
592	vpand	$mask14,$temp,$temp
593	mov	$a1,$A
594	vpor	$temp,$iv,$iv
595	vmovdqu	$iv,(%r13,%r12)		# write output
596	lea	16(%r12),%r12		# inp++
597
598	add	$SZ*0(%r15),$A
599	add	$SZ*1(%r15),$B
600	add	$SZ*2(%r15),$C
601	add	$SZ*3(%r15),$D
602	add	$SZ*4(%r15),$E
603	add	$SZ*5(%r15),$F
604	add	$SZ*6(%r15),$G
605	add	$SZ*7(%r15),$H
606
607	cmp	$_end,%r12
608
609	mov	$A,$SZ*0(%r15)
610	mov	$B,$SZ*1(%r15)
611	mov	$C,$SZ*2(%r15)
612	mov	$D,$SZ*3(%r15)
613	mov	$E,$SZ*4(%r15)
614	mov	$F,$SZ*5(%r15)
615	mov	$G,$SZ*6(%r15)
616	mov	$H,$SZ*7(%r15)
617
618	jb	.Lloop_xop
619
620	mov	$_ivp,$ivp
621	mov	$_rsp,%rsi
622.cfi_def_cfa	%rsi,8
623	vmovdqu	$iv,($ivp)		# output IV
624	vzeroall
625___
626$code.=<<___ if ($win64);
627	movaps	`$framesz+16*0`(%rsp),%xmm6
628	movaps	`$framesz+16*1`(%rsp),%xmm7
629	movaps	`$framesz+16*2`(%rsp),%xmm8
630	movaps	`$framesz+16*3`(%rsp),%xmm9
631	movaps	`$framesz+16*4`(%rsp),%xmm10
632	movaps	`$framesz+16*5`(%rsp),%xmm11
633	movaps	`$framesz+16*6`(%rsp),%xmm12
634	movaps	`$framesz+16*7`(%rsp),%xmm13
635	movaps	`$framesz+16*8`(%rsp),%xmm14
636	movaps	`$framesz+16*9`(%rsp),%xmm15
637___
638$code.=<<___;
639	mov	-48(%rsi),%r15
640.cfi_restore	%r15
641	mov	-40(%rsi),%r14
642.cfi_restore	%r14
643	mov	-32(%rsi),%r13
644.cfi_restore	%r13
645	mov	-24(%rsi),%r12
646.cfi_restore	%r12
647	mov	-16(%rsi),%rbp
648.cfi_restore	%rbp
649	mov	-8(%rsi),%rbx
650.cfi_restore	%rbx
651	lea	(%rsi),%rsp
652.cfi_def_cfa_register	%rsp
653.Lepilogue_xop:
654	ret
655.cfi_endproc
656.size	${func}_xop,.-${func}_xop
657___
658######################################################################
659# AVX+shrd code path
660#
661local *ror = sub { &shrd(@_[0],@_) };
662
663$code.=<<___;
664.type	${func}_avx,\@function,6
665.align	64
666${func}_avx:
667.cfi_startproc
668.Lavx_shortcut:
669	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
670	mov	%rsp,%rax		# copy %rsp
671.cfi_def_cfa_register	%rax
672	push	%rbx
673.cfi_push	%rbx
674	push	%rbp
675.cfi_push	%rbp
676	push	%r12
677.cfi_push	%r12
678	push	%r13
679.cfi_push	%r13
680	push	%r14
681.cfi_push	%r14
682	push	%r15
683.cfi_push	%r15
684	sub	\$`$framesz+$win64*16*10`,%rsp
685	and	\$-64,%rsp		# align stack frame
686
687	shl	\$6,$len
688	sub	$inp,$out		# re-bias
689	sub	$inp,$in0
690	add	$inp,$len		# end of input
691
692	#mov	$inp,$_inp		# saved later
693	mov	$out,$_out
694	mov	$len,$_end
695	#mov	$key,$_key		# remains resident in $inp register
696	mov	$ivp,$_ivp
697	mov	$ctx,$_ctx
698	mov	$in0,$_in0
699	mov	%rax,$_rsp
700.cfi_cfa_expression	$_rsp,deref,+8
701___
702$code.=<<___ if ($win64);
703	movaps	%xmm6,`$framesz+16*0`(%rsp)
704	movaps	%xmm7,`$framesz+16*1`(%rsp)
705	movaps	%xmm8,`$framesz+16*2`(%rsp)
706	movaps	%xmm9,`$framesz+16*3`(%rsp)
707	movaps	%xmm10,`$framesz+16*4`(%rsp)
708	movaps	%xmm11,`$framesz+16*5`(%rsp)
709	movaps	%xmm12,`$framesz+16*6`(%rsp)
710	movaps	%xmm13,`$framesz+16*7`(%rsp)
711	movaps	%xmm14,`$framesz+16*8`(%rsp)
712	movaps	%xmm15,`$framesz+16*9`(%rsp)
713___
714$code.=<<___;
715.Lprologue_avx:
716	vzeroall
717
718	mov	$inp,%r12		# borrow $a4
719	lea	0x80($key),$inp		# size optimization, reassign
720	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
721	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
722	mov	$ctx,%r15		# borrow $a2
723	mov	$in0,%rsi		# borrow $a3
724	vmovdqu	($ivp),$iv		# load IV
725	sub	\$9,%r14
726
727	mov	$SZ*0(%r15),$A
728	mov	$SZ*1(%r15),$B
729	mov	$SZ*2(%r15),$C
730	mov	$SZ*3(%r15),$D
731	mov	$SZ*4(%r15),$E
732	mov	$SZ*5(%r15),$F
733	mov	$SZ*6(%r15),$G
734	mov	$SZ*7(%r15),$H
735
736	vmovdqa	0x00(%r13,%r14,8),$mask14
737	vmovdqa	0x10(%r13,%r14,8),$mask12
738	vmovdqa	0x20(%r13,%r14,8),$mask10
739	vmovdqu	0x00-0x80($inp),$roundkey
740___
741					if ($SZ==4) {	# SHA256
742    my @X = map("%xmm$_",(0..3));
743    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
744
745$code.=<<___;
746	jmp	.Lloop_avx
747.align	16
748.Lloop_avx:
749	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
750	vmovdqu	0x00(%rsi,%r12),@X[0]
751	vmovdqu	0x10(%rsi,%r12),@X[1]
752	vmovdqu	0x20(%rsi,%r12),@X[2]
753	vmovdqu	0x30(%rsi,%r12),@X[3]
754	vpshufb	$t3,@X[0],@X[0]
755	lea	$TABLE(%rip),$Tbl
756	vpshufb	$t3,@X[1],@X[1]
757	vpshufb	$t3,@X[2],@X[2]
758	vpaddd	0x00($Tbl),@X[0],$t0
759	vpshufb	$t3,@X[3],@X[3]
760	vpaddd	0x20($Tbl),@X[1],$t1
761	vpaddd	0x40($Tbl),@X[2],$t2
762	vpaddd	0x60($Tbl),@X[3],$t3
763	vmovdqa	$t0,0x00(%rsp)
764	mov	$A,$a1
765	vmovdqa	$t1,0x10(%rsp)
766	mov	$B,$a3
767	vmovdqa	$t2,0x20(%rsp)
768	xor	$C,$a3			# magic
769	vmovdqa	$t3,0x30(%rsp)
770	mov	$E,$a0
771	jmp	.Lavx_00_47
772
773.align	16
774.Lavx_00_47:
775	sub	\$-16*2*$SZ,$Tbl	# size optimization
776	vmovdqu	(%r12),$inout		# $a4
777	mov	%r12,$_inp		# $a4
778___
779sub Xupdate_256_AVX () {
780	(
781	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
782	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
783	'&vpsrld	($t2,$t0,$sigma0[0]);',
784	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
785	'&vpsrld	($t3,$t0,$sigma0[2])',
786	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
787	'&vpxor		($t0,$t3,$t2)',
788	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
789	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
790	'&vpxor		($t0,$t0,$t1)',
791	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
792	'&vpxor		($t0,$t0,$t2)',
793	 '&vpsrld	($t2,$t3,$sigma1[2]);',
794	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
795	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
796	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
797	 '&vpxor	($t2,$t2,$t3);',
798	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
799	 '&vpxor	($t2,$t2,$t3)',		# sigma1(X[14..15])
800	 '&vpshufd	($t2,$t2,0b10000100)',
801	 '&vpsrldq	($t2,$t2,8)',
802	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
803	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
804	 '&vpsrld	($t2,$t3,$sigma1[2])',
805	 '&vpsrlq	($t3,$t3,$sigma1[0])',
806	 '&vpxor	($t2,$t2,$t3);',
807	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
808	 '&vpxor	($t2,$t2,$t3)',
809	 '&vpshufd	($t2,$t2,0b11101000)',
810	 '&vpslldq	($t2,$t2,8)',
811	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
812	);
813}
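# (Illustrative note: the sequence above is the standard SHA-256 message
# schedule
#	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# evaluated four words at a time with @X[0..3] holding W[i..i+15]; since
# W[i+2] and W[i+3] need sigma1() of the freshly computed W[i..i+1], the
# sigma1 term is applied in two halves, which is what the trailing
# vpshufd/vpsrldq/vpslldq shuffling implements.)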
814
815sub AVX_256_00_47 () {
816my $j = shift;
817my $body = shift;
818my @X = @_;
819my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
820
821	foreach (Xupdate_256_AVX()) {		# 29 instructions
822	    eval;
823	    eval(shift(@insns));
824	    eval(shift(@insns));
825	    eval(shift(@insns));
826	}
827	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
828	  foreach (@insns) { eval; }		# remaining instructions
829	&vmovdqa	(16*$j."(%rsp)",$t2);
830}
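# (Illustrative note: this is where the stitching ratio is set: the 29 SIMD
# schedule instructions from Xupdate_256_AVX() are dealt out against the
# 4 x 26 = 104 scalar snippets from body_00_15(), roughly three scalar
# instructions per SIMD one, so the vector message schedule and the scalar
# rounds (with their embedded AES steps) issue in parallel.)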
831
832    $aesni_cbc_idx=0;
833    for ($i=0,$j=0; $j<4; $j++) {
834	&AVX_256_00_47($j,\&body_00_15,@X);
835	push(@X,shift(@X));			# rotate(@X)
836    }
837    	&mov		("%r12",$_inp);		# borrow $a4
838	&vpand		($temp,$temp,$mask14);
839	&mov		("%r15",$_out);		# borrow $a2
840	&vpor		($iv,$iv,$temp);
841	&vmovdqu	("(%r15,%r12)",$iv);	# write output
842	&lea		("%r12","16(%r12)");	# inp++
843
844	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
845	&jne	(".Lavx_00_47");
846
847	&vmovdqu	($inout,"(%r12)");
848	&mov		($_inp,"%r12");
849
850    $aesni_cbc_idx=0;
851    for ($i=0; $i<16; ) {
852	foreach(body_00_15()) { eval; }
853    }
854
855					}
856$code.=<<___;
857	mov	$_inp,%r12		# borrow $a4
858	mov	$_out,%r13		# borrow $a0
859	mov	$_ctx,%r15		# borrow $a2
860	mov	$_in0,%rsi		# borrow $a3
861
862	vpand	$mask14,$temp,$temp
863	mov	$a1,$A
864	vpor	$temp,$iv,$iv
865	vmovdqu	$iv,(%r13,%r12)		# write output
866	lea	16(%r12),%r12		# inp++
867
868	add	$SZ*0(%r15),$A
869	add	$SZ*1(%r15),$B
870	add	$SZ*2(%r15),$C
871	add	$SZ*3(%r15),$D
872	add	$SZ*4(%r15),$E
873	add	$SZ*5(%r15),$F
874	add	$SZ*6(%r15),$G
875	add	$SZ*7(%r15),$H
876
877	cmp	$_end,%r12
878
879	mov	$A,$SZ*0(%r15)
880	mov	$B,$SZ*1(%r15)
881	mov	$C,$SZ*2(%r15)
882	mov	$D,$SZ*3(%r15)
883	mov	$E,$SZ*4(%r15)
884	mov	$F,$SZ*5(%r15)
885	mov	$G,$SZ*6(%r15)
886	mov	$H,$SZ*7(%r15)
887	jb	.Lloop_avx
888
889	mov	$_ivp,$ivp
890	mov	$_rsp,%rsi
891.cfi_def_cfa	%rsi,8
892	vmovdqu	$iv,($ivp)		# output IV
893	vzeroall
894___
895$code.=<<___ if ($win64);
896	movaps	`$framesz+16*0`(%rsp),%xmm6
897	movaps	`$framesz+16*1`(%rsp),%xmm7
898	movaps	`$framesz+16*2`(%rsp),%xmm8
899	movaps	`$framesz+16*3`(%rsp),%xmm9
900	movaps	`$framesz+16*4`(%rsp),%xmm10
901	movaps	`$framesz+16*5`(%rsp),%xmm11
902	movaps	`$framesz+16*6`(%rsp),%xmm12
903	movaps	`$framesz+16*7`(%rsp),%xmm13
904	movaps	`$framesz+16*8`(%rsp),%xmm14
905	movaps	`$framesz+16*9`(%rsp),%xmm15
906___
907$code.=<<___;
908	mov	-48(%rsi),%r15
909.cfi_restore	%r15
910	mov	-40(%rsi),%r14
911.cfi_restore	%r14
912	mov	-32(%rsi),%r13
913.cfi_restore	%r13
914	mov	-24(%rsi),%r12
915.cfi_restore	%r12
916	mov	-16(%rsi),%rbp
917.cfi_restore	%rbp
918	mov	-8(%rsi),%rbx
919.cfi_restore	%rbx
920	lea	(%rsi),%rsp
921.cfi_def_cfa_register	%rsp
922.Lepilogue_avx:
923	ret
924.cfi_endproc
925.size	${func}_avx,.-${func}_avx
926___
927
928if ($avx>1) {{
929######################################################################
930# AVX2+BMI code path
931#
932my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
933my $PUSH8=8*2*$SZ;
934use integer;
935
936sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
938	(
939	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
940
941	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
942	'&and	($a4,$e)',		# f&e
943	'&rorx	($a0,$e,$Sigma1[2])',
944	'&rorx	($a2,$e,$Sigma1[1])',
945
946	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
947	'&lea	($h,"($h,$a4)")',
948	'&andn	($a4,$e,$g)',		# ~e&g
949	'&xor	($a0,$a2)',
950
951	'&rorx	($a1,$e,$Sigma1[0])',
952	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
953	'&xor	($a0,$a1)',		# Sigma1(e)
954	'&mov	($a2,$a)',
955
956	'&rorx	($a4,$a,$Sigma0[2])',
957	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
958	'&xor	($a2,$b)',		# a^b, b^c in next round
959	'&rorx	($a1,$a,$Sigma0[1])',
960
961	'&rorx	($a0,$a,$Sigma0[0])',
962	'&lea	($d,"($d,$h)")',	# d+=h
963	'&and	($a3,$a2)',		# (b^c)&(a^b)
964	@aesni_cbc_block[$aesni_cbc_idx++].
965	'&xor	($a1,$a4)',
966
967	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
968	'&xor	($a1,$a0)',		# Sigma0(a)
969	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
970	'&mov	($a4,$e)',		# copy of f in future
971
972	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
973	);
	# and at the finish one has to do $a+=$a1
975}
976
977$code.=<<___;
978.type	${func}_avx2,\@function,6
979.align	64
980${func}_avx2:
981.cfi_startproc
982.Lavx2_shortcut:
983	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
984	mov	%rsp,%rax		# copy %rsp
985.cfi_def_cfa_register	%rax
986	push	%rbx
987.cfi_push	%rbx
988	push	%rbp
989.cfi_push	%rbp
990	push	%r12
991.cfi_push	%r12
992	push	%r13
993.cfi_push	%r13
994	push	%r14
995.cfi_push	%r14
996	push	%r15
997.cfi_push	%r15
998	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
999	and	\$-256*$SZ,%rsp		# align stack frame
1000	add	\$`2*$SZ*($rounds-8)`,%rsp
1001
1002	shl	\$6,$len
1003	sub	$inp,$out		# re-bias
1004	sub	$inp,$in0
1005	add	$inp,$len		# end of input
1006
1007	#mov	$inp,$_inp		# saved later
1008	#mov	$out,$_out		# kept in $offload
1009	mov	$len,$_end
1010	#mov	$key,$_key		# remains resident in $inp register
1011	mov	$ivp,$_ivp
1012	mov	$ctx,$_ctx
1013	mov	$in0,$_in0
1014	mov	%rax,$_rsp
1015.cfi_cfa_expression	$_rsp,deref,+8
1016___
1017$code.=<<___ if ($win64);
1018	movaps	%xmm6,`$framesz+16*0`(%rsp)
1019	movaps	%xmm7,`$framesz+16*1`(%rsp)
1020	movaps	%xmm8,`$framesz+16*2`(%rsp)
1021	movaps	%xmm9,`$framesz+16*3`(%rsp)
1022	movaps	%xmm10,`$framesz+16*4`(%rsp)
1023	movaps	%xmm11,`$framesz+16*5`(%rsp)
1024	movaps	%xmm12,`$framesz+16*6`(%rsp)
1025	movaps	%xmm13,`$framesz+16*7`(%rsp)
1026	movaps	%xmm14,`$framesz+16*8`(%rsp)
1027	movaps	%xmm15,`$framesz+16*9`(%rsp)
1028___
1029$code.=<<___;
1030.Lprologue_avx2:
1031	vzeroall
1032
1033	mov	$inp,%r13		# borrow $a0
1034	vpinsrq	\$1,$out,$offload,$offload
1035	lea	0x80($key),$inp		# size optimization, reassign
1036	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
1037	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
1038	mov	$ctx,%r15		# borrow $a2
1039	mov	$in0,%rsi		# borrow $a3
1040	vmovdqu	($ivp),$iv		# load IV
1041	lea	-9(%r14),%r14
1042
1043	vmovdqa	0x00(%r12,%r14,8),$mask14
1044	vmovdqa	0x10(%r12,%r14,8),$mask12
1045	vmovdqa	0x20(%r12,%r14,8),$mask10
1046
1047	sub	\$-16*$SZ,%r13		# inp++, size optimization
1048	mov	$SZ*0(%r15),$A
1049	lea	(%rsi,%r13),%r12	# borrow $a0
1050	mov	$SZ*1(%r15),$B
1051	cmp	$len,%r13		# $_end
1052	mov	$SZ*2(%r15),$C
1053	cmove	%rsp,%r12		# next block or random data
1054	mov	$SZ*3(%r15),$D
1055	mov	$SZ*4(%r15),$E
1056	mov	$SZ*5(%r15),$F
1057	mov	$SZ*6(%r15),$G
1058	mov	$SZ*7(%r15),$H
1059	vmovdqu	0x00-0x80($inp),$roundkey
1060___
1061					if ($SZ==4) {	# SHA256
1062    my @X = map("%ymm$_",(0..3));
1063    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1064
1065$code.=<<___;
1066	jmp	.Loop_avx2
1067.align	16
1068.Loop_avx2:
1069	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1070	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
1071	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
1072	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
1073	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
1074
1075	vinserti128	\$1,(%r12),@X[0],@X[0]
1076	vinserti128	\$1,16(%r12),@X[1],@X[1]
1077	 vpshufb	$t3,@X[0],@X[0]
1078	vinserti128	\$1,32(%r12),@X[2],@X[2]
1079	 vpshufb	$t3,@X[1],@X[1]
1080	vinserti128	\$1,48(%r12),@X[3],@X[3]
1081
1082	lea	$TABLE(%rip),$Tbl
1083	vpshufb	$t3,@X[2],@X[2]
1084	lea	-16*$SZ(%r13),%r13
1085	vpaddd	0x00($Tbl),@X[0],$t0
1086	vpshufb	$t3,@X[3],@X[3]
1087	vpaddd	0x20($Tbl),@X[1],$t1
1088	vpaddd	0x40($Tbl),@X[2],$t2
1089	vpaddd	0x60($Tbl),@X[3],$t3
1090	vmovdqa	$t0,0x00(%rsp)
1091	xor	$a1,$a1
1092	vmovdqa	$t1,0x20(%rsp)
1093___
1094$code.=<<___ if (!$win64);
1095# temporarily use %rsi as frame pointer
1096        mov     $_rsp,%rsi
1097.cfi_def_cfa    %rsi,8
1098___
1099$code.=<<___;
1100	lea	-$PUSH8(%rsp),%rsp
1101___
1102$code.=<<___ if (!$win64);
1103# the frame info is at $_rsp, but the stack is moving...
1104# so a second frame pointer is saved at -8(%rsp)
1105# that is in the red zone
1106        mov     %rsi,-8(%rsp)
1107.cfi_cfa_expression     %rsp-8,deref,+8
1108___
1109$code.=<<___;
1110	mov	$B,$a3
1111	vmovdqa	$t2,0x00(%rsp)
1112	xor	$C,$a3			# magic
1113	vmovdqa	$t3,0x20(%rsp)
1114	mov	$F,$a4
1115	sub	\$-16*2*$SZ,$Tbl	# size optimization
1116	jmp	.Lavx2_00_47
1117
1118.align	16
1119.Lavx2_00_47:
1120	vmovdqu	(%r13),$inout
1121	vpinsrq	\$0,%r13,$offload,$offload
1122___
1123
1124sub AVX2_256_00_47 () {
1125my $j = shift;
1126my $body = shift;
1127my @X = @_;
1128my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1129my $base = "+2*$PUSH8(%rsp)";
1130
1131	if (($j%2)==0) {
1132	&lea	("%rsp","-$PUSH8(%rsp)");
1133$code.=<<___ if (!$win64);
1134.cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
1135# copy secondary frame pointer to new location again at -8(%rsp)
1136        pushq   $PUSH8-8(%rsp)
1137.cfi_cfa_expression     %rsp,deref,+8
1138        lea     8(%rsp),%rsp
1139.cfi_cfa_expression     %rsp-8,deref,+8
1140___
1141	}
1142	foreach (Xupdate_256_AVX()) {		# 29 instructions
1143	    eval;
1144	    eval(shift(@insns));
1145	    eval(shift(@insns));
1146	    eval(shift(@insns));
1147	}
1148	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1149	  foreach (@insns) { eval; }		# remaining instructions
1150	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1151}
1152    $aesni_cbc_idx=0;
1153    for ($i=0,$j=0; $j<4; $j++) {
1154	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1155	push(@X,shift(@X));			# rotate(@X)
1156    }
1157	&vmovq		("%r13",$offload);	# borrow $a0
1158	&vpextrq	("%r15",$offload,1);	# borrow $a2
1159	&vpand		($temp,$temp,$mask14);
1160	&vpor		($iv,$iv,$temp);
1161	&vmovdqu	("(%r15,%r13)",$iv);	# write output
1162	&lea		("%r13","16(%r13)");	# inp++
1163
1164	&lea	($Tbl,16*2*$SZ."($Tbl)");
1165	&cmpb	(($SZ-1)."($Tbl)",0);
1166	&jne	(".Lavx2_00_47");
1167
1168	&vmovdqu	($inout,"(%r13)");
1169	&vpinsrq	($offload,$offload,"%r13",0);
1170
1171    $aesni_cbc_idx=0;
1172    for ($i=0; $i<16; ) {
1173	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1174	foreach(bodyx_00_15()) { eval; }
1175    }
1176					}
1177$code.=<<___;
1178	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
1179	vmovq	$offload,%r13			# $_inp, borrow $a0
1180	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1181	add	$a1,$A
1182	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1183
1184	vpand	$mask14,$temp,$temp
1185	vpor	$temp,$iv,$iv
1186	vmovdqu	$iv,(%r12,%r13)			# write output
1187	lea	16(%r13),%r13
1188
1189	add	$SZ*0(%r15),$A
1190	add	$SZ*1(%r15),$B
1191	add	$SZ*2(%r15),$C
1192	add	$SZ*3(%r15),$D
1193	add	$SZ*4(%r15),$E
1194	add	$SZ*5(%r15),$F
1195	add	$SZ*6(%r15),$G
1196	add	$SZ*7(%r15),$H
1197
1198	mov	$A,$SZ*0(%r15)
1199	mov	$B,$SZ*1(%r15)
1200	mov	$C,$SZ*2(%r15)
1201	mov	$D,$SZ*3(%r15)
1202	mov	$E,$SZ*4(%r15)
1203	mov	$F,$SZ*5(%r15)
1204	mov	$G,$SZ*6(%r15)
1205	mov	$H,$SZ*7(%r15)
1206
1207	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
1208	je	.Ldone_avx2
1209
1210	xor	$a1,$a1
1211	mov	$B,$a3
1212	mov	$F,$a4
1213	xor	$C,$a3			# magic
1214	jmp	.Lower_avx2
1215.align	16
1216.Lower_avx2:
1217	vmovdqu	(%r13),$inout
1218	vpinsrq	\$0,%r13,$offload,$offload
1219___
1220    $aesni_cbc_idx=0;
1221    for ($i=0; $i<16; ) {
1222	my $base="+16($Tbl)";
1223	foreach(bodyx_00_15()) { eval; }
1224	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
1225    }
1226$code.=<<___;
1227	vmovq	$offload,%r13			# borrow $a0
1228	vpextrq	\$1,$offload,%r15		# borrow $a2
1229	vpand	$mask14,$temp,$temp
1230	vpor	$temp,$iv,$iv
1231	lea	-$PUSH8($Tbl),$Tbl
1232	vmovdqu	$iv,(%r15,%r13)			# write output
1233	lea	16(%r13),%r13			# inp++
1234	cmp	%rsp,$Tbl
1235	jae	.Lower_avx2
1236
1237	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
1238	lea	16*$SZ(%r13),%r13
1239	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
1240	add	$a1,$A
1241	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1242
1243	add	$SZ*0(%r15),$A
1244	add	$SZ*1(%r15),$B
1245	add	$SZ*2(%r15),$C
1246	add	$SZ*3(%r15),$D
1247	add	$SZ*4(%r15),$E
1248	add	$SZ*5(%r15),$F
1249	add	$SZ*6(%r15),$G
1250	lea	(%rsi,%r13),%r12
1251	add	$SZ*7(%r15),$H
1252
1253	cmp	$_end,%r13
1254
1255	mov	$A,$SZ*0(%r15)
1256	cmove	%rsp,%r12		# next block or stale data
1257	mov	$B,$SZ*1(%r15)
1258	mov	$C,$SZ*2(%r15)
1259	mov	$D,$SZ*3(%r15)
1260	mov	$E,$SZ*4(%r15)
1261	mov	$F,$SZ*5(%r15)
1262	mov	$G,$SZ*6(%r15)
1263	mov	$H,$SZ*7(%r15)
1264
1265	jbe	.Loop_avx2
1266	lea	(%rsp),$Tbl
1267# temporarily use $Tbl as index to $_rsp
1268# this avoids the need to save a secondary frame pointer at -8(%rsp)
1269.cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
1270
1271.Ldone_avx2:
1272	mov	16*$SZ+4*8($Tbl),$ivp
1273	mov	16*$SZ+7*8($Tbl),%rsi
1274.cfi_def_cfa	%rsi,8
1275	vmovdqu	$iv,($ivp)		# output IV
1276	vzeroall
1277___
1278$code.=<<___ if ($win64);
1279	movaps	`$framesz+16*0`($Tbl),%xmm6
1280	movaps	`$framesz+16*1`($Tbl),%xmm7
1281	movaps	`$framesz+16*2`($Tbl),%xmm8
1282	movaps	`$framesz+16*3`($Tbl),%xmm9
1283	movaps	`$framesz+16*4`($Tbl),%xmm10
1284	movaps	`$framesz+16*5`($Tbl),%xmm11
1285	movaps	`$framesz+16*6`($Tbl),%xmm12
1286	movaps	`$framesz+16*7`($Tbl),%xmm13
1287	movaps	`$framesz+16*8`($Tbl),%xmm14
1288	movaps	`$framesz+16*9`($Tbl),%xmm15
1289___
1290$code.=<<___;
1291	mov	-48(%rsi),%r15
1292.cfi_restore	%r15
1293	mov	-40(%rsi),%r14
1294.cfi_restore	%r14
1295	mov	-32(%rsi),%r13
1296.cfi_restore	%r13
1297	mov	-24(%rsi),%r12
1298.cfi_restore	%r12
1299	mov	-16(%rsi),%rbp
1300.cfi_restore	%rbp
1301	mov	-8(%rsi),%rbx
1302.cfi_restore	%rbx
1303	lea	(%rsi),%rsp
1304.cfi_def_cfa_register	%rsp
1305.Lepilogue_avx2:
1306	ret
1307.cfi_endproc
1308.size	${func}_avx2,.-${func}_avx2
1309___
1310}}
1311}}
1312{{
1313my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1314
1315my ($rounds,$Tbl)=("%r11d","%rbx");
1316
1317my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1318my @rndkey=("%xmm4","%xmm5");
1319my $r=0;
1320my $sn=0;
1321
1322my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1323my @MSG=map("%xmm$_",(10..13));
1324
1325my $aesenc=sub {
1326  use integer;
1327  my ($n,$k)=($r/10,$r%10);
1328    if ($k==0) {
1329      $code.=<<___;
1330	movups		`16*$n`($in0),$in		# load input
1331	xorps		$rndkey0,$in
1332___
1333      $code.=<<___ if ($n);
1334	movups		$iv,`16*($n-1)`($out,$in0)	# write output
1335___
1336      $code.=<<___;
1337	xorps		$in,$iv
1338	movups		`32+16*$k-112`($key),$rndkey[1]
1339	aesenc		$rndkey[0],$iv
1340___
1341    } elsif ($k==9) {
1342      $sn++;
1343      $code.=<<___;
1344	cmp		\$11,$rounds
1345	jb		.Laesenclast$sn
1346	movups		`32+16*($k+0)-112`($key),$rndkey[1]
1347	aesenc		$rndkey[0],$iv
1348	movups		`32+16*($k+1)-112`($key),$rndkey[0]
1349	aesenc		$rndkey[1],$iv
1350	je		.Laesenclast$sn
1351	movups		`32+16*($k+2)-112`($key),$rndkey[1]
1352	aesenc		$rndkey[0],$iv
1353	movups		`32+16*($k+3)-112`($key),$rndkey[0]
1354	aesenc		$rndkey[1],$iv
1355.Laesenclast$sn:
1356	aesenclast	$rndkey[0],$iv
1357	movups		16-112($key),$rndkey[1]		# forward reference
1358	nop
1359___
1360    } else {
1361      $code.=<<___;
1362	movups		`32+16*$k-112`($key),$rndkey[1]
1363	aesenc		$rndkey[0],$iv
1364___
1365    }
1366    $r++;	unshift(@rndkey,pop(@rndkey));
1367};
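# (Illustrative note: $r counts emitted aesenc steps; at the start of every
# group of ten, i.e. ($k==0), the closure also loads the next input block,
# applies the CBC whitening/chaining xors and stores the previous
# ciphertext block, while the ($k==9) case appends the extra rounds needed
# for AES-192/-256 at run time.  The SHA code below calls it often enough
# that 40 steps, i.e. four 16-byte CBC blocks, are emitted per 64-byte
# SHA-256 block.)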
1368
1369if ($shaext) {
1370my $Tbl="%rax";
1371
1372$code.=<<___;
1373.type	${func}_shaext,\@function,6
1374.align	32
1375${func}_shaext:
1376.cfi_startproc
1377	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
1378___
1379$code.=<<___ if ($win64);
1380	lea	`-8-10*16`(%rsp),%rsp
1381	movaps	%xmm6,-8-10*16(%rax)
1382	movaps	%xmm7,-8-9*16(%rax)
1383	movaps	%xmm8,-8-8*16(%rax)
1384	movaps	%xmm9,-8-7*16(%rax)
1385	movaps	%xmm10,-8-6*16(%rax)
1386	movaps	%xmm11,-8-5*16(%rax)
1387	movaps	%xmm12,-8-4*16(%rax)
1388	movaps	%xmm13,-8-3*16(%rax)
1389	movaps	%xmm14,-8-2*16(%rax)
1390	movaps	%xmm15,-8-1*16(%rax)
1391.Lprologue_shaext:
1392___
1393$code.=<<___;
1394	lea		K256+0x80(%rip),$Tbl
1395	movdqu		($ctx),$ABEF		# DCBA
1396	movdqu		16($ctx),$CDGH		# HGFE
1397	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
1398
1399	mov		240($key),$rounds
1400	sub		$in0,$out
1401	movups		($key),$rndkey0		# $key[0]
1402	movups		($ivp),$iv		# load IV
1403	movups		16($key),$rndkey[0]	# forward reference
1404	lea		112($key),$key		# size optimization
1405
1406	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
1407	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
1408	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
1409	movdqa		$TMP,$BSWAP		# offload
1410	palignr		\$8,$CDGH,$ABEF		# ABEF
1411	punpcklqdq	$Wi,$CDGH		# CDGH
1412
1413	jmp	.Loop_shaext
1414
1415.align	16
1416.Loop_shaext:
1417	movdqu		($inp),@MSG[0]
1418	movdqu		0x10($inp),@MSG[1]
1419	movdqu		0x20($inp),@MSG[2]
1420	pshufb		$TMP,@MSG[0]
1421	movdqu		0x30($inp),@MSG[3]
1422
1423	movdqa		0*32-0x80($Tbl),$Wi
1424	paddd		@MSG[0],$Wi
1425	pshufb		$TMP,@MSG[1]
1426	movdqa		$CDGH,$CDGH_SAVE	# offload
1427	movdqa		$ABEF,$ABEF_SAVE	# offload
1428___
1429	&$aesenc();
1430$code.=<<___;
1431	sha256rnds2	$ABEF,$CDGH		# 0-3
1432	pshufd		\$0x0e,$Wi,$Wi
1433___
1434	&$aesenc();
1435$code.=<<___;
1436	sha256rnds2	$CDGH,$ABEF
1437
1438	movdqa		1*32-0x80($Tbl),$Wi
1439	paddd		@MSG[1],$Wi
1440	pshufb		$TMP,@MSG[2]
1441	lea		0x40($inp),$inp
1442___
1443	&$aesenc();
1444$code.=<<___;
1445	sha256rnds2	$ABEF,$CDGH		# 4-7
1446	pshufd		\$0x0e,$Wi,$Wi
1447___
1448	&$aesenc();
1449$code.=<<___;
1450	sha256rnds2	$CDGH,$ABEF
1451
1452	movdqa		2*32-0x80($Tbl),$Wi
1453	paddd		@MSG[2],$Wi
1454	pshufb		$TMP,@MSG[3]
1455	sha256msg1	@MSG[1],@MSG[0]
1456___
1457	&$aesenc();
1458$code.=<<___;
1459	sha256rnds2	$ABEF,$CDGH		# 8-11
1460	pshufd		\$0x0e,$Wi,$Wi
1461	movdqa		@MSG[3],$TMP
1462	palignr		\$4,@MSG[2],$TMP
1463	paddd		$TMP,@MSG[0]
1464___
1465	&$aesenc();
1466$code.=<<___;
1467	sha256rnds2	$CDGH,$ABEF
1468
1469	movdqa		3*32-0x80($Tbl),$Wi
1470	paddd		@MSG[3],$Wi
1471	sha256msg2	@MSG[3],@MSG[0]
1472	sha256msg1	@MSG[2],@MSG[1]
1473___
1474	&$aesenc();
1475$code.=<<___;
1476	sha256rnds2	$ABEF,$CDGH		# 12-15
1477	pshufd		\$0x0e,$Wi,$Wi
1478___
1479	&$aesenc();
1480$code.=<<___;
1481	movdqa		@MSG[0],$TMP
1482	palignr		\$4,@MSG[3],$TMP
1483	paddd		$TMP,@MSG[1]
1484	sha256rnds2	$CDGH,$ABEF
1485___
1486for($i=4;$i<16-3;$i++) {
1487	&$aesenc()	if (($r%10)==0);
1488$code.=<<___;
1489	movdqa		$i*32-0x80($Tbl),$Wi
1490	paddd		@MSG[0],$Wi
1491	sha256msg2	@MSG[0],@MSG[1]
1492	sha256msg1	@MSG[3],@MSG[2]
1493___
1494	&$aesenc();
1495$code.=<<___;
1496	sha256rnds2	$ABEF,$CDGH		# 16-19...
1497	pshufd		\$0x0e,$Wi,$Wi
1498	movdqa		@MSG[1],$TMP
1499	palignr		\$4,@MSG[0],$TMP
1500	paddd		$TMP,@MSG[2]
1501___
1502	&$aesenc();
1503	&$aesenc()	if ($r==19);
1504$code.=<<___;
1505	sha256rnds2	$CDGH,$ABEF
1506___
1507	push(@MSG,shift(@MSG));
1508}
1509$code.=<<___;
1510	movdqa		13*32-0x80($Tbl),$Wi
1511	paddd		@MSG[0],$Wi
1512	sha256msg2	@MSG[0],@MSG[1]
1513	sha256msg1	@MSG[3],@MSG[2]
1514___
1515	&$aesenc();
1516$code.=<<___;
1517	sha256rnds2	$ABEF,$CDGH		# 52-55
1518	pshufd		\$0x0e,$Wi,$Wi
1519	movdqa		@MSG[1],$TMP
1520	palignr		\$4,@MSG[0],$TMP
1521	paddd		$TMP,@MSG[2]
1522___
1523	&$aesenc();
1524	&$aesenc();
1525$code.=<<___;
1526	sha256rnds2	$CDGH,$ABEF
1527
1528	movdqa		14*32-0x80($Tbl),$Wi
1529	paddd		@MSG[1],$Wi
1530	sha256msg2	@MSG[1],@MSG[2]
1531	movdqa		$BSWAP,$TMP
1532___
1533	&$aesenc();
1534$code.=<<___;
1535	sha256rnds2	$ABEF,$CDGH		# 56-59
1536	pshufd		\$0x0e,$Wi,$Wi
1537___
1538	&$aesenc();
1539$code.=<<___;
1540	sha256rnds2	$CDGH,$ABEF
1541
1542	movdqa		15*32-0x80($Tbl),$Wi
1543	paddd		@MSG[2],$Wi
1544___
1545	&$aesenc();
1546	&$aesenc();
1547$code.=<<___;
1548	sha256rnds2	$ABEF,$CDGH		# 60-63
1549	pshufd		\$0x0e,$Wi,$Wi
1550___
1551	&$aesenc();
1552$code.=<<___;
1553	sha256rnds2	$CDGH,$ABEF
1554	#pxor		$CDGH,$rndkey0		# black magic
1555___
1556	while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
1557$code.=<<___;
1558	#xorps		$CDGH,$rndkey0		# black magic
1559	paddd		$CDGH_SAVE,$CDGH
1560	paddd		$ABEF_SAVE,$ABEF
1561
1562	dec		$len
1563	movups		$iv,48($out,$in0)	# write output
1564	lea		64($in0),$in0
1565	jnz		.Loop_shaext
1566
1567	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
1568	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
1569	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
1570	punpckhqdq	$CDGH,$ABEF		# DCBA
1571	palignr		\$8,$TMP,$CDGH		# HGFE
1572
1573	movups		$iv,($ivp)		# write IV
1574	movdqu		$ABEF,($ctx)
1575	movdqu		$CDGH,16($ctx)
1576___
1577$code.=<<___ if ($win64);
1578	movaps	0*16(%rsp),%xmm6
1579	movaps	1*16(%rsp),%xmm7
1580	movaps	2*16(%rsp),%xmm8
1581	movaps	3*16(%rsp),%xmm9
1582	movaps	4*16(%rsp),%xmm10
1583	movaps	5*16(%rsp),%xmm11
1584	movaps	6*16(%rsp),%xmm12
1585	movaps	7*16(%rsp),%xmm13
1586	movaps	8*16(%rsp),%xmm14
1587	movaps	9*16(%rsp),%xmm15
1588	lea	8+10*16(%rsp),%rsp
1589.Lepilogue_shaext:
1590___
1591$code.=<<___;
1592	ret
1593.cfi_endproc
1594.size	${func}_shaext,.-${func}_shaext
1595___
1596}
1597}}}}}
1598
1599# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1600#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1601if ($win64 && $avx) {
1602$rec="%rcx";
1603$frame="%rdx";
1604$context="%r8";
1605$disp="%r9";
1606
1607$code.=<<___;
1608.extern	__imp_RtlVirtualUnwind
1609.type	se_handler,\@abi-omnipotent
1610.align	16
1611se_handler:
1612	push	%rsi
1613	push	%rdi
1614	push	%rbx
1615	push	%rbp
1616	push	%r12
1617	push	%r13
1618	push	%r14
1619	push	%r15
1620	pushfq
1621	sub	\$64,%rsp
1622
1623	mov	120($context),%rax	# pull context->Rax
1624	mov	248($context),%rbx	# pull context->Rip
1625
1626	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
1628
1629	mov	0(%r11),%r10d		# HandlerData[0]
1630	lea	(%rsi,%r10),%r10	# prologue label
1631	cmp	%r10,%rbx		# context->Rip<prologue label
1632	jb	.Lin_prologue
1633
1634	mov	152($context),%rax	# pull context->Rsp
1635
1636	mov	4(%r11),%r10d		# HandlerData[1]
1637	lea	(%rsi,%r10),%r10	# epilogue label
1638	cmp	%r10,%rbx		# context->Rip>=epilogue label
1639	jae	.Lin_prologue
1640___
1641$code.=<<___ if ($shaext);
1642	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
1643	cmp	%r10,%rbx
1644	jb	.Lnot_in_shaext
1645
1646	lea	(%rax),%rsi
1647	lea	512($context),%rdi	# &context.Xmm6
1648	mov	\$20,%ecx
1649	.long	0xa548f3fc		# cld; rep movsq
1650	lea	168(%rax),%rax		# adjust stack pointer
1651	jmp	.Lin_prologue
1652.Lnot_in_shaext:
1653___
1654$code.=<<___ if ($avx>1);
1655	lea	.Lavx2_shortcut(%rip),%r10
1656	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1657	jb	.Lnot_in_avx2
1658
1659	and	\$-256*$SZ,%rax
1660	add	\$`2*$SZ*($rounds-8)`,%rax
1661.Lnot_in_avx2:
1662___
1663$code.=<<___;
1664	mov	%rax,%rsi		# put aside Rsp
1665	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
1666
1667	mov	-8(%rax),%rbx
1668	mov	-16(%rax),%rbp
1669	mov	-24(%rax),%r12
1670	mov	-32(%rax),%r13
1671	mov	-40(%rax),%r14
1672	mov	-48(%rax),%r15
1673	mov	%rbx,144($context)	# restore context->Rbx
1674	mov	%rbp,160($context)	# restore context->Rbp
1675	mov	%r12,216($context)	# restore context->R12
1676	mov	%r13,224($context)	# restore context->R13
1677	mov	%r14,232($context)	# restore context->R14
1678	mov	%r15,240($context)	# restore context->R15
1679
1680	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
1681	lea	512($context),%rdi	# &context.Xmm6
1682	mov	\$20,%ecx
1683	.long	0xa548f3fc		# cld; rep movsq
1684
1685.Lin_prologue:
1686	mov	8(%rax),%rdi
1687	mov	16(%rax),%rsi
1688	mov	%rax,152($context)	# restore context->Rsp
1689	mov	%rsi,168($context)	# restore context->Rsi
1690	mov	%rdi,176($context)	# restore context->Rdi
1691
1692	mov	40($disp),%rdi		# disp->ContextRecord
1693	mov	$context,%rsi		# context
1694	mov	\$154,%ecx		# sizeof(CONTEXT)
1695	.long	0xa548f3fc		# cld; rep movsq
1696
1697	mov	$disp,%rsi
1698	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1699	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1700	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1701	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1702	mov	40(%rsi),%r10		# disp->ContextRecord
1703	lea	56(%rsi),%r11		# &disp->HandlerData
1704	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1705	mov	%r10,32(%rsp)		# arg5
1706	mov	%r11,40(%rsp)		# arg6
1707	mov	%r12,48(%rsp)		# arg7
1708	mov	%rcx,56(%rsp)		# arg8, (NULL)
1709	call	*__imp_RtlVirtualUnwind(%rip)
1710
1711	mov	\$1,%eax		# ExceptionContinueSearch
1712	add	\$64,%rsp
1713	popfq
1714	pop	%r15
1715	pop	%r14
1716	pop	%r13
1717	pop	%r12
1718	pop	%rbp
1719	pop	%rbx
1720	pop	%rdi
1721	pop	%rsi
1722	ret
1723.size	se_handler,.-se_handler
1724
1725.section	.pdata
1726	.rva	.LSEH_begin_${func}_xop
1727	.rva	.LSEH_end_${func}_xop
1728	.rva	.LSEH_info_${func}_xop
1729
1730	.rva	.LSEH_begin_${func}_avx
1731	.rva	.LSEH_end_${func}_avx
1732	.rva	.LSEH_info_${func}_avx
1733___
1734$code.=<<___ if ($avx>1);
1735	.rva	.LSEH_begin_${func}_avx2
1736	.rva	.LSEH_end_${func}_avx2
1737	.rva	.LSEH_info_${func}_avx2
1738___
1739$code.=<<___ if ($shaext);
1740	.rva	.LSEH_begin_${func}_shaext
1741	.rva	.LSEH_end_${func}_shaext
1742	.rva	.LSEH_info_${func}_shaext
1743___
1744$code.=<<___;
1745.section	.xdata
1746.align	8
1747.LSEH_info_${func}_xop:
1748	.byte	9,0,0,0
1749	.rva	se_handler
1750	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
1751
1752.LSEH_info_${func}_avx:
1753	.byte	9,0,0,0
1754	.rva	se_handler
1755	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1756___
1757$code.=<<___ if ($avx>1);
1758.LSEH_info_${func}_avx2:
1759	.byte	9,0,0,0
1760	.rva	se_handler
1761	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
1762___
1763$code.=<<___ if ($shaext);
1764.LSEH_info_${func}_shaext:
1765	.byte	9,0,0,0
1766	.rva	se_handler
1767	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
1768___
1769}
1770
1771####################################################################
1772sub rex {
1773  local *opcode=shift;
1774  my ($dst,$src)=@_;
1775  my $rex=0;
1776
1777    $rex|=0x04			if($dst>=8);
1778    $rex|=0x01			if($src>=8);
1779    unshift @opcode,$rex|0x40	if($rex);
1780}
1781
1782{
1783  my %opcodelet = (
1784		"sha256rnds2" => 0xcb,
1785  		"sha256msg1"  => 0xcc,
1786		"sha256msg2"  => 0xcd	);
1787
1788  sub sha256op38 {
1789    my $instr = shift;
1790
1791    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1792      my @opcode=(0x0f,0x38);
1793	rex(\@opcode,$2,$1);
1794	push @opcode,$opcodelet{$instr};
1795	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1796	return ".byte\t".join(',',@opcode);
1797    } else {
1798	return $instr."\t".@_[0];
1799    }
1800  }
1801}
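# (Illustrative example: the substitution below rewrites, say,
#	sha256rnds2	%xmm1,%xmm2
# into
#	.byte	0x0f,0x38,0xcb,0xd1
# i.e. the 0f 38 escape, the 0xcb opcode from %opcodelet and the ModR/M
# byte 0xc0|1|(2<<3), so that assemblers without SHA extension support can
# still be used; rex() prepends a REX prefix only when %xmm8..15 are
# involved.)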
1802
1803$code =~ s/\`([^\`]*)\`/eval $1/gem;
1804$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1805print $code;
1806close STDOUT or die "error closing STDOUT: $!";
1807