xref: /openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 7ed6de99)
1#! /usr/bin/env perl
2# Copyright 2013-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# Multi-buffer SHA1 procedure processes n buffers in parallel by
18# placing buffer data to designated lane of SIMD register. n is
19# naturally limited to 4 on pre-AVX2 processors and to 8 on
20# AVX2-capable processors such as Haswell.
21#
22#		this	+aesni(i)	sha1	aesni-sha1	gain(iv)
23# -------------------------------------------------------------------
24# Westmere(ii)	10.7/n	+1.28=3.96(n=4)	5.30	6.66		+68%
25# Atom(ii)	18.1/n	+3.93=8.46(n=4)	9.37	12.8		+51%
26# Sandy Bridge	(8.16	+5.15=13.3)/n	4.99	5.98		+80%
27# Ivy Bridge	(8.08	+5.14=13.2)/n	4.60	5.54		+68%
28# Haswell(iii)	(8.96	+5.00=14.0)/n	3.57	4.55		+160%
29# Skylake	(8.70	+5.00=13.7)/n	3.64	4.20		+145%
30# Bulldozer	(9.76	+5.76=15.5)/n	5.95	6.37		+64%
31#
32# (i)	multi-block CBC encrypt with 128-bit key;
33# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34#	because of lower AES-NI instruction throughput;
35# (iii)	"this" is for n=8, when we gather twice as much data, result
36#	for n=4 is 8.00+4.44=12.4;
37# (iv)	presented improvement coefficients are asymptotic limits and
38#	in real-life application are somewhat lower, e.g. for 2KB
39#	fragments they range from 30% to 100% (on Haswell);
40
41# $output is the last argument if it looks like a file (it has an extension)
42# $flavour is the first argument if it doesn't look like a file
# Peel the optional output file (last argument, if it carries an extension)
# and the optional flavour (first argument, if it contains no dot) off @ARGV.
$output  = (@ARGV > 0 && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV > 0 && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# The Win64 code path is selected by the flavour name or by a .asm output.
if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) {
    $win64 = 1;
} else {
    $win64 = 0;
}

# Directory part of this script's own path, trailing separator included.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;

# The translator is looked for next to this script first, then in
# ../../perlasm relative to it.
$xlate = "${dir}x86_64-xlate.pl";
if (!-f $xlate) {
    $xlate = "${dir}../../perlasm/x86_64-xlate.pl";
    die "can't locate x86_64-xlate.pl" if (!-f $xlate);
}

push @INC, "${dir}", "${dir}../../perlasm";
require "x86_64-support.pl";

$ptr_size = &pointer_size($flavour);
57
# $avx ends up as 0, 1 or 2: each probe below adds one for every version
# threshold the detected tool meets.  A later probe only runs while $avx
# is still zero.
$avx = 0;

# GNU assembler, queried through the compiler driver named by $ENV{CC}.
{
    my $banner = `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`;
    if ($banner =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        my $gas_version = $1;
        $avx = ($gas_version >= 2.19) + ($gas_version >= 2.22);
    }
}

# NASM, considered only for Win64 builds that ask for it.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)) {
    if (`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        my $nasm_version = $1;
        $avx = ($nasm_version >= 2.09) + ($nasm_version >= 2.10);
    }
}

# ml64, considered only for Win64 builds that ask for it.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/)) {
    if (`ml64 2>&1` =~ /Version ([0-9]+)\./) {
        my $ml64_major = $1;
        $avx = ($ml64_major >= 10) + ($ml64_major >= 11);
    }
}

# clang/LLVM, identified from the compiler's own version banner.
if (!$avx) {
    if (`$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
        my $llvm_version = $2;
        $avx = ($llvm_version >= 3.0) + ($llvm_version > 3.0);
    }
}
78
# Route everything written to STDOUT through the perlasm translator: OUT
# is a pipe into "$xlate $flavour $output" run under the current perl
# interpreter ($^X), and the STDOUT glob is then aliased to that pipe.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
82
83# void sha1_multi_block (
84#     struct {	unsigned int A[8];
85#		unsigned int B[8];
86#		unsigned int C[8];
87#		unsigned int D[8];
88#		unsigned int E[8];	} *ctx,
89#     struct {	void *ptr; int blocks;	} inp[8],
90#     int num);		/* 1 or 2 */
91#
# General-purpose registers: the context and input-descriptor arguments,
# the count argument, four per-lane input pointers and the constant table.
($ctx, $inp) = ("%rdi", "%rsi");	# 1st and 2nd arg
$num = "%edx";
@ptr = map { '%r' . $_ } (8..11);
$Tbl = "%rbp";
$inp_elm_size = $ptr_size * 2;

# Baseline SIMD allocation: working variables in the low registers,
# temporaries next, message words above them and the round constant last.
($A, $B, $C, $D, $E) = map { '%xmm' . $_ } (0..4);
@V = ($A, $B, $C, $D, $E);
($t0, $t1, $t2, $t3, $tx) = map { '%xmm' . $_ } (5..9);
@Xi = map { '%xmm' . $_ } (10..14);
$K = "%xmm15";

if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]
    @Xi = map { '%xmm' . $_ } (0..4);
    ($tx, $t0, $t1, $t2, $t3) = map { '%xmm' . $_ } (5..9);
    ($A, $B, $C, $D, $E) = map { '%xmm' . $_ } (10..14);
    @V = ($A, $B, $C, $D, $E);
}

# Width in bytes of one SIMD register in the code generated from here on.
$REG_SZ = 16;
113
# Xi_off(INDEX) returns the assembler operand addressing slot INDEX mod 16
# of the on-stack message-schedule ring, each slot being $REG_SZ bytes.
# Byte offsets below 256 are expressed relative to %rax (biased by -128),
# the remaining ones relative to %rbx (biased by -256-128).
sub Xi_off {
    my ($index) = @_;
    my $byte_off = ($index % 16) * $REG_SZ;

    return "$byte_off-128(%rax)" if ($byte_off < 256);
    return "$byte_off-256-128(%rbx)";
}
120
# BODY_00_19($i,$a,$b,$c,$d,$e) appends the SSE code for round $i (0..19)
# to $code.  $a..$e name the registers that hold the five working
# variables in this round.  Message words are kept in @Xi, which is
# rotated by one position before returning.  For $i<=14 the emitted code
# also gathers input words from the four @ptr streams; from $i==15 on it
# applies the message-schedule update instead.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# word index used for the look-ahead loads and schedule offsets
my $k=$i+2;	# word index whose loads are started in this round

# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
# of 4 words you would expect to be loaded per given iteration one is
# spilled to next iteration. In other words indices in four input
# streams are distributed as follows:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# Then at $i==15 Xupdate is applied one iteration in advance...

# First round only: load and byte-swap word 0 of every stream, advance
# each input pointer by one 64-byte block and start the loads of word 1.
$code.=<<___ if ($i==0);
	movd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	 movd		`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	 movd		`4*$j-16*4`(@ptr[2]),$t2
	pshufb		$tx,@Xi[0]
___
# Rounds 0..13: the round proper, interleaved with the rest of the loads
# for word $j and the first three loads for word $k.
$code.=<<___ if ($i<14);			# just load input
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	 movd		`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1

	por	$t3,$t2				# rol(a,5)
	 movd		`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 movd		`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b				# b=rol(b,30)
___
# Round 14: the last input word is completed; the slots used for further
# loads in earlier rounds carry prefetches of the input pointers instead.
$code.=<<___ if ($i==14);			# just load input
	 movd		`4*$j-16*4`(@ptr[3]),$t1
	 punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_00_19
	 punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	 prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	 punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)
	movdqa	$b,$t1
	 prefetcht0	63(@ptr[1])

	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)
	 prefetcht0	63(@ptr[2])

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 pshufb	$tx,@Xi[1]
	 prefetcht0	63(@ptr[3])
	por	$t1,$b				# b=rol(b,30)
___
# Rounds 13 and 14 additionally reload a stored word from the stack ring
# ahead of the first schedule update.
$code.=<<___ if ($i>=13 && $i<15);
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
# Rounds 15..19: the round proper, interleaved with the schedule update
# that produces the next message word in @Xi[1].
$code.=<<___ if ($i>=15);			# apply Xupdate
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	 pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	 movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	 psrld	\$31,$tx
	 paddd	@Xi[1],@Xi[1]

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0				# Ch(b,c,d)

	movdqa	$b,$t1
	por	$t3,$t2				# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e				# e+=Ch(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol	\$1,@Xi[1]
	por	$t1,$b				# b=rol(b,30)
___
# Rotate the message-register ring so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
259
# BODY_20_39($i,$a,$b,$c,$d,$e) appends the SSE code for one Parity round
# to $code.  The driver calls it for rounds 20..39 and again for 60..79,
# with the matching constant loaded into $K beforehand.  $a..$e name the
# registers holding the working variables; @Xi is rotated by one position
# before returning.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index base for the schedule operands fetched via Xi_off

# All rounds but the last: start of the round plus start of the schedule
# update for the next message word.
$code.=<<___ if ($i<79);
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t0
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e				# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
___
# The current word is written back to the stack ring only up to round 71
# (presumably because no later round reloads it -- confirm before changing).
$code.=<<___ if ($i<72);
	movdqa	@Xi[0],`&Xi_off($i)`
___
# Remainder of the round and of the schedule update.
$code.=<<___ if ($i<79);
	paddd	@Xi[0],$e			# e+=X[i]
	 pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0				# Parity(b,c,d)
	movdqa	$b,$t1

	pslld	\$30,$t1
	 movdqa	@Xi[1],$tx
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	paddd	$t0,$e				# e+=Parity(b,c,d)
	 paddd	@Xi[1],@Xi[1]

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@Xi[1],1)
	por	$t1,$b				# b=rol(b,30)
___
# Final round: same round computation without any schedule update.
$code.=<<___ if ($i==79);
	movdqa	$a,$t2
	paddd	$K,$e				# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0

	movdqa	$a,$t3
	paddd	@Xi[0],$e			# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0				# Parity(b,c,d)

	pslld	\$30,$t1
	por	$t3,$t2				# rol(a,5)
	paddd	$t0,$e				# e+=Parity(b,c,d)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	por	$t1,$b				# b=rol(b,30)
___
# Rotate the message-register ring so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
322
# BODY_40_59($i,$a,$b,$c,$d,$e) appends the SSE code for round $i
# (40..59, the Maj rounds) to $code.  $a..$e name the registers holding
# the working variables; @Xi is rotated by one position before returning.
# Maj is accumulated into e in two steps: first (c AND d), then
# ((c XOR d) AND b).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index base for the schedule operands fetched via Xi_off

$code.=<<___;
	pxor	@Xi[-2],@Xi[1]			# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	movdqa	$a,$t2
	movdqa	$d,$t1
	 pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e				# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1

	movdqa	$d,$t0
	 movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0

	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e			# e+=X[i]
	por	$t3,$t2				# rol(a,5)
	 psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1

	pslld	\$30,$t1
	 paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e				# e+=Maj(b,d,c)

	psrld	\$2,$b
	paddd	$t2,$e				# e+=rol(a,5)
	 por	$tx,@Xi[1]			# rol(@X[1],1)
	por	$t1,$b				# b=rol(b,30)
___
# Rotate the message-register ring so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
364
# Entry of sha1_multi_block: read the capability word at
# OPENSSL_ia32cap_P+4 into %rcx and branch to the SHA-extension code when
# bit 61 is set.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
# Only when the assembler can encode AVX: branch to the AVX code when
# bit 28 of the same capability word is set.
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Generic (SSE) path: remember the incoming stack pointer in %rax and save
# the callee-saved registers used by the code.  Each push carries the CFI
# annotation for the register it actually pushes; the %rbp push used to be
# annotated as a second %rbx push, which left %rbp out of the unwind data
# and recorded a wrong save slot for %rbx.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: reserve 0xa8 bytes below the two pushes and save
# %xmm6..%xmm15 there (the upper six are addressed relative to %rax,
# the original stack pointer).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out a 256-byte aligned frame of $REG_SZ*18 bytes.  Slot
# $REG_SZ*17 keeps the original %rsp and slot $REG_SZ*17+8 the caller's
# $num; %rbx points at the per-lane block counters at $REG_SZ*16.
# .Loop_grande is the outer loop over groups of four input descriptors.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# For each of the four lanes: load the input pointer and block count from
# descriptor $i, keep the largest count in $num, store the count as the
# lane's counter, and point a lane with a non-positive count at $Tbl.
for($i=0;$i<4;$i++) {
    # pointer_register() comes from x86_64-support.pl, which is not part of
    # this file; presumably it returns the name of @ptr[$i] at the pointer
    # width of $flavour -- confirm there.
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Nothing to do when every count is zero or negative.  Otherwise load the
# five state vectors, the byte-swap mask and the first round constant,
# and set %rax to the frame base biased by 128 (the base Xi_off() uses for
# the lower part of the message ring), then enter the per-block loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop

.align	32
.Loop:
___
# Emit the 80 rounds of one block iteration, phase by phase.  Each entry
# names the first round index past the phase, the generator for its
# rounds, and the instruction (if any) that reloads $K with the phase
# constant before the phase starts.  The first phase needs no reload:
# the code surrounding the loop loads K_00_19 before every pass.
$i = 0;
foreach my $phase (
    [ 20, \&BODY_00_19, undef ],
    [ 40, \&BODY_20_39, "	movdqa	0x00($Tbl),$K\n" ],	# K_20_39
    [ 60, \&BODY_40_59, "	movdqa	0x20($Tbl),$K\n" ],	# K_40_59
    [ 80, \&BODY_20_39, "	movdqa	0x40($Tbl),$K\n" ],	# K_60_79
) {
    my ($end, $emit_round, $reload_K) = @{$phase};

    $code .= $reload_K if (defined $reload_K);
    while ($i < $end) {
	$emit_round->($i, @V);
	unshift(@V, pop(@V));		# rotate the working-variable roles
	$i++;
    }
}
# End of one block iteration.  The lane counters are pulled from (%rbx);
# a lane whose counter is not above 1 gets its input pointer redirected to
# $Tbl.  A mask of lanes with a positive counter is built, the counters
# are decremented through it, and the freshly computed state is masked
# before being added to the stored context.  After the inner loop the
# context and descriptor pointers advance to the next group of four and
# the outer loop repeats while the saved $num is not exhausted.
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)

	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
# Win64 only: restore %xmm6..%xmm15 from the save area below the
# original stack pointer.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Common epilogue: reload %rbp and %rbx from their push slots, restore
# the original stack pointer and return.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_multi_block,.-sha1_multi_block
___
532						{{{
# Register allocation for the SHA-extension path, which handles two input
# streams at a time: one state/E/alternate-E set and one group of four
# message registers per stream, plus the shared byte-swap mask.
my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(11..14));

# Prologue: _shaext_shortcut is the label sha1_multi_block branches to.
# Remember the incoming stack pointer in %rax and save %rbx and %rbp.
$code.=<<___;
.type	sha1_multi_block_shaext,\@function,3
.align	32
sha1_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: save %xmm6..%xmm15 below the pushes.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame setup as in the generic path.  $num is doubled because each outer
# iteration consumes a pair of descriptors, and $ctx is biased by 0x40 so
# that the context accesses below use offsets 0x00-0x40 .. 0x80-0x40.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x40($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	movdqa	K_XX_XX+0x80(%rip),$BSWAP	# byte-n-word swap

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# For each of the two lanes: load the input pointer and block count, keep
# the largest count in $num, store the count as the lane's counter, and
# point a lane with a non-positive count at %rsp.
for($i=0;$i<2;$i++) {
    # pointer_register() comes from x86_64-support.pl, which is not part of
    # this file; presumably it returns the name of @ptr[$i] at the pointer
    # width of $flavour -- confirm there.
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
# Skip the group when both counts are non-positive.  Otherwise gather the
# two lanes' a..e words from the interleaved context into one
# ABCD/E register pair per lane, then enter the per-block loop: load and
# byte-swap both 64-byte blocks, spill the incoming state to the stack
# frame (0x40..0x70) for the final addition, and run rounds 0-7 of both
# lanes interleaved.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x40($ctx),$ABCD0	# a1.a0
	movq		0x20-0x40($ctx),@MSG0[0]# b1.b0
	movq		0x40-0x40($ctx),@MSG0[1]# c1.c0
	movq		0x60-0x40($ctx),@MSG0[2]# d1.d0
	movq		0x80-0x40($ctx),@MSG0[3]# e1.e0

	punpckldq	@MSG0[0],$ABCD0		# b1.a1.b0.a0
	punpckldq	@MSG0[2],@MSG0[1]	# d1.c1.d0.c0

	movdqa		$ABCD0,$ABCD1
	punpcklqdq	@MSG0[1],$ABCD0		# d0.c0.b0.a0
	punpckhqdq	@MSG0[1],$ABCD1		# d1.c1.b1.a1

	pshufd		\$0b00111111,@MSG0[3],$E0
	pshufd		\$0b01111111,@MSG0[3],$E1
	pshufd		\$0b00011011,$ABCD0,$ABCD0
	pshufd		\$0b00011011,$ABCD1,$ABCD1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$BSWAP,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$BSWAP,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	pshufb		$BSWAP,@MSG0[1]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]
	 pshufb		$BSWAP,@MSG1[1]

	movdqa		$E0,0x50(%rsp)		# offload
	paddd		@MSG0[0],$E0
	 movdqa		$E1,0x70(%rsp)
	 paddd		@MSG1[0],$E1
	movdqa		$ABCD0,0x40(%rsp)	# offload
	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,0x60(%rsp)
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$0,$E0,$ABCD0		# 0-3
	sha1nexte	@MSG0[1],$E0_
	 sha1rnds4	\$0,$E1,$ABCD1		# 0-3
	 sha1nexte	@MSG1[1],$E1_
	pshufb		$BSWAP,@MSG0[2]
	prefetcht0	127(@ptr[0])
	sha1msg1	@MSG0[1],@MSG0[0]
	 pshufb		$BSWAP,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha1msg1	@MSG1[1],@MSG1[0]

	pshufb		$BSWAP,@MSG0[3]
	movdqa		$ABCD0,$E0
	 pshufb		$BSWAP,@MSG1[3]
	 movdqa		$ABCD1,$E1
	sha1rnds4	\$0,$E0_,$ABCD0		# 4-7
	sha1nexte	@MSG0[2],$E0
	 sha1rnds4	\$0,$E1_,$ABCD1		# 4-7
	 sha1nexte	@MSG1[2],$E1
	pxor		@MSG0[2],@MSG0[0]
	sha1msg1	@MSG0[2],@MSG0[1]
	 pxor		@MSG1[2],@MSG1[0]
	 sha1msg1	@MSG1[2],@MSG1[1]
___
# Middle part of the block: one four-round group per iteration for
# $i = 2..15, emitted for both lanes.  The sha1rnds4 immediate is
# int($i/5).  After each group the roles of E and its alternate are
# swapped and both message-register rings are rotated, so the same
# template serves every group.  The "# 8-11" annotation inside the
# template is emitted verbatim on every iteration and describes the
# first one ($i==2) only.
for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$`int($i/5)`,$E0,$ABCD0	# 8-11
	sha1nexte	@MSG0[3],$E0_
	 sha1rnds4	\$`int($i/5)`,$E1,$ABCD1	# 8-11
	 sha1nexte	@MSG1[3],$E1_
	sha1msg2	@MSG0[3],@MSG0[0]
	 sha1msg2	@MSG1[3],@MSG1[0]
	pxor		@MSG0[3],@MSG0[1]
	sha1msg1	@MSG0[3],@MSG0[2]
	 pxor		@MSG1[3],@MSG1[1]
	 sha1msg1	@MSG1[3],@MSG1[2]
___
	($E0,$E0_)=($E0_,$E0);		($E1,$E1_)=($E1_,$E1);
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
# Rounds 64-79 of both lanes, interleaved with the end-of-block
# bookkeeping: a lane whose counter is not above 1 gets its input pointer
# redirected to %rsp, per-lane masks are derived from the counters, the
# new state is masked and added to the values spilled at 0x40..0x70 of the
# frame, and the counters are decremented and stored back.  After the
# inner loop the two lanes' words are scattered back into the interleaved
# context, and the context and descriptor pointers advance to the next
# pair.
$code.=<<___;
	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$3,$E0,$ABCD0		# 64-67
	sha1nexte	@MSG0[3],$E0_
	 sha1rnds4	\$3,$E1,$ABCD1		# 64-67
	 sha1nexte	@MSG1[3],$E1_
	sha1msg2	@MSG0[3],@MSG0[0]
	 sha1msg2	@MSG1[3],@MSG1[0]
	pxor		@MSG0[3],@MSG0[1]
	 pxor		@MSG1[3],@MSG1[1]

	mov		\$1,%ecx
	pxor		@MSG0[2],@MSG0[2]	# zero
	cmp		4*0(%rbx),%ecx		# examine counters
	cmovge		%rsp,@ptr[0]		# cancel input

	movdqa		$ABCD0,$E0
	 movdqa		$ABCD1,$E1
	sha1rnds4	\$3,$E0_,$ABCD0		# 68-71
	sha1nexte	@MSG0[0],$E0
	 sha1rnds4	\$3,$E1_,$ABCD1		# 68-71
	 sha1nexte	@MSG1[0],$E1
	sha1msg2	@MSG0[0],@MSG0[1]
	 sha1msg2	@MSG1[0],@MSG1[1]

	cmp		4*1(%rbx),%ecx
	cmovge		%rsp,@ptr[1]
	movq		(%rbx),@MSG0[0]		# pull counters

	movdqa		$ABCD0,$E0_
	 movdqa		$ABCD1,$E1_
	sha1rnds4	\$3,$E0,$ABCD0		# 72-75
	sha1nexte	@MSG0[1],$E0_
	 sha1rnds4	\$3,$E1,$ABCD1		# 72-75
	 sha1nexte	@MSG1[1],$E1_

	pshufd		\$0x00,@MSG0[0],@MSG1[2]
	pshufd		\$0x55,@MSG0[0],@MSG1[3]
	movdqa		@MSG0[0],@MSG0[1]
	pcmpgtd		@MSG0[2],@MSG1[2]
	pcmpgtd		@MSG0[2],@MSG1[3]

	movdqa		$ABCD0,$E0
	 movdqa		$ABCD1,$E1
	sha1rnds4	\$3,$E0_,$ABCD0		# 76-79
	sha1nexte	$MSG0[2],$E0
	 sha1rnds4	\$3,$E1_,$ABCD1		# 76-79
	 sha1nexte	$MSG0[2],$E1

	pcmpgtd		@MSG0[2],@MSG0[1]	# counter mask
	pand		@MSG1[2],$ABCD0
	pand		@MSG1[2],$E0
	 pand		@MSG1[3],$ABCD1
	 pand		@MSG1[3],$E1
	paddd		@MSG0[1],@MSG0[0]	# counters--

	paddd		0x40(%rsp),$ABCD0
	paddd		0x50(%rsp),$E0
	 paddd		0x60(%rsp),$ABCD1
	 paddd		0x70(%rsp),$E1

	movq		@MSG0[0],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABCD0,$ABCD0
	pshufd		\$0b00011011,$ABCD1,$ABCD1

	movdqa		$ABCD0,@MSG0[0]
	punpckldq	$ABCD1,$ABCD0		# b1.b0.a1.a0
	punpckhdq	$ABCD1,@MSG0[0]		# d1.d0.c1.c0
	punpckhdq	$E1,$E0			# e1.e0.xx.xx
	movq		$ABCD0,0x00-0x40($ctx)	# a1.a0
	psrldq		\$8,$ABCD0
	movq		@MSG0[0],0x40-0x40($ctx)# c1.c0
	psrldq		\$8,@MSG0[0]
	movq		$ABCD0,0x20-0x40($ctx)	# b1.b0
	psrldq		\$8,$E0
	movq		@MSG0[0],0x60-0x40($ctx)# d1.d0
	movq		$E0,0x80-0x40($ctx)	# e1.e0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`$inp_elm_size*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
# Win64 only: restore %xmm6..%xmm15.  The reload of the original %rsp
# above is commented out; none of the code emitted for this function
# writes %rax after the prologue, so it still holds that value here.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Common epilogue: reload %rbp and %rbx from their push slots, restore
# the original stack pointer and return.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha1_multi_block_shaext,.-sha1_multi_block_shaext
___
797						}}}
798
799						if ($avx) {{{
# BODY_00_19_avx($i,$a,$b,$c,$d,$e) appends the AVX/AVX2 code for round
# $i (0..19) to $code, using three-operand VEX forms.  $a..$e name the
# registers holding the working variables; @Xi is rotated by one position
# before returning.  The same generator serves both vector widths and
# selects its input-gathering sequence by $REG_SZ: four streams for 16,
# eight streams for 32.
sub BODY_00_19_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# word index used for the look-ahead loads and schedule offsets
my $k=$i+2;	# word index whose loads are started in this round
# Instruction that merges the two partial gathers into one vector, and
# the stream whose word starts the second partial gather.
my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];

# 128-bit, first round: gather and byte-swap word 0 of the four streams,
# advance each pointer by one 64-byte block, start the loads of word 1.
$code.=<<___ if ($i==0 && $REG_SZ==16);
	vmovd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		(@ptr[1]),@Xi[2]	# borrow Xi[2]
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,(@ptr[3]),@Xi[2],@Xi[2]
	 lea		`16*4`(@ptr[3]),@ptr[3]
	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	vpunpckldq	@Xi[2],@Xi[0],@Xi[0]
	 vmovd		`4*$j-16*4`($ptr_n),$t3
	vpshufb		$tx,@Xi[0],@Xi[0]
___
# 128-bit, rounds 0..14: second half of the gather of word $j.
$code.=<<___ if ($i<15 && $REG_SZ==16);		# just load input
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
___
# 256-bit, first round: same as above for eight streams.
$code.=<<___ if ($i==0 && $REG_SZ==32);
	vmovd		(@ptr[0]),@Xi[0]
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		(@ptr[4]),@Xi[2]	# borrow Xi[2]
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		(@ptr[5]),$t1
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,(@ptr[2]),@Xi[0],@Xi[0]
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,(@ptr[6]),@Xi[2],@Xi[2]
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,@Xi[0],@Xi[0]
	vpinsrd		\$1,(@ptr[7]),$t1,$t1
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t1,@Xi[2],@Xi[2]
	 vmovd		`4*$j-16*4`(@ptr[0]),@Xi[1]
	vinserti128	@Xi[2],@Xi[0],@Xi[0]
	 vmovd		`4*$j-16*4`($ptr_n),$t3
	vpshufb		$tx,@Xi[0],@Xi[0]
___
# 256-bit, rounds 0..14: remaining loads of the gather of word $j.
$code.=<<___ if ($i<15 && $REG_SZ==32);		# just load input
	 vmovd		`4*$j-16*4`(@ptr[1]),$t2
	 vmovd		`4*$j-16*4`(@ptr[5]),$t1
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
	 vpunpckldq	$t2,@Xi[1],@Xi[1]
	 vpinsrd	\$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
	 vpunpckldq	$t1,$t3,$t3
___
# Rounds 0..13: the round proper; word $j is merged and byte-swapped and
# the first loads of word $k are started.
$code.=<<___ if ($i<14);
	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 $vpack		$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	 vmovd		`4*$k-16*4`(@ptr[0]),@Xi[2]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	 vmovd		`4*$k-16*4`($ptr_n),$t3
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Round 14: the last input word is merged; the load slots carry
# prefetches of the first four input pointers instead.
$code.=<<___ if ($i==14);
	vpaddd	$K,$e,$e			# e+=K_00_19
	 prefetcht0	63(@ptr[0])
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 $vpack		$t3,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	 prefetcht0	63(@ptr[1])
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	 prefetcht0	63(@ptr[2])
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 prefetcht0	63(@ptr[3])
	 vpshufb	$tx,@Xi[1],@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rounds 13 and 14 additionally reload a stored word from the stack ring
# ahead of the first schedule update.
$code.=<<___ if ($i>=13 && $i<15);
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# preload "X[2]"
___
# Rounds 15..19: the round proper interleaved with the schedule update.
# In the 256-bit variant round 15 also prefetches the other four input
# pointers through the backtick-quoted conditional expressions.
$code.=<<___ if ($i>=15);			# apply Xupdate
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_00_19
	vpslld	\$5,$a,$t2
	vpandn	$d,$b,$t1
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpand	$c,$b,$t0

	vmovdqa	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$t1,$t0,$t0			# Ch(b,c,d)
	 vpxor	@Xi[3],@Xi[1],@Xi[1]
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Ch(b,c,d)
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	 vpsrld	\$31,@Xi[1],$tx
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol	\$1,@Xi[1]
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rotate the message-register ring so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
943
# BODY_20_39_avx($i,$a,$b,$c,$d,$e) appends the AVX/AVX2 code for one
# Parity round to $code.  The driver calls it for rounds 20..39 and again
# for 60..79, with the matching constant loaded into $K beforehand.
# $a..$e name the registers holding the working variables; @Xi is rotated
# by one position before returning.
sub BODY_20_39_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index base for the schedule operands fetched via Xi_off

# All rounds but the last: start of the round and of the schedule update.
$code.=<<___ if ($i<79);
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e			# e+=K_20_39
	vpxor	$b,$d,$t0
___
# The current word is written back to the stack ring only up to round 71
# (presumably because no later round reloads it -- confirm before changing).
$code.=<<___ if ($i<72);
	vmovdqa	@Xi[0],`&Xi_off($i)`
___
# Remainder of the round and of the schedule update.
$code.=<<___ if ($i<79);
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]
	vpsrld	\$27,$a,$t3
	vpxor	$c,$t0,$t0			# Parity(b,c,d)
	 vpxor	@Xi[3],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)
	 vpsrld	\$31,@Xi[1],$tx
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@Xi[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Final round: same round computation without any schedule update.
$code.=<<___ if ($i==79);
	vpslld	\$5,$a,$t2
	vpaddd	$K,$e,$e			# e+=K_20_39
	vpxor	$b,$d,$t0

	vpsrld	\$27,$a,$t3
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpxor	$c,$t0,$t0			# Parity(b,c,d)

	vpslld	\$30,$b,$t1
	vpor	$t3,$t2,$t2			# rol(a,5)
	vpaddd	$t0,$e,$e			# e+=Parity(b,c,d)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rotate the message-register ring so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
996
# BODY_40_59_avx($i,$a,$b,$c,$d,$e) appends the AVX/AVX2 code for round
# $i (40..59, the Maj rounds) to $code.  $a..$e name the registers holding
# the working variables; @Xi is rotated by one position before returning.
# Maj is accumulated into e in two steps: first (c AND d), then
# ((c XOR d) AND b).
sub BODY_40_59_avx {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index base for the schedule operands fetched via Xi_off

# NOTE(review): the write-back of X[i] below uses vmovdqu while the other
# AVX round generators use vmovdqa for the same store -- confirm whether
# the difference is intentional.
$code.=<<___;
	vpxor	@Xi[-2],@Xi[1],@Xi[1]		# "X[13]"
	vmovdqa	`&Xi_off($j+2)`,@Xi[3]		# "X[2]"

	vpaddd	$K,$e,$e			# e+=K_40_59
	vpslld	\$5,$a,$t2
	vpand	$c,$d,$t1
	 vpxor	`&Xi_off($j+8)`,@Xi[1],@Xi[1]

	vpaddd	$t1,$e,$e
	vpsrld	\$27,$a,$t3
	vpxor	$c,$d,$t0
	 vpxor	@Xi[3],@Xi[1],@Xi[1]

	vmovdqu	@Xi[0],`&Xi_off($i)`
	vpaddd	@Xi[0],$e,$e			# e+=X[i]
	vpor	$t3,$t2,$t2			# rol(a,5)
	 vpsrld	\$31,@Xi[1],$tx
	vpand	$b,$t0,$t0
	 vpaddd	@Xi[1],@Xi[1],@Xi[1]

	vpslld	\$30,$b,$t1
	vpaddd	$t0,$e,$e			# e+=Maj(b,d,c)

	vpsrld	\$2,$b,$b
	vpaddd	$t2,$e,$e			# e+=rol(a,5)
	 vpor	$tx,@Xi[1],@Xi[1]		# rol(@X[1],1)
	vpor	$t1,$b,$b			# b=rol(b,30)
___
# Rotate the message-register ring so that @Xi[0] is the next round's word.
push(@Xi,shift(@Xi));
}
1032
# Head of the AVX function; _avx_shortcut is the label that
# sha1_multi_block branches to.
$code.=<<___;
.type	sha1_multi_block_avx,\@function,3
.align	32
sha1_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
# Only when the assembler can also encode AVX2: move the upper half of the
# capability word (loaded into %rcx at the sha1_multi_block entry) into
# %ecx and, provided $num is at least 2 and bit 5 is set, continue at
# _avx2_shortcut; otherwise fall into the 128-bit code at .Lavx.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
# Prologue: remember the incoming stack pointer in %rax and save %rbx and
# %rbp.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: save %xmm6..%xmm15 below the pushes.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out a 256-byte aligned frame of $REG_SZ*18 bytes.  Slot
# $REG_SZ*17 keeps the original %rsp and slot $REG_SZ*17+8 the caller's
# $num; %rbx points at the per-lane block counters at $REG_SZ*16.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx

	vzeroupper
.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# For each of the four lanes: load the input pointer and block count from
# descriptor $i, keep the largest count in $num, store the count as the
# lane's counter, and point a lane with a non-positive count at $Tbl.
for($i=0;$i<4;$i++) {
    # pointer_register() comes from x86_64-support.pl, which is not part of
    # this file; presumably it returns the name of @ptr[$i] at the pointer
    # width of $flavour -- confirm there.
    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
    $code.=<<___;
	# input pointer
	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
	# number of blocks
	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Nothing to do when every count is zero or negative.  Otherwise load the
# five state vectors and the byte-swap mask, set %rax to the frame base
# biased by 128 (the base Xi_off() uses for the lower part of the message
# ring), and enter the per-block loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00($ctx),$A			# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20($ctx),$B
	vmovdqu	0x40($ctx),$C
	vmovdqu	0x60($ctx),$D
	vmovdqu	0x80($ctx),$E
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	jmp	.Loop_avx

.align	32
.Loop_avx:
___
# Emit the 80 rounds of one block iteration, phase by phase.  Each entry
# names the first round index past the phase, the generator for its
# rounds, and the instruction that loads $K with the phase constant
# before the phase starts.
$i = 0;
foreach my $phase (
    [ 20, \&BODY_00_19_avx, "	vmovdqa	-0x20($Tbl),$K\n" ],	# K_00_19
    [ 40, \&BODY_20_39_avx, "	vmovdqa	0x00($Tbl),$K\n" ],	# K_20_39
    [ 60, \&BODY_40_59_avx, "	vmovdqa	0x20($Tbl),$K\n" ],	# K_40_59
    [ 80, \&BODY_20_39_avx, "	vmovdqa	0x40($Tbl),$K\n" ],	# K_60_79
) {
    my ($end, $emit_round, $load_K) = @{$phase};

    $code .= $load_K;
    while ($i < $end) {
	$emit_round->($i, @V);
	unshift(@V, pop(@V));		# rotate the working-variable roles
	$i++;
    }
}
# End of one block iteration: %ecx = 1 is the threshold the lane counters
# are compared against below.
$code.=<<___;
	mov	\$1,%ecx
___
# A lane whose counter is not above 1 gets its input pointer redirected
# to $Tbl.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Build a mask of lanes with a positive counter, decrement the counters
# through it, mask the freshly computed state and add it to the stored
# context.  After the inner loop the context and descriptor pointers
# advance to the next group of four and the outer loop repeats while the
# saved $num is not exhausted.
$code.=<<___;
	vmovdqu	(%rbx),$t0			# pull counters
	vpxor	$t2,$t2,$t2
	vmovdqa	$t0,$t1
	vpcmpgtd $t2,$t1,$t1			# mask value
	vpaddd	$t1,$t0,$t0			# counters--

	vpand	$t1,$A,$A
	vpand	$t1,$B,$B
	vpaddd	0x00($ctx),$A,$A
	vpand	$t1,$C,$C
	vpaddd	0x20($ctx),$B,$B
	vpand	$t1,$D,$D
	vpaddd	0x40($ctx),$C,$C
	vpand	$t1,$E,$E
	vpaddd	0x60($ctx),$D,$D
	vpaddd	0x80($ctx),$E,$E
	vmovdqu	$A,0x00($ctx)
	vmovdqu	$B,0x20($ctx)
	vmovdqu	$C,0x40($ctx)
	vmovdqu	$D,0x60($ctx)
	vmovdqu	$E,0x80($ctx)

	vmovdqu	$t0,(%rbx)			# save counters
	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
# Win64 only: restore %xmm6..%xmm15 from the save area below the
# original stack pointer.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Common epilogue: reload %rbp and %rbx from their push slots, restore
# the original stack pointer and return.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_multi_block_avx,.-sha1_multi_block_avx
___
1195
# AVX2 flavour, emitted only when $avx>1.  It generates
# sha1_multi_block_avx2, which works on %ymm registers and walks eight
# input lanes (every per-lane loop below runs $i from 0 to 7).  The round
# bodies come from the same BODY_*_avx generators the loops below call;
# those generators are defined outside this chunk.
1196						if ($avx>1) {
# Evaluate every `...` expression already accumulated in $code.  This runs
# before $REG_SZ and the register-name variables are re-assigned below.
1197$code =~ s/\`([^\`]*)\`/eval $1/gem;
1198
# Size unit for the frame arithmetic below ($REG_SZ*16, *17 and *18).
1199$REG_SZ=32;
1200
# Eight per-lane input pointers: %r12..%r15 followed by %r8..%r11.
1201@ptr=map("%r$_",(12..15,8..11));
1202
# Register allocation: A..E in %ymm0-4 (also kept as the rotating list @V),
# temporaries in %ymm5-9, @Xi in %ymm10-14 and the round constant in %ymm15.
# NOTE(review): @Xi is only consumed by the BODY_*_avx generators, which are
# not visible here - presumably the message-schedule window; confirm there.
1203@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
1204($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
1205@Xi=map("%ymm$_",(10..14));
1206$K="%ymm15";
1207
# Prologue: %rax keeps the caller's %rsp while %rbx, %rbp and %r12-%r15
# are pushed.
1208$code.=<<___;
1209.type	sha1_multi_block_avx2,\@function,3
1210.align	32
1211sha1_multi_block_avx2:
1212.cfi_startproc
1213_avx2_shortcut:
1214	mov	%rsp,%rax
1215.cfi_def_cfa_register	%rax
1216	push	%rbx
1217.cfi_push	%rbx
1218	push	%rbp
1219.cfi_push	%rbp
1220	push	%r12
1221.cfi_push	%r12
1222	push	%r13
1223.cfi_push	%r13
1224	push	%r14
1225.cfi_push	%r14
1226	push	%r15
1227.cfi_push	%r15
1228___
# Win64 only: reserve 0xa8 bytes below the six pushed registers and spill
# %xmm6-%xmm15 there.  The first six are addressed from %rsp, the last four
# from %rax; relative to %rax the slots run from -0xd8 up to -0x48, which
# are the offsets the Win64 restore sequence further down reads back.
1229$code.=<<___ if ($win64);
1230	lea	-0xa8(%rsp),%rsp
1231	movaps	%xmm6,(%rsp)
1232	movaps	%xmm7,0x10(%rsp)
1233	movaps	%xmm8,0x20(%rsp)
1234	movaps	%xmm9,0x30(%rsp)
1235	movaps	%xmm10,0x40(%rsp)
1236	movaps	%xmm11,0x50(%rsp)
1237	movaps	%xmm12,-0x78(%rax)
1238	movaps	%xmm13,-0x68(%rax)
1239	movaps	%xmm14,-0x58(%rax)
1240	movaps	%xmm15,-0x48(%rax)
1241___
# Frame set-up: allocate $REG_SZ*18 bytes, align %rsp down to 256 and store
# the caller's %rsp at $REG_SZ*17(%rsp).  $num is halved, $Tbl is pointed at
# K_XX_XX, $num is saved at $REG_SZ*17+8(%rsp) and then cleared so the lane
# loop can compute a maximum, and %rbx is pointed at the counter area at
# $REG_SZ*16(%rsp).
1242$code.=<<___;
1243	sub	\$`$REG_SZ*18`, %rsp
1244	and	\$-256,%rsp
1245	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
1246.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
1247.Lbody_avx2:
1248	lea	K_XX_XX(%rip),$Tbl
1249	shr	\$1,$num
1250
1251	vzeroupper
1252.Loop_grande_avx2:
1253	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
1254	xor	$num,$num
1255	lea	`$REG_SZ*16`(%rsp),%rbx
1256___
# For each of the eight lanes: load the input pointer and the block count
# from the descriptor at $inp, keep the largest count in $num, store the
# count as the lane's counter at 4*$i(%rbx), and replace the pointer of any
# lane whose count is <= 0 with $Tbl ("cancel input").  $inp_elm_size,
# $ptr_size, $flavour and pointer_register() are defined outside this chunk.
1257for($i=0;$i<8;$i++) {
1258    $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1259    $code.=<<___;
1260	# input pointer
1261	mov	`$inp_elm_size*$i+0`($inp),$ptr_reg
1262	# number of blocks
1263	mov	`$inp_elm_size*$i+$ptr_size`($inp),%ecx
1264	cmp	$num,%ecx
1265	cmovg	%ecx,$num			# find maximum
1266	test	%ecx,%ecx
1267	mov	%ecx,`4*$i`(%rbx)		# initialize counters
1268	cmovle	$Tbl,@ptr[$i]			# cancel input
1269___
1270}
# Load the five context vectors and the byte-swap mask, set %rax and %rbx
# to offsets 128 and 256+128 inside the frame, and enter the per-block loop.
1271$code.=<<___;
1272	vmovdqu	0x00($ctx),$A			# load context
1273	 lea	128(%rsp),%rax
1274	vmovdqu	0x20($ctx),$B
1275	 lea	256+128(%rsp),%rbx
1276	vmovdqu	0x40($ctx),$C
1277	vmovdqu	0x60($ctx),$D
1278	vmovdqu	0x80($ctx),$E
1279	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1280	jmp	.Loop_avx2
1281
1282.align	32
1283.Loop_avx2:
1284___
# 80 rounds in four groups of 20; each group first loads its constant from
# the table at $Tbl.  @V is rotated right by one position after every
# round, so after 80 rounds (a multiple of 5) it is back in its original
# order.  Rounds 60-79 reuse the BODY_20_39_avx generator.
1285$code.="	vmovdqa	-0x20($Tbl),$K\n";	# K_00_19
1286for($i=0;$i<20;$i++)	{ &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1287$code.="	vmovdqa	0x00($Tbl),$K\n";	# K_20_39
1288for(;$i<40;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1289$code.="	vmovdqa	0x20($Tbl),$K\n";	# K_40_59
1290for(;$i<60;$i++)	{ &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1291$code.="	vmovdqa	0x40($Tbl),$K\n";	# K_60_79
1292for(;$i<80;$i++)	{ &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
# End of one block: %rbx is pointed back at the counter area and %ecx is
# set to 1 for the comparison in the loop below.
1293$code.=<<___;
1294	mov	\$1,%ecx
1295	lea	`$REG_SZ*16`(%rsp),%rbx
1296___
# Lanes whose counter is <= 1 get their input pointer replaced by $Tbl.
1297for($i=0;$i<8;$i++) {
1298    $code.=<<___;
1299	cmp	`4*$i`(%rbx),%ecx		# examine counters
1300	cmovge	$Tbl,@ptr[$i]			# cancel input
1301___
1302}
# Build a per-lane mask that is all-ones where the counter is > 0, add it
# to the counters (i.e. decrement the live ones), AND it into A..E and add
# the stored context on top, so lanes with a zero counter write back their
# context unchanged.  The loop repeats while $num (the maximum block count)
# is non-zero.  The outer "grande" iteration is present only as
# commented-out assembly in this flavour.
1303$code.=<<___;
1304	vmovdqu	(%rbx),$t0		# pull counters
1305	vpxor	$t2,$t2,$t2
1306	vmovdqa	$t0,$t1
1307	vpcmpgtd $t2,$t1,$t1			# mask value
1308	vpaddd	$t1,$t0,$t0			# counters--
1309
1310	vpand	$t1,$A,$A
1311	vpand	$t1,$B,$B
1312	vpaddd	0x00($ctx),$A,$A
1313	vpand	$t1,$C,$C
1314	vpaddd	0x20($ctx),$B,$B
1315	vpand	$t1,$D,$D
1316	vpaddd	0x40($ctx),$C,$C
1317	vpand	$t1,$E,$E
1318	vpaddd	0x60($ctx),$D,$D
1319	vpaddd	0x80($ctx),$E,$E
1320	vmovdqu	$A,0x00($ctx)
1321	vmovdqu	$B,0x20($ctx)
1322	vmovdqu	$C,0x40($ctx)
1323	vmovdqu	$D,0x60($ctx)
1324	vmovdqu	$E,0x80($ctx)
1325
1326	vmovdqu	$t0,(%rbx)			# save counters
1327	lea	256+128(%rsp),%rbx
1328	vmovdqu	0x60($Tbl),$tx			# pbswap_mask
1329	dec	$num
1330	jnz	.Loop_avx2
1331
1332	#mov	`$REG_SZ*17+8`(%rsp),$num
1333	#lea	$REG_SZ($ctx),$ctx
1334	#lea	`$inp_elm_size*$REG_SZ/4`($inp),$inp
1335	#dec	$num
1336	#jnz	.Loop_grande_avx2
1337
1338.Ldone_avx2:
1339	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1340.cfi_def_cfa	%rax,8
1341	vzeroupper
1342___
# Win64 only: reload %xmm6-%xmm15 from the slots filled in the prologue.
1343$code.=<<___ if ($win64);
1344	movaps	-0xd8(%rax),%xmm6
1345	movaps	-0xc8(%rax),%xmm7
1346	movaps	-0xb8(%rax),%xmm8
1347	movaps	-0xa8(%rax),%xmm9
1348	movaps	-0x98(%rax),%xmm10
1349	movaps	-0x88(%rax),%xmm11
1350	movaps	-0x78(%rax),%xmm12
1351	movaps	-0x68(%rax),%xmm13
1352	movaps	-0x58(%rax),%xmm14
1353	movaps	-0x48(%rax),%xmm15
1354___
# Epilogue: reload the six pushed registers relative to the saved %rsp held
# in %rax, restore %rsp and return.
1355$code.=<<___;
1356	mov	-48(%rax),%r15
1357.cfi_restore	%r15
1358	mov	-40(%rax),%r14
1359.cfi_restore	%r14
1360	mov	-32(%rax),%r13
1361.cfi_restore	%r13
1362	mov	-24(%rax),%r12
1363.cfi_restore	%r12
1364	mov	-16(%rax),%rbp
1365.cfi_restore	%rbp
1366	mov	-8(%rax),%rbx
1367.cfi_restore	%rbx
1368	lea	(%rax),%rsp
1369.cfi_def_cfa_register	%rsp
1370.Lepilogue_avx2:
1371	ret
1372.cfi_endproc
1373.size	sha1_multi_block_avx2,.-sha1_multi_block_avx2
1374___
1375						}	}}}
# Read-only constant table.  The K_XX_XX label sits 32 bytes into the table,
# so code that has loaded K_XX_XX into $Tbl addresses the rows as
#	-0x20($Tbl)	K_00_19
#	 0x00($Tbl)	K_20_39
#	 0x20($Tbl)	K_40_59
#	 0x40($Tbl)	K_60_79
#	 0x60($Tbl)	pbswap mask (loaded into $tx)
# Each constant is written as two identical 16-byte .long rows, so an entry
# is 32 bytes wide and can be fetched with a single %ymm load.
# NOTE(review): the 16-byte .byte row (0xf down to 0x0) and the .asciz banner
# have no user inside this chunk; the .byte row is presumably a byte-reversal
# mask for a code path defined earlier in the file - confirm there.
1376$code.=<<___;
1377.section .rodata align=256
1378.align	256
1379	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1380	.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1381K_XX_XX:
1382	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1383	.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1384	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1385	.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1386	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1387	.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1388	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
1389	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
1390	.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1391	.asciz	"SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1392.previous
1393___
1394
# Win64 structured-exception-handling support: two language-specific
# handlers that let the OS unwind through the functions generated above,
# followed by the .pdata/.xdata records that register them.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: used by the functions whose frame keeps the caller's %rsp at
# 16*17(%rsp) and that save %rbx, %rbp and %xmm6-%xmm15.  HandlerData[] holds
# the body and epilogue labels; when context->Rip lies outside that range the
# frame is not (or no longer) set up and only the common tail at .Lin_prologue
# runs.  Otherwise the saved stack pointer is fetched from the frame, %rbx and
# %rbp are written back into the CONTEXT and the 20 quadwords of saved
# %xmm6-%xmm15 are copied to context.Xmm6 before falling into the tail.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# avx2_handler: same scheme for sha1_multi_block_avx2, which additionally
# saves %r12-%r15 and keeps the caller's %rsp at 32*17(%rsp).  The saved
# stack pointer therefore has to be read relative to context->Rsp, which is
# in %rax at that point (exactly as se_handler does); indexing the CONTEXT
# record itself with that offset would fetch bytes from its XMM save area
# instead.  The handler finishes through se_handler's .Lin_prologue tail.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) triple per generated function.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_multi_block
	.rva	.LSEH_end_sha1_multi_block
	.rva	.LSEH_info_sha1_multi_block
	.rva	.LSEH_begin_sha1_multi_block_shaext
	.rva	.LSEH_end_sha1_multi_block_shaext
	.rva	.LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_multi_block_avx
	.rva	.LSEH_end_sha1_multi_block_avx
	.rva	.LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_multi_block_avx2
	.rva	.LSEH_end_sha1_multi_block_avx2
	.rva	.LSEH_info_sha1_multi_block_avx2
___
# .xdata: each record names its handler and passes the function's body and
# epilogue labels as HandlerData[0] and HandlerData[1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1590####################################################################
1591
# Prepend a REX prefix byte to an opcode byte list when one is required.
#
#   $bytes  reference to the list of opcode bytes; modified in place
#   $reg    number of the register encoded in the ModR/M reg field;
#           8 or above sets bit 0x04 of the prefix
#   $rm     number of the register encoded in the ModR/M r/m field;
#           8 or above sets bit 0x01 of the prefix
#
# When both register numbers are below 8 the list is left untouched.
sub rex {
    my ($bytes,$reg,$rm)=@_;
    my $prefix=($reg>=8 ? 0x04 : 0)|($rm>=8 ? 0x01 : 0);

    unshift @$bytes,$prefix|0x40	if ($prefix);
}
1601
# Hand-assemble "sha1rnds4 $imm,%xmmS,%xmmD" into a .byte directive.
#
# The single argument is the operand text that followed the mnemonic.
# Operands that are not an immediate followed by two %xmm registers are not
# encoded; the instruction is then returned as "sha1rnds4\t<operands>".
sub sha1rnds4 {
    my ($operands)=@_;

    return "sha1rnds4\t".$operands
	unless ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/);
    my ($imm,$src,$dst)=($1,$2,$3);

    my @bytes=(0x0f,0x3a,0xcc);
    rex(\@bytes,$dst,$src);
    push @bytes,0xc0|($src&7)|(($dst&7)<<3);		# ModR/M
    # an immediate with a leading 0 (0x.., 0..) goes through oct(),
    # anything else is emitted verbatim
    push @bytes,($imm=~/^0/ ? oct($imm) : $imm);

    return ".byte\t".join(',',@bytes);
}
1614
# Hand-assemble the register-to-register forms of sha1nexte, sha1msg1 and
# sha1msg2 (0x0f,0x38 opcode map) into a .byte directive.
#
#   $mnemonic  instruction name as matched in the generated text
#   $operands  operand text that followed it
#
# An unknown mnemonic, or operands that are not two %xmm registers, are not
# encoded; the instruction is returned as "<mnemonic>\t<operands>".
sub sha1op38 {
    my ($mnemonic,$operands)=@_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);
    my $opbyte=$opcodelet{$mnemonic};

    return $mnemonic."\t".$operands
	unless (defined($opbyte) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/);
    my ($src,$dst)=($1,$2);

    my @bytes=(0x0f,0x38);
    rex(\@bytes,$dst,$src);
    push @bytes,$opbyte,0xc0|($src&7)|(($dst&7)<<3);	# opcode, ModR/M

    return ".byte\t".join(',',@bytes);
}
1632
# Final pass: walk the generated text line by line, post-process each line
# and print it to STDOUT.
for my $line (split("\n",$code)) {
	# compute whatever arithmetic is still wrapped in back-ticks
	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	# At most one of the rewrites below is applied to a line: the chain
	# stops at the first substitution that matches.  SHA instructions go
	# to the hand-encoders above; in the listed AVX instructions a %ymm
	# operand name is replaced by the matching %xmm one.
	$line =~ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge			or
	$line =~ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge			or

	$line =~ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g			or
	$line =~ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/g			or
	$line =~ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/g	or
	$line =~ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g		or
	$line =~ s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/g			or
	$line =~ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/g;

	print $line,"\n";
}

# buffered write errors only surface when the handle is closed
close STDOUT or die "error closing STDOUT: $!";
1650