#! /usr/bin/env perl
# Copyright 2013-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in the context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, 0.74 - on
# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
# measurements for favourable packet size, one divisible by 96.
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

# Command-line handling.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Either one is left undef when the corresponding argument is absent.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# $win64 is 1 when the flavour names nasm/masm/mingw64 or the output file
# ends in ".asm", and 0 otherwise.  It gates the %xmm6-%xmm15 save/restore
# code and the SEH tables emitted further down.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator.  $dir receives the directory part
# of $0 including its trailing "/" or "\".  The translator is looked for
# next to this script first and then under ../../perlasm/ relative to it;
# generation is aborted if neither file exists.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler that will consume the output and derive $avx from its
# version.  Each probe yields the sum of two threshold tests, i.e. 0, 1 or 2;
# the real implementation below is emitted only when $avx is greater than 1,
# otherwise stub entry points are generated.  Later probes run only while
# $avx is still false.
#
# NOTE: the captured versions are compared as decimal numbers, not as dotted
# version tuples.

# GNU assembler, queried through the compiler driver named by $ENV{CC}.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.20) + ($1>=2.22);
}

# NASM, considered only for Win64 builds that ask for it via flavour or
# $ENV{ASM}.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64, considered only for Win64 builds that ask for it via
# flavour or $ENV{ASM}; only the major version is examined.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang/LLVM-based compiler: 3.0 gives level 1, anything newer level 2.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Start the translator with the running perl binary ($^X), passing $flavour
# and $output as its arguments, and alias STDOUT to the pipe so that the
# final "print $code" feeds it.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Real implementation; emitted only when the assembler probe gave $avx > 1.
if ($avx>1) {{{

# Argument registers, in the order of the aesni_gcm_[en|de]crypt prototype:
# input, output, length, key schedule, counter block, hash context.
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# %xmm0-%xmm8: GHASH working set ($Ii input block, $T1/$T2 temporaries,
# $Hkey hash key power, $Z0-$Z3 partial products, $Xi hash value).  Several
# of them are borrowed for other purposes, as noted at the points of use.
($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

# %xmm9-%xmm15: six AES blocks processed in parallel plus the round key.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

# General purpose registers: $counter holds the last 32 bits of the counter
# block, $rounds the value read from offset 0xf0 of the key schedule, $ret
# the byte count returned to the caller, $const the address of .Lbswap_mask,
# $in0/$end0 the cursor and limit used by the movbe loads.
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

######################################################################
# _aesni_ctr32_ghash_6x - stitched AES-CTR + GHASH loop called by both
# aesni_gcm_encrypt and aesni_gcm_decrypt.
#
# Every .Loop6x iteration runs six counter blocks ($inout0-$inout5)
# through the AES rounds and combines them with 0x60 bytes read from
# $inp (the input is XORed into the last round key fed to vaesenclast).
# The vpclmulqdq/vpxor instructions interleaved with the vaesenc stream
# multiply the byte-swapped blocks staged on the stack (plus $Z3 and
# $Xi) by $Hkey^1..$Hkey^6 and reduce the result with .Lpoly.
#
# State expected from the callers:
#   $inp,$out   advanced by 0x60 per iteration
#   $len        block count; the loop exits when "sub 6" borrows
#   $key        key schedule pointer biased by +0x80, hence the -0x80
#               in every round key offset
#   $rounds     compared with 11 to pick the 128/192/256-bit tail
#   $ivp        receives the next counter block on every iteration
#   $counter    last 32 bits of the counter block as loaded from memory;
#               "add 6<<24" carries when the final counter byte would
#               wrap within the next six increments, and in that case
#               .Lhandle_ctr32 recomputes the blocks with byte-swapped
#               32-bit additions instead of vpaddb on the final byte
#   $Xip        hash context pointer biased by the callers (0x20+0x20)
#   $const      address of .Lbswap_mask; the other constants are
#               reached through fixed offsets from it
#   $in0,$end0  cursor and limit for the movbe loads that stage
#               byte-swapped data on the stack for the next iteration;
#               $in0 moves on by 0x60 only while it has not passed $end0
#   $T1         current counter block
#   $Z3         byte-swapped I[5]
#   $Xi         current hash value
#   $ret        incremented by 0x60 per iteration
#
# On return $Xi holds the updated hash value and $inout0-$inout5 hold the
# last six output blocks, which the caller still has to store.
######################################################################
$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3, $inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,$ret
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.cfi_endproc
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# aesni_gcm_decrypt
#
# Returns the value accumulated in $ret by _aesni_ctr32_ghash_6x (0x60
# per processed group of six blocks), or 0 without touching anything
# when len is below 0x60.
#
# Outline:
#   - save %rbx, %rbp, %r12-%r15 (and %xmm6-%xmm15 on Win64) relative to
#     the entry stack pointer kept in %rax;
#   - reserve an aligned 128-byte scratch area and, when it would land
#     within 768 bytes of the key schedule modulo 0xf80, move the stack
#     further down ("avoid aliasing with key");
#   - bias $key by 0x80 and $Xip by 0x20+0x20 ("size optimization");
#   - byte-swap the first six input blocks: I[5] stays in $Z3, the other
#     five are stored at 0x30..0x70(%rsp) for the stitched loop;
#   - call _aesni_ctr32_ghash_6x, store the six output blocks it leaves
#     in $inout0-$inout5, byte-swap $Xi back and write it to the start
#     of the hash context.
######################################################################
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xor	$ret,$ret
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax			# save stack pointer
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: %xmm6-%xmm15 are saved below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		($Xip),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20+0x20($Xip),$Xip	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	lea		($inp),$in0
	vmovdqu		0x40($inp),$Z0
	lea		-0xc0($inp,$len),$end0
	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		$ret,$ret
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

######################################################################
# _aesni_ctr32_6x - plain CTR helper used by aesni_gcm_encrypt.
#
# Encrypts six counter blocks starting from $T1, combines them with
# 0x60 bytes at $inp, stores the result at $out and advances both
# pointers by 0x60.  The output blocks are also left in
# $inout0-$inout5 and the next counter block in $T1.  %r12 walks the
# key schedule and %r13 counts rounds down from $rounds-1.  As in the
# stitched loop, a carry from "add 6<<24" on $counter selects
# .Lhandle_ctr32_2, which derives the counters with byte-swapped 32-bit
# additions; that path relies on $Ii holding .Lbswap_mask.
#
# aesni_gcm_encrypt
#
# Same prototype and return convention as aesni_gcm_decrypt, but the
# minimal accepted length is 0x60*3 bytes.  Outline:
#   - same prologue, stack placement and pointer biasing as in
#     aesni_gcm_decrypt;
#   - two calls to _aesni_ctr32_6x produce the first twelve output
#     blocks; the byte-swapped copy of the first six is staged on the
#     stack (the last of them in $Z3) for the stitched loop;
#   - _aesni_ctr32_ghash_6x is entered with $len reduced by 12 blocks
#     and $ret preset to 0x60*2;
#   - the straight-line GHASH tail then hashes the six blocks still
#     staged on the stack followed by the six output blocks returned in
#     $inout0-$inout5, and writes the byte-swapped $Xi back to the hash
#     context.
######################################################################
$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
.cfi_startproc
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.cfi_endproc
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@function,6
.align	32
aesni_gcm_encrypt:
.cfi_startproc
	xor	$ret,$ret
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	lea	(%rsp),%rax			# save stack pointer
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: %xmm6-%xmm15 are saved below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Lenc_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Lenc_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea		($out),$in0
	lea		-0xc0($out,$len),$end0
	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	vmovdqu		($Xip),$Xi		# load Xi
	lea		0x20+0x20($Xip),$Xip	# size optimization
	sub		\$12,$len
	mov		\$0x60*2,$ret
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
# GHASH tail.  $HK and $T3 are local aliases: $HK reuses $rndkey for the
# values loaded from the hash key table at the 0x20, 0x50 and 0x80 slots,
# $T3 reuses $inout0, whose byte-swapped contents were just saved to
# 0x10(%rsp).
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Xip),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
___
}
$code.=<<___;
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 only: restore %xmm6-%xmm15.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

# Constant pool.  The routines above load $const with the address of
# .Lbswap_mask and reach the other entries through fixed offsets from it,
# so the order and the 16-byte size of each entry must not change:
#   0x00 .Lbswap_mask  byte-reversal mask for vpshufb
#   0x10 .Lpoly        constant used in the two reduction phases
#   0x20 .Lone_msb     1 in the last byte, added with vpaddb
#   0x30 .Ltwo_lsb     2 in the first byte, added with vpaddd
#   0x40 .Lone_lsb     1 in the first byte, added with vpaddd
$code.=<<___;
.section .rodata align=64
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.previous
.align	64
___
# Win64 structured exception handling support, emitted only for Win64
# flavours.
#
# gcm_se_handler is the language-specific handler referenced from the
# .xdata records of aesni_gcm_decrypt and aesni_gcm_encrypt.  HandlerData
# holds two image-relative addresses, the .Lgcm_*_body and .Lgcm_*_abort
# labels.  When the faulting Rip lies between them, the handler takes the
# entry stack pointer from context->Rax, copies the saved %rbx, %rbp and
# %r12-%r15 as well as the ten saved XMM registers back into the CONTEXT
# record, and then lets RtlVirtualUnwind continue the unwind.  Outside that
# range only the stack pointer, Rsi and Rdi fields are refreshed.
if ($win64) {
# Handler arguments, in Win64 argument register order.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
# Fallback for assemblers that cannot encode the instructions used above:
# both entry points are still defined, but they return 0 immediately.
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

# Replace every `...` span in the generated text with the result of
# evaluating it as a Perl expression (this is what turns `6<<24` into a
# number).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Hand the result to the translator through the STDOUT pipe.
print $code;

# Closing the pipe waits for the translator and reports its failure, if any.
close STDOUT or die "error closing STDOUT: $!";
