xref: /openssl/crypto/aes/asm/bsaes-x86_64.pl (revision 7ed6de99)
1#! /usr/bin/env perl
2# Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10###################################################################
11### AES-128 [originally in CTR mode]				###
12### bitsliced implementation for Intel Core 2 processors	###
13### requires support of SSE extensions up to SSSE3		###
14### Author: Emilia Käsper and Peter Schwabe			###
15### Date: 2009-03-19						###
16### Public domain						###
17###								###
18### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
19### further information.					###
20###################################################################
21#
22# September 2011.
23#
24# Started as transliteration to "perlasm" the original code has
25# undergone following changes:
26#
27# - code was made position-independent;
28# - rounds were folded into a loop resulting in >5x size reduction
29#   from 12.5KB to 2.2KB;
30# - above was possible thanks to mixcolumns() modification that
31#   allowed to feed its output back to aesenc[last], this was
32#   achieved at cost of two additional inter-registers moves;
33# - some instruction reordering and interleaving;
34# - this module doesn't implement key setup subroutine, instead it
35#   relies on conversion of "conventional" key schedule as returned
36#   by AES_set_encrypt_key (see discussion below);
37# - first and last round keys are treated differently, which allowed
38#   to skip one shiftrows(), reduce bit-sliced key schedule and
39#   speed-up conversion by 22%;
40# - support for 192- and 256-bit keys was added;
41#
42# Resulting performance in CPU cycles spent to encrypt one byte out
43# of 4096-byte buffer with 128-bit key is:
44#
45#		Emilia's	this(*)		difference
46#
47# Core 2    	9.30		8.69		+7%
48# Nehalem(**) 	7.63		6.88		+11%
49# Atom	    	17.1		16.4		+4%
50# Silvermont	-		12.9
51# Goldmont	-		8.85
52#
53# (*)	Comparison is not completely fair, because "this" is ECB,
54#	i.e. no extra processing such as counter values calculation
55#	and xor-ing input as in Emilia's CTR implementation is
56#	performed. However, the CTR calculations stand for not more
57#	than 1% of total time, so comparison is *rather* fair.
58#
59# (**)	Results were collected on Westmere, which is considered to
60#	be equivalent to Nehalem for this code.
61#
62# As for key schedule conversion subroutine. Interface to OpenSSL
63# relies on per-invocation on-the-fly conversion. This naturally
64# has impact on performance, especially for short inputs. Conversion
65# time in CPU cycles and its ratio to CPU cycles spent in 8x block
66# function is:
67#
68# 		conversion	conversion/8x block
69# Core 2	240		0.22
70# Nehalem	180		0.20
71# Atom		430		0.20
72#
73# The ratio values mean that 128-byte blocks will be processed
74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75# etc. Then keep in mind that input sizes not divisible by 128 are
76# *effectively* slower, especially shortest ones, e.g. consecutive
77# 144-byte blocks are processed 44% slower than one would expect,
78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79# it's still faster than ["hyper-threading-safe" code path in]
80# aes-x86_64.pl on all lengths above 64 bytes...
81#
82# October 2011.
83#
84# Add decryption procedure. Performance in CPU cycles spent to decrypt
85# one byte out of 4096-byte buffer with 128-bit key is:
86#
87# Core 2	9.98
88# Nehalem	7.80
89# Atom		17.9
90# Silvermont	14.0
91# Goldmont	10.2
92#
93# November 2011.
94#
95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96# suboptimal, but XTS is meant to be used with larger blocks...
97#
98#						<appro@openssl.org>
99
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows targets are recognized by assembler flavour or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe our output through the translator; STDOUT is re-aliased so that
# the final "print $code" (outside this chunk) lands in the pipe.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Argument registers of the public entry points.
# NOTE(review): only four values are supplied on the right, so $ivp is
# undef here — presumably assigned/used by IV-taking code outside this
# chunk; confirm before relying on it.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
119
120{
121my ($key,$rounds,$const)=("%rax","%r10d","%r11");
122
sub Sbox {
# Bit-sliced AES S-box: linear basis change in (InBasisChange), shared
# GF(2^8) inversion (Inv_GF256), then the output basis change.  The
# index permutations route Inv_GF256's shuffled outputs back into the
# register order OutBasisChange expects.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}
133
sub InBasisChange {
# Emit the xor network converting the bit-sliced state into the basis
# used by Inv_GF256.  Arguments are 8 xmm register names; generated
# instructions are appended to the global $code buffer.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor 	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}
156
sub OutBasisChange {
# Emit the xor network converting Inv_GF256's output basis back to the
# AES state basis.  Arguments are 8 xmm register names; generated
# instructions are appended to the global $code buffer.
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}
177
sub InvSbox {
# Bit-sliced AES inverse S-box.  Reuses the same Inv_GF256 core as
# Sbox; only the surrounding basis changes and register permutations
# differ.
# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}
188
sub InvInBasisChange {		# OutBasisChange in reverse
# Undo OutBasisChange: the same pxor network emitted bottom-to-top.
# The slice permutation below lines the 8 xmm register arguments up so
# the instruction text mirrors OutBasisChange exactly.  Appends to the
# global $code buffer.
# (Fix: terminate the heredoc assignment with ';' like every other sub
# in this file — the original relied on it being the last statement.)
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___;
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}
207
sub InvOutBasisChange {		# InBasisChange in reverse
# Undo InBasisChange: the same xor network bottom-to-top, with the
# slice permutation below aligning the register arguments.  The
# space-indented instructions are interleaved from a different
# dependency chain.  Appends to the global $code buffer.
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	 pxor 	@b[0], @b[5]
	pxor	@b[7], @b[3]
	 pxor	@b[2], @b[6]
	 pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}
228
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
# Bit-sliced GF(2^2) multiply: (x1,x0) *= (y1,y0) with $t0 as scratch.
# All arguments are xmm register names; 8 instructions (per the cost
# note above) are appended to the global $code buffer.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor 	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}
245
sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
# Same network as Mul_GF4, except the final two pxor have their
# destinations swapped ($x1/$x0 instead of $x0/$x1), which folds the
# scaling into the multiply.  Kept for reference; the interleaved
# variant below is what Mul_GF16_2 actually uses.
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}
260
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
# The unindented instructions are the scaled multiply on ($x0,$x1) by
# ($y0,$y1); the space-indented ones are the plain Mul_GF4 on
# ($x2,$x3) by ($y2,$y3).  Interleaving hides instruction latency.
# Appends to the global $code buffer.
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	 movdqa	$y2, $t1
	pxor	$y1, $t0
	 pxor 	$y3, $t1
	pand	$x0, $t0
	 pand	$x2, $t1
	pxor	$x1, $x0
	 pxor	$x3, $x2
	pand	$y0, $x1
	 pand	$y2, $x3
	pand	$y1, $x0
	 pand	$y3, $x2
	pxor	$x0, $x1
	 pxor	$x3, $x2
	pxor	$t0, $x0
	 pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
# Two bit-sliced GF(16) multiplications built from the GF(4) helpers
# above, following the tower-field decomposition: the low half
# (@x[0..3]) and high half (@x[4..7]) are each multiplied by the
# four-slice operand @y, with @t as scratch.  Appends to $code.
# (Fix: call Mul_GF4_N_GF4 with the same '&' sigil used everywhere
# else in this file — the bareword call worked only because the sub
# happens to be defined earlier.)
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
# Bit-sliced inversion in GF(2^8) via the tower-field construction:
# a GF(16)-level combination, a compact GF(16) inversion (the "new
# smaller inversion" section) and a final Mul_GF16_2.  All 16
# arguments are xmm register names; output is appended to $code.
# The space-indented instructions are interleaved second chains.
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	 movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	 movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	 movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
433
434# AES linear components
435
sub ShiftRows {
# Xor the next round key (eight 16-byte bit-slices at $key, from the
# enclosing scope) into the state, then perform ShiftRows as a byte
# shuffle with the caller-supplied pshufb $mask register.  Advances
# $key by 0x80 to point at the following round's slices.
my @x=@_[0..7];
my $mask=pop;	# last argument is the pshufb mask register
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pxor	0x20($key),@x[2]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[0]
	pshufb	$mask,@x[1]
	pxor	0x40($key),@x[4]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[2]
	pshufb	$mask,@x[3]
	pxor	0x60($key),@x[6]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[4]
	pshufb	$mask,@x[5]
	pshufb	$mask,@x[6]
	pshufb	$mask,@x[7]
	lea	0x80($key),$key
___
}
459
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
# Bit-sliced MixColumns built from 32-bit and 64-bit word rotations
# (pshufd 0x93 = rotate dwords by one, 0x4E = swap qword halves).
# @x is the state, @t scratch; the optional $inv flag selects the
# alternative output routing used when called from InvMixColumns.
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16];	# optional
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	 pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	 pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	 pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	 pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	 pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	 pxor	@t[6], @x[6]
	 pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	 pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	 pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	 pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	 pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	 pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	 pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	 pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	 pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	 pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
___
# Forward path: route the results into the permuted order that
# aesenc[last] / the encrypt loop expects.
$code.=<<___ if (!$inv);
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	 movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	 movdqa	@t[1], @x[7]
___
# Inverse path (from InvMixColumns): same arithmetic, different
# destination routing plus extra moves to restore register order.
$code.=<<___ if ($inv);
	pxor	@x[4], @t[3]
	pxor	@t[7], @x[5]
	pxor	@x[3], @t[6]
	 movdqa	@t[0], @x[3]
	pxor	@t[2], @x[6]
	 movdqa	@t[6], @x[2]
	 movdqa	@t[1], @x[7]
	 movdqa	@x[6], @x[4]
	 movdqa	@t[3], @x[6]
___
}
527
sub InvMixColumns_orig {
# Original (unused) InvMixColumns: computes the 0x0e/0x0b/0x0d/0x09
# multiplications directly.  Superseded by InvMixColumns below, which
# factors the inverse matrix through MixColumns; kept for reference.
# NOTE(review): nothing in this chunk calls this sub — confirm it is
# intentionally retained.
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
# @y renames the 0x0e results into the order the 0x0b stage consumes.
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}
681
sub InvMixColumns {
# InvMixColumns via the factorization below: first multiply by the
# 0x05/0x00/0x04/0x00 circulant (qword-rotation based, pshufd 0x4E),
# then reuse the forward MixColumns with its $inv flag set.
my @x=@_[0..7];
my @t=@_[8..15];

# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |

$code.=<<___;
	# multiplication by 0x05-0x00-0x04-0x00
	pshufd	\$0x4E, @x[0], @t[0]
	pshufd	\$0x4E, @x[6], @t[6]
	pxor	@x[0], @t[0]
	pshufd	\$0x4E, @x[7], @t[7]
	pxor	@x[6], @t[6]
	pshufd	\$0x4E, @x[1], @t[1]
	pxor	@x[7], @t[7]
	pshufd	\$0x4E, @x[2], @t[2]
	pxor	@x[1], @t[1]
	pshufd	\$0x4E, @x[3], @t[3]
	pxor	@x[2], @t[2]
	 pxor	@t[6], @x[0]
	 pxor	@t[6], @x[1]
	pshufd	\$0x4E, @x[4], @t[4]
	pxor	@x[3], @t[3]
	 pxor	@t[0], @x[2]
	 pxor	@t[1], @x[3]
	pshufd	\$0x4E, @x[5], @t[5]
	pxor	@x[4], @t[4]
	 pxor	@t[7], @x[1]
	 pxor	@t[2], @x[4]
	pxor	@x[5], @t[5]

	 pxor	@t[7], @x[2]
	 pxor	@t[6], @x[3]
	 pxor	@t[6], @x[4]
	 pxor	@t[3], @x[5]
	 pxor	@t[4], @x[6]
	 pxor	@t[7], @x[4]
	 pxor	@t[7], @x[5]
	 pxor	@t[5], @x[7]
___
	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
}
729
sub aesenc {				# not used
# One full bit-sliced AES round (key-add + ShiftRows, Sbox,
# MixColumns) using the .LSR shuffle mask at $const+0x30.  Retained
# for reference; the live code inlines this sequence in .Lenc_loop.
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}
740
sub aesenclast {			# not used
# Final bit-sliced AES round: key-add + ShiftRows (with the .LSRM0
# mask), Sbox, then xor with the last round key — no MixColumns.  The
# round key slices are applied in the permuted order Sbox leaves the
# state in.  Retained for reference like aesenc above.
# (Fix: terminate the heredoc assignment with ';' like every other sub
# in this file — the original relied on it being the last statement.)
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
760
sub swapmove {
# Emit code swapping the $n-bit-wide bit groups selected by $mask
# between registers $x0 and $x1 (one step of a bit-matrix transpose);
# $t is scratch.  All arguments are xmm register names interpolated
# into the text appended to the global $code buffer.
# (Fix: the locals were named $a/$b, lexicalizing Perl's special sort
# variables — renamed; the emitted text is unchanged.)
my ($x0,$x1,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$x1,$t
	psrlq	\$$n,$x1
	pxor  	$x0,$x1
	pand	$mask,$x1
	pxor	$x1,$x0
	psllq	\$$n,$x1
	pxor	$t,$x1
___
}
sub swapmove2x {
# Two interleaved swapmove operations: ($a0,$b0) and ($a1,$b1) each
# get the $n-bit masked swap, with $t0/$t1 as scratch.  The
# space-indented instructions are the second pair's chain.
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	 movdqa	$b1,$t1
	 psrlq	\$$n,$b1
	pxor  	$a0,$b0
	 pxor  	$a1,$b1
	pand	$mask,$b0
	 pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	 pxor	$b1,$a1
	 psllq	\$$n,$b1
	pxor	$t0,$b0
	 pxor	$t1,$b1
___
}
792
sub bitslice {
# Convert 8 xmm registers of packed state into bit-sliced form (and,
# being an involution built from swaps, back again) via three rounds
# of masked swaps with shift widths 1, 2 and 4, using the .LBS0/1/2
# masks located at $const.
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
811
# Emit _bsaes_encrypt8/_bsaes_decrypt8: process 8 blocks (in
# @XMM[0..7]) through all rounds.  Caller contract per the enclosing
# scope's aliases: $key=%rax (bit-sliced schedule), $rounds=%r10d,
# $const=%r11 is loaded here.  Round count is decremented twice per
# iteration because the loop body covers two rounds' worth of
# dec/branch bookkeeping around the shared Sbox.
$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	# MixColumns consumes Sbox's permuted output order.
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
.cfi_startproc
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pxor	@XMM[9], @XMM[2]
	pxor	@XMM[9], @XMM[3]
	 pshufb	@XMM[8], @XMM[0]
	 pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[4]
	pxor	@XMM[9], @XMM[5]
	 pshufb	@XMM[8], @XMM[2]
	 pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[6]
	pxor	@XMM[9], @XMM[7]
	 pshufb	@XMM[8], @XMM[4]
	 pshufb	@XMM[8], @XMM[5]
	 pshufb	@XMM[8], @XMM[6]
	 pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	# InvMixColumns consumes InvSbox's permuted output order.
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.cfi_endproc
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
948}
949{
950my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
951
sub bitslice_key {
# Reduced bitslice for one round key.  The first two swap stages are
# replaced by plain register copies (see the commented-out swapmove
# calls) — apparently valid because the slices start out as duplicates
# of the same key; only the width-4 stage runs in full.
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
975
# Emit _bsaes_key_convert: convert a conventional AES key schedule
# (at $inp, $rounds rounds) into the bit-sliced layout at $out.  Round
# 0 is stored as-is; each subsequent key is shuffled by .LM0 and each
# byte expanded into eight mask registers by pand/pcmpeqb against the
# walking bit masks 0x01..0x80 (the psllq/psrlq \$4 pairs advance the
# masks).  Selected slices are complemented ("pnot") before storing.
# On return %xmm6 holds the unsaved last round key and %xmm7 = .L63.
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
.cfi_startproc
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0,	%xmm8
	movdqa	%xmm1,	%xmm9

	pand	%xmm6,	%xmm8
	pand	%xmm6,	%xmm9
	movdqa	%xmm2,	%xmm10
	pcmpeqb	%xmm0,	%xmm8
	psllq	\$4,	%xmm0		# 0x10...
	movdqa	%xmm3,	%xmm11
	pcmpeqb	%xmm1,	%xmm9
	psllq	\$4,	%xmm1		# 0x20...

	pand	%xmm6,	%xmm10
	pand	%xmm6,	%xmm11
	movdqa	%xmm0,	%xmm12
	pcmpeqb	%xmm2,	%xmm10
	psllq	\$4,	%xmm2		# 0x40...
	movdqa	%xmm1,	%xmm13
	pcmpeqb	%xmm3,	%xmm11
	psllq	\$4,	%xmm3		# 0x80...

	movdqa	%xmm2,	%xmm14
	movdqa	%xmm3,	%xmm15
	 pxor	%xmm5,	%xmm8		# "pnot"
	 pxor	%xmm5,	%xmm9

	pand	%xmm6,	%xmm12
	pand	%xmm6,	%xmm13
	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0,	%xmm12
	psrlq	\$4,	%xmm0		# 0x01...
	 movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1,	%xmm13
	psrlq	\$4,	%xmm1		# 0x02...
	 lea	0x10($inp), $inp

	pand	%xmm6,	%xmm14
	pand	%xmm6,	%xmm15
	 movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2,	%xmm14
	psrlq	\$4,	%xmm2		# 0x04...
	 movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3,	%xmm15
	psrlq	\$4,	%xmm3		# 0x08...
	 movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.cfi_endproc
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
1062}
1063
if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
# Dead code (the constant 0 above disables it): raw key-convert
# wrappers and fixed-10-round (AES-128) ECB entry points with no
# ABI/stack housekeeping; kept only for ad-hoc benchmarking.
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
# ECB interface block: emitted only when $ecb is set.  Processes 8 blocks per
# iteration through the bit-sliced core; <8 leftover blocks fall through a
# compare ladder, and <8 total blocks go one-at-a-time via asm_AES_encrypt.
1161{
1162######################################################################
1163#
1164# OpenSSL interface
1165#
1166my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1167						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1168my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1169
1170if ($ecb) {
1171$code.=<<___;
1172.globl	bsaes_ecb_encrypt_blocks
1173.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1174.align	16
1175bsaes_ecb_encrypt_blocks:
1176.cfi_startproc
1177	mov	%rsp, %rax
1178.Lecb_enc_prologue:
1179	push	%rbp
1180.cfi_push	%rbp
1181	push	%rbx
1182.cfi_push	%rbx
1183	push	%r12
1184.cfi_push	%r12
1185	push	%r13
1186.cfi_push	%r13
1187	push	%r14
1188.cfi_push	%r14
1189	push	%r15
1190.cfi_push	%r15
1191	lea	-0x48(%rsp),%rsp
1192.cfi_adjust_cfa_offset	0x48
1193___
# Win64 ABI: xmm6-xmm15 are non-volatile, so spill them in the frame.
1194$code.=<<___ if ($win64);
1195	lea	-0xa0(%rsp), %rsp
1196	movaps	%xmm6, 0x40(%rsp)
1197	movaps	%xmm7, 0x50(%rsp)
1198	movaps	%xmm8, 0x60(%rsp)
1199	movaps	%xmm9, 0x70(%rsp)
1200	movaps	%xmm10, 0x80(%rsp)
1201	movaps	%xmm11, 0x90(%rsp)
1202	movaps	%xmm12, 0xa0(%rsp)
1203	movaps	%xmm13, 0xb0(%rsp)
1204	movaps	%xmm14, 0xc0(%rsp)
1205	movaps	%xmm15, 0xd0(%rsp)
1206.Lecb_enc_body:
1207___
# Main body: allocate and convert the bit-sliced key schedule on the stack,
# then loop over 8-block batches.  Note the permuted register order when
# writing output (0,1,4,6,3,7,2,5) — that is the order _bsaes_encrypt8
# leaves the blocks in.
1208$code.=<<___;
1209	mov	%rsp,%rbp		# backup %rsp
1210.cfi_def_cfa_register	%rbp
1211	mov	240($arg4),%eax		# rounds
1212	mov	$arg1,$inp		# backup arguments
1213	mov	$arg2,$out
1214	mov	$arg3,$len
1215	mov	$arg4,$key
1216	cmp	\$8,$arg3
1217	jb	.Lecb_enc_short
1218
1219	mov	%eax,%ebx		# backup rounds
1220	shl	\$7,%rax		# 128 bytes per inner round key
1221	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1222	sub	%rax,%rsp
1223	mov	%rsp,%rax		# pass key schedule
1224	mov	$key,%rcx		# pass key
1225	mov	%ebx,%r10d		# pass rounds
1226	call	_bsaes_key_convert
1227	pxor	%xmm6,%xmm7		# fix up last round key
1228	movdqa	%xmm7,(%rax)		# save last round key
1229
1230	sub	\$8,$len
1231.Lecb_enc_loop:
1232	movdqu	0x00($inp), @XMM[0]	# load input
1233	movdqu	0x10($inp), @XMM[1]
1234	movdqu	0x20($inp), @XMM[2]
1235	movdqu	0x30($inp), @XMM[3]
1236	movdqu	0x40($inp), @XMM[4]
1237	movdqu	0x50($inp), @XMM[5]
1238	mov	%rsp, %rax		# pass key schedule
1239	movdqu	0x60($inp), @XMM[6]
1240	mov	%ebx,%r10d		# pass rounds
1241	movdqu	0x70($inp), @XMM[7]
1242	lea	0x80($inp), $inp
1243
1244	call	_bsaes_encrypt8
1245
1246	movdqu	@XMM[0], 0x00($out)	# write output
1247	movdqu	@XMM[1], 0x10($out)
1248	movdqu	@XMM[4], 0x20($out)
1249	movdqu	@XMM[6], 0x30($out)
1250	movdqu	@XMM[3], 0x40($out)
1251	movdqu	@XMM[7], 0x50($out)
1252	movdqu	@XMM[2], 0x60($out)
1253	movdqu	@XMM[5], 0x70($out)
1254	lea	0x80($out), $out
1255	sub	\$8,$len
1256	jnc	.Lecb_enc_loop
1257
1258	add	\$8,$len
1259	jz	.Lecb_enc_done
1260
1261	movdqu	0x00($inp), @XMM[0]	# load input
1262	mov	%rsp, %rax		# pass key schedule
1263	mov	%ebx,%r10d		# pass rounds
1264	cmp	\$2,$len
1265	jb	.Lecb_enc_one
1266	movdqu	0x10($inp), @XMM[1]
1267	je	.Lecb_enc_two
1268	movdqu	0x20($inp), @XMM[2]
1269	cmp	\$4,$len
1270	jb	.Lecb_enc_three
1271	movdqu	0x30($inp), @XMM[3]
1272	je	.Lecb_enc_four
1273	movdqu	0x40($inp), @XMM[4]
1274	cmp	\$6,$len
1275	jb	.Lecb_enc_five
1276	movdqu	0x50($inp), @XMM[5]
1277	je	.Lecb_enc_six
1278	movdqu	0x60($inp), @XMM[6]
1279	call	_bsaes_encrypt8
1280	movdqu	@XMM[0], 0x00($out)	# write output
1281	movdqu	@XMM[1], 0x10($out)
1282	movdqu	@XMM[4], 0x20($out)
1283	movdqu	@XMM[6], 0x30($out)
1284	movdqu	@XMM[3], 0x40($out)
1285	movdqu	@XMM[7], 0x50($out)
1286	movdqu	@XMM[2], 0x60($out)
1287	jmp	.Lecb_enc_done
1288.align	16
1289.Lecb_enc_six:
1290	call	_bsaes_encrypt8
1291	movdqu	@XMM[0], 0x00($out)	# write output
1292	movdqu	@XMM[1], 0x10($out)
1293	movdqu	@XMM[4], 0x20($out)
1294	movdqu	@XMM[6], 0x30($out)
1295	movdqu	@XMM[3], 0x40($out)
1296	movdqu	@XMM[7], 0x50($out)
1297	jmp	.Lecb_enc_done
1298.align	16
1299.Lecb_enc_five:
1300	call	_bsaes_encrypt8
1301	movdqu	@XMM[0], 0x00($out)	# write output
1302	movdqu	@XMM[1], 0x10($out)
1303	movdqu	@XMM[4], 0x20($out)
1304	movdqu	@XMM[6], 0x30($out)
1305	movdqu	@XMM[3], 0x40($out)
1306	jmp	.Lecb_enc_done
1307.align	16
1308.Lecb_enc_four:
1309	call	_bsaes_encrypt8
1310	movdqu	@XMM[0], 0x00($out)	# write output
1311	movdqu	@XMM[1], 0x10($out)
1312	movdqu	@XMM[4], 0x20($out)
1313	movdqu	@XMM[6], 0x30($out)
1314	jmp	.Lecb_enc_done
1315.align	16
1316.Lecb_enc_three:
1317	call	_bsaes_encrypt8
1318	movdqu	@XMM[0], 0x00($out)	# write output
1319	movdqu	@XMM[1], 0x10($out)
1320	movdqu	@XMM[4], 0x20($out)
1321	jmp	.Lecb_enc_done
1322.align	16
1323.Lecb_enc_two:
1324	call	_bsaes_encrypt8
1325	movdqu	@XMM[0], 0x00($out)	# write output
1326	movdqu	@XMM[1], 0x10($out)
1327	jmp	.Lecb_enc_done
1328.align	16
1329.Lecb_enc_one:
1330	call	_bsaes_encrypt8
1331	movdqu	@XMM[0], 0x00($out)	# write output
1332	jmp	.Lecb_enc_done
1333.align	16
1334.Lecb_enc_short:
1335	lea	($inp), $arg1
1336	lea	($out), $arg2
1337	lea	($key), $arg3
1338	call	asm_AES_encrypt
1339	lea	16($inp), $inp
1340	lea	16($out), $out
1341	dec	$len
1342	jnz	.Lecb_enc_short
1343
1344.Lecb_enc_done:
1345	lea	(%rsp),%rax
1346	pxor	%xmm0, %xmm0
1347.Lecb_enc_bzero:			# wipe key schedule [if any]
1348	movdqa	%xmm0, 0x00(%rax)
1349	movdqa	%xmm0, 0x10(%rax)
1350	lea	0x20(%rax), %rax
1351	cmp	%rax, %rbp
1352	jb	.Lecb_enc_bzero
1353
1354	lea	0x78(%rbp),%rax
1355.cfi_def_cfa	%rax,8
1356___
# Win64: restore the spilled non-volatile xmm registers.
1357$code.=<<___ if ($win64);
1358	movaps	0x40(%rbp), %xmm6
1359	movaps	0x50(%rbp), %xmm7
1360	movaps	0x60(%rbp), %xmm8
1361	movaps	0x70(%rbp), %xmm9
1362	movaps	0x80(%rbp), %xmm10
1363	movaps	0x90(%rbp), %xmm11
1364	movaps	0xa0(%rbp), %xmm12
1365	movaps	0xb0(%rbp), %xmm13
1366	movaps	0xc0(%rbp), %xmm14
1367	movaps	0xd0(%rbp), %xmm15
1368	lea	0xa0(%rax), %rax
1369.Lecb_enc_tail:
1370___
# Common epilogue: pop saved GPRs and restore %rsp.  The next heredoc also
# opens bsaes_ecb_decrypt_blocks, which mirrors the encrypt path but uses
# _bsaes_decrypt8 and its own output register permutation (0,1,6,4,2,7,3,5).
1371$code.=<<___;
1372	mov	-48(%rax), %r15
1373.cfi_restore	%r15
1374	mov	-40(%rax), %r14
1375.cfi_restore	%r14
1376	mov	-32(%rax), %r13
1377.cfi_restore	%r13
1378	mov	-24(%rax), %r12
1379.cfi_restore	%r12
1380	mov	-16(%rax), %rbx
1381.cfi_restore	%rbx
1382	mov	-8(%rax), %rbp
1383.cfi_restore	%rbp
1384	lea	(%rax), %rsp		# restore %rsp
1385.cfi_def_cfa_register	%rsp
1386.Lecb_enc_epilogue:
1387	ret
1388.cfi_endproc
1389.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1390
1391.globl	bsaes_ecb_decrypt_blocks
1392.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1393.align	16
1394bsaes_ecb_decrypt_blocks:
1395.cfi_startproc
1396	mov	%rsp, %rax
1397.Lecb_dec_prologue:
1398	push	%rbp
1399.cfi_push	%rbp
1400	push	%rbx
1401.cfi_push	%rbx
1402	push	%r12
1403.cfi_push	%r12
1404	push	%r13
1405.cfi_push	%r13
1406	push	%r14
1407.cfi_push	%r14
1408	push	%r15
1409.cfi_push	%r15
1410	lea	-0x48(%rsp),%rsp
1411.cfi_adjust_cfa_offset	0x48
1412___
1413$code.=<<___ if ($win64);
1414	lea	-0xa0(%rsp), %rsp
1415	movaps	%xmm6, 0x40(%rsp)
1416	movaps	%xmm7, 0x50(%rsp)
1417	movaps	%xmm8, 0x60(%rsp)
1418	movaps	%xmm9, 0x70(%rsp)
1419	movaps	%xmm10, 0x80(%rsp)
1420	movaps	%xmm11, 0x90(%rsp)
1421	movaps	%xmm12, 0xa0(%rsp)
1422	movaps	%xmm13, 0xb0(%rsp)
1423	movaps	%xmm14, 0xc0(%rsp)
1424	movaps	%xmm15, 0xd0(%rsp)
1425.Lecb_dec_body:
1426___
# Decrypt body: unlike encrypt, the round-0 key is fixed up in place and the
# last round key is taken from %xmm6 (decryption walks the schedule backwards).
1427$code.=<<___;
1428	mov	%rsp,%rbp		# backup %rsp
1429.cfi_def_cfa_register	%rbp
1430	mov	240($arg4),%eax		# rounds
1431	mov	$arg1,$inp		# backup arguments
1432	mov	$arg2,$out
1433	mov	$arg3,$len
1434	mov	$arg4,$key
1435	cmp	\$8,$arg3
1436	jb	.Lecb_dec_short
1437
1438	mov	%eax,%ebx		# backup rounds
1439	shl	\$7,%rax		# 128 bytes per inner round key
1440	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1441	sub	%rax,%rsp
1442	mov	%rsp,%rax		# pass key schedule
1443	mov	$key,%rcx		# pass key
1444	mov	%ebx,%r10d		# pass rounds
1445	call	_bsaes_key_convert
1446	pxor	(%rsp),%xmm7		# fix up 0 round key
1447	movdqa	%xmm6,(%rax)		# save last round key
1448	movdqa	%xmm7,(%rsp)
1449
1450	sub	\$8,$len
1451.Lecb_dec_loop:
1452	movdqu	0x00($inp), @XMM[0]	# load input
1453	movdqu	0x10($inp), @XMM[1]
1454	movdqu	0x20($inp), @XMM[2]
1455	movdqu	0x30($inp), @XMM[3]
1456	movdqu	0x40($inp), @XMM[4]
1457	movdqu	0x50($inp), @XMM[5]
1458	mov	%rsp, %rax		# pass key schedule
1459	movdqu	0x60($inp), @XMM[6]
1460	mov	%ebx,%r10d		# pass rounds
1461	movdqu	0x70($inp), @XMM[7]
1462	lea	0x80($inp), $inp
1463
1464	call	_bsaes_decrypt8
1465
1466	movdqu	@XMM[0], 0x00($out)	# write output
1467	movdqu	@XMM[1], 0x10($out)
1468	movdqu	@XMM[6], 0x20($out)
1469	movdqu	@XMM[4], 0x30($out)
1470	movdqu	@XMM[2], 0x40($out)
1471	movdqu	@XMM[7], 0x50($out)
1472	movdqu	@XMM[3], 0x60($out)
1473	movdqu	@XMM[5], 0x70($out)
1474	lea	0x80($out), $out
1475	sub	\$8,$len
1476	jnc	.Lecb_dec_loop
1477
1478	add	\$8,$len
1479	jz	.Lecb_dec_done
1480
1481	movdqu	0x00($inp), @XMM[0]	# load input
1482	mov	%rsp, %rax		# pass key schedule
1483	mov	%ebx,%r10d		# pass rounds
1484	cmp	\$2,$len
1485	jb	.Lecb_dec_one
1486	movdqu	0x10($inp), @XMM[1]
1487	je	.Lecb_dec_two
1488	movdqu	0x20($inp), @XMM[2]
1489	cmp	\$4,$len
1490	jb	.Lecb_dec_three
1491	movdqu	0x30($inp), @XMM[3]
1492	je	.Lecb_dec_four
1493	movdqu	0x40($inp), @XMM[4]
1494	cmp	\$6,$len
1495	jb	.Lecb_dec_five
1496	movdqu	0x50($inp), @XMM[5]
1497	je	.Lecb_dec_six
1498	movdqu	0x60($inp), @XMM[6]
1499	call	_bsaes_decrypt8
1500	movdqu	@XMM[0], 0x00($out)	# write output
1501	movdqu	@XMM[1], 0x10($out)
1502	movdqu	@XMM[6], 0x20($out)
1503	movdqu	@XMM[4], 0x30($out)
1504	movdqu	@XMM[2], 0x40($out)
1505	movdqu	@XMM[7], 0x50($out)
1506	movdqu	@XMM[3], 0x60($out)
1507	jmp	.Lecb_dec_done
1508.align	16
1509.Lecb_dec_six:
1510	call	_bsaes_decrypt8
1511	movdqu	@XMM[0], 0x00($out)	# write output
1512	movdqu	@XMM[1], 0x10($out)
1513	movdqu	@XMM[6], 0x20($out)
1514	movdqu	@XMM[4], 0x30($out)
1515	movdqu	@XMM[2], 0x40($out)
1516	movdqu	@XMM[7], 0x50($out)
1517	jmp	.Lecb_dec_done
1518.align	16
1519.Lecb_dec_five:
1520	call	_bsaes_decrypt8
1521	movdqu	@XMM[0], 0x00($out)	# write output
1522	movdqu	@XMM[1], 0x10($out)
1523	movdqu	@XMM[6], 0x20($out)
1524	movdqu	@XMM[4], 0x30($out)
1525	movdqu	@XMM[2], 0x40($out)
1526	jmp	.Lecb_dec_done
1527.align	16
1528.Lecb_dec_four:
1529	call	_bsaes_decrypt8
1530	movdqu	@XMM[0], 0x00($out)	# write output
1531	movdqu	@XMM[1], 0x10($out)
1532	movdqu	@XMM[6], 0x20($out)
1533	movdqu	@XMM[4], 0x30($out)
1534	jmp	.Lecb_dec_done
1535.align	16
1536.Lecb_dec_three:
1537	call	_bsaes_decrypt8
1538	movdqu	@XMM[0], 0x00($out)	# write output
1539	movdqu	@XMM[1], 0x10($out)
1540	movdqu	@XMM[6], 0x20($out)
1541	jmp	.Lecb_dec_done
1542.align	16
1543.Lecb_dec_two:
1544	call	_bsaes_decrypt8
1545	movdqu	@XMM[0], 0x00($out)	# write output
1546	movdqu	@XMM[1], 0x10($out)
1547	jmp	.Lecb_dec_done
1548.align	16
1549.Lecb_dec_one:
1550	call	_bsaes_decrypt8
1551	movdqu	@XMM[0], 0x00($out)	# write output
1552	jmp	.Lecb_dec_done
1553.align	16
1554.Lecb_dec_short:
1555	lea	($inp), $arg1
1556	lea	($out), $arg2
1557	lea	($key), $arg3
1558	call	asm_AES_decrypt
1559	lea	16($inp), $inp
1560	lea	16($out), $out
1561	dec	$len
1562	jnz	.Lecb_dec_short
1563
1564.Lecb_dec_done:
1565	lea	(%rsp),%rax
1566	pxor	%xmm0, %xmm0
1567.Lecb_dec_bzero:			# wipe key schedule [if any]
1568	movdqa	%xmm0, 0x00(%rax)
1569	movdqa	%xmm0, 0x10(%rax)
1570	lea	0x20(%rax), %rax
1571	cmp	%rax, %rbp
1572	jb	.Lecb_dec_bzero
1573
1574	lea	0x78(%rbp),%rax
1575.cfi_def_cfa	%rax,8
1576___
1577$code.=<<___ if ($win64);
1578	movaps	0x40(%rbp), %xmm6
1579	movaps	0x50(%rbp), %xmm7
1580	movaps	0x60(%rbp), %xmm8
1581	movaps	0x70(%rbp), %xmm9
1582	movaps	0x80(%rbp), %xmm10
1583	movaps	0x90(%rbp), %xmm11
1584	movaps	0xa0(%rbp), %xmm12
1585	movaps	0xb0(%rbp), %xmm13
1586	movaps	0xc0(%rbp), %xmm14
1587	movaps	0xd0(%rbp), %xmm15
1588	lea	0xa0(%rax), %rax
1589.Lecb_dec_tail:
1590___
1591$code.=<<___;
1592	mov	-48(%rax), %r15
1593.cfi_restore	%r15
1594	mov	-40(%rax), %r14
1595.cfi_restore	%r14
1596	mov	-32(%rax), %r13
1597.cfi_restore	%r13
1598	mov	-24(%rax), %r12
1599.cfi_restore	%r12
1600	mov	-16(%rax), %rbx
1601.cfi_restore	%rbx
1602	mov	-8(%rax), %rbp
1603.cfi_restore	%rbp
1604	lea	(%rax), %rsp		# restore %rsp
1605.cfi_def_cfa_register	%rsp
1606.Lecb_dec_epilogue:
1607	ret
1608.cfi_endproc
1609.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1610___
1611}
# ossl_bsaes_cbc_encrypt: encryption ($arg6 != 0) and buffers shorter than
# 128 bytes are delegated wholesale to asm_AES_cbc_encrypt; only bulk CBC
# *decryption* is bit-sliced here.
1612$code.=<<___;
1613.extern	asm_AES_cbc_encrypt
1614.globl	ossl_bsaes_cbc_encrypt
1615.type	ossl_bsaes_cbc_encrypt,\@abi-omnipotent
1616.align	16
1617ossl_bsaes_cbc_encrypt:
1618.cfi_startproc
1619	endbranch
1620___
# Win64 passes the 6th argument on the stack, not in a register.
1621$code.=<<___ if ($win64);
1622	mov	48(%rsp),$arg6		# pull direction flag
1623___
1624$code.=<<___;
1625	cmp	\$0,$arg6
1626	jne	asm_AES_cbc_encrypt
1627	cmp	\$128,$arg3
1628	jb	asm_AES_cbc_encrypt
1629
1630	mov	%rsp, %rax
1631.Lcbc_dec_prologue:
1632	push	%rbp
1633.cfi_push	%rbp
1634	push	%rbx
1635.cfi_push	%rbx
1636	push	%r12
1637.cfi_push	%r12
1638	push	%r13
1639.cfi_push	%r13
1640	push	%r14
1641.cfi_push	%r14
1642	push	%r15
1643.cfi_push	%r15
1644	lea	-0x48(%rsp), %rsp
1645.cfi_adjust_cfa_offset	0x48
1646___
1647$code.=<<___ if ($win64);
1648	mov	0xa0(%rsp),$arg5	# pull ivp
1649	lea	-0xa0(%rsp), %rsp
1650	movaps	%xmm6, 0x40(%rsp)
1651	movaps	%xmm7, 0x50(%rsp)
1652	movaps	%xmm8, 0x60(%rsp)
1653	movaps	%xmm9, 0x70(%rsp)
1654	movaps	%xmm10, 0x80(%rsp)
1655	movaps	%xmm11, 0x90(%rsp)
1656	movaps	%xmm12, 0xa0(%rsp)
1657	movaps	%xmm13, 0xb0(%rsp)
1658	movaps	%xmm14, 0xc0(%rsp)
1659	movaps	%xmm15, 0xd0(%rsp)
1660.Lcbc_dec_body:
1661___
# CBC-decrypt body: decrypt 8 blocks at a time, then XOR each plaintext with
# the previous ciphertext block (re-loaded from $inp); the last ciphertext
# block of the batch becomes the next IV in @XMM[15].  Tail of 1..7 blocks
# is handled by the .Lcbc_dec_one..six ladder below.
1662$code.=<<___;
1663	mov	%rsp, %rbp		# backup %rsp
1664.cfi_def_cfa_register	%rbp
1665	mov	240($arg4), %eax	# rounds
1666	mov	$arg1, $inp		# backup arguments
1667	mov	$arg2, $out
1668	mov	$arg3, $len
1669	mov	$arg4, $key
1670	mov	$arg5, %rbx
1671	shr	\$4, $len		# bytes to blocks
1672
1673	mov	%eax, %edx		# rounds
1674	shl	\$7, %rax		# 128 bytes per inner round key
1675	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1676	sub	%rax, %rsp
1677
1678	mov	%rsp, %rax		# pass key schedule
1679	mov	$key, %rcx		# pass key
1680	mov	%edx, %r10d		# pass rounds
1681	call	_bsaes_key_convert
1682	pxor	(%rsp),%xmm7		# fix up 0 round key
1683	movdqa	%xmm6,(%rax)		# save last round key
1684	movdqa	%xmm7,(%rsp)
1685
1686	movdqu	(%rbx), @XMM[15]	# load IV
1687	sub	\$8,$len
1688.Lcbc_dec_loop:
1689	movdqu	0x00($inp), @XMM[0]	# load input
1690	movdqu	0x10($inp), @XMM[1]
1691	movdqu	0x20($inp), @XMM[2]
1692	movdqu	0x30($inp), @XMM[3]
1693	movdqu	0x40($inp), @XMM[4]
1694	movdqu	0x50($inp), @XMM[5]
1695	mov	%rsp, %rax		# pass key schedule
1696	movdqu	0x60($inp), @XMM[6]
1697	mov	%edx,%r10d		# pass rounds
1698	movdqu	0x70($inp), @XMM[7]
1699	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1700
1701	call	_bsaes_decrypt8
1702
1703	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1704	movdqu	0x00($inp), @XMM[8]	# re-load input
1705	movdqu	0x10($inp), @XMM[9]
1706	pxor	@XMM[8], @XMM[1]
1707	movdqu	0x20($inp), @XMM[10]
1708	pxor	@XMM[9], @XMM[6]
1709	movdqu	0x30($inp), @XMM[11]
1710	pxor	@XMM[10], @XMM[4]
1711	movdqu	0x40($inp), @XMM[12]
1712	pxor	@XMM[11], @XMM[2]
1713	movdqu	0x50($inp), @XMM[13]
1714	pxor	@XMM[12], @XMM[7]
1715	movdqu	0x60($inp), @XMM[14]
1716	pxor	@XMM[13], @XMM[3]
1717	movdqu	0x70($inp), @XMM[15]	# IV
1718	pxor	@XMM[14], @XMM[5]
1719	movdqu	@XMM[0], 0x00($out)	# write output
1720	lea	0x80($inp), $inp
1721	movdqu	@XMM[1], 0x10($out)
1722	movdqu	@XMM[6], 0x20($out)
1723	movdqu	@XMM[4], 0x30($out)
1724	movdqu	@XMM[2], 0x40($out)
1725	movdqu	@XMM[7], 0x50($out)
1726	movdqu	@XMM[3], 0x60($out)
1727	movdqu	@XMM[5], 0x70($out)
1728	lea	0x80($out), $out
1729	sub	\$8,$len
1730	jnc	.Lcbc_dec_loop
1731
1732	add	\$8,$len
1733	jz	.Lcbc_dec_done
1734
1735	movdqu	0x00($inp), @XMM[0]	# load input
1736	mov	%rsp, %rax		# pass key schedule
1737	mov	%edx, %r10d		# pass rounds
1738	cmp	\$2,$len
1739	jb	.Lcbc_dec_one
1740	movdqu	0x10($inp), @XMM[1]
1741	je	.Lcbc_dec_two
1742	movdqu	0x20($inp), @XMM[2]
1743	cmp	\$4,$len
1744	jb	.Lcbc_dec_three
1745	movdqu	0x30($inp), @XMM[3]
1746	je	.Lcbc_dec_four
1747	movdqu	0x40($inp), @XMM[4]
1748	cmp	\$6,$len
1749	jb	.Lcbc_dec_five
1750	movdqu	0x50($inp), @XMM[5]
1751	je	.Lcbc_dec_six
1752	movdqu	0x60($inp), @XMM[6]
1753	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1754	call	_bsaes_decrypt8
1755	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1756	movdqu	0x00($inp), @XMM[8]	# re-load input
1757	movdqu	0x10($inp), @XMM[9]
1758	pxor	@XMM[8], @XMM[1]
1759	movdqu	0x20($inp), @XMM[10]
1760	pxor	@XMM[9], @XMM[6]
1761	movdqu	0x30($inp), @XMM[11]
1762	pxor	@XMM[10], @XMM[4]
1763	movdqu	0x40($inp), @XMM[12]
1764	pxor	@XMM[11], @XMM[2]
1765	movdqu	0x50($inp), @XMM[13]
1766	pxor	@XMM[12], @XMM[7]
1767	movdqu	0x60($inp), @XMM[15]	# IV
1768	pxor	@XMM[13], @XMM[3]
1769	movdqu	@XMM[0], 0x00($out)	# write output
1770	movdqu	@XMM[1], 0x10($out)
1771	movdqu	@XMM[6], 0x20($out)
1772	movdqu	@XMM[4], 0x30($out)
1773	movdqu	@XMM[2], 0x40($out)
1774	movdqu	@XMM[7], 0x50($out)
1775	movdqu	@XMM[3], 0x60($out)
1776	jmp	.Lcbc_dec_done
1777.align	16
1778.Lcbc_dec_six:
1779	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1780	call	_bsaes_decrypt8
1781	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1782	movdqu	0x00($inp), @XMM[8]	# re-load input
1783	movdqu	0x10($inp), @XMM[9]
1784	pxor	@XMM[8], @XMM[1]
1785	movdqu	0x20($inp), @XMM[10]
1786	pxor	@XMM[9], @XMM[6]
1787	movdqu	0x30($inp), @XMM[11]
1788	pxor	@XMM[10], @XMM[4]
1789	movdqu	0x40($inp), @XMM[12]
1790	pxor	@XMM[11], @XMM[2]
1791	movdqu	0x50($inp), @XMM[15]	# IV
1792	pxor	@XMM[12], @XMM[7]
1793	movdqu	@XMM[0], 0x00($out)	# write output
1794	movdqu	@XMM[1], 0x10($out)
1795	movdqu	@XMM[6], 0x20($out)
1796	movdqu	@XMM[4], 0x30($out)
1797	movdqu	@XMM[2], 0x40($out)
1798	movdqu	@XMM[7], 0x50($out)
1799	jmp	.Lcbc_dec_done
1800.align	16
1801.Lcbc_dec_five:
1802	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1803	call	_bsaes_decrypt8
1804	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1805	movdqu	0x00($inp), @XMM[8]	# re-load input
1806	movdqu	0x10($inp), @XMM[9]
1807	pxor	@XMM[8], @XMM[1]
1808	movdqu	0x20($inp), @XMM[10]
1809	pxor	@XMM[9], @XMM[6]
1810	movdqu	0x30($inp), @XMM[11]
1811	pxor	@XMM[10], @XMM[4]
1812	movdqu	0x40($inp), @XMM[15]	# IV
1813	pxor	@XMM[11], @XMM[2]
1814	movdqu	@XMM[0], 0x00($out)	# write output
1815	movdqu	@XMM[1], 0x10($out)
1816	movdqu	@XMM[6], 0x20($out)
1817	movdqu	@XMM[4], 0x30($out)
1818	movdqu	@XMM[2], 0x40($out)
1819	jmp	.Lcbc_dec_done
1820.align	16
1821.Lcbc_dec_four:
1822	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1823	call	_bsaes_decrypt8
1824	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1825	movdqu	0x00($inp), @XMM[8]	# re-load input
1826	movdqu	0x10($inp), @XMM[9]
1827	pxor	@XMM[8], @XMM[1]
1828	movdqu	0x20($inp), @XMM[10]
1829	pxor	@XMM[9], @XMM[6]
1830	movdqu	0x30($inp), @XMM[15]	# IV
1831	pxor	@XMM[10], @XMM[4]
1832	movdqu	@XMM[0], 0x00($out)	# write output
1833	movdqu	@XMM[1], 0x10($out)
1834	movdqu	@XMM[6], 0x20($out)
1835	movdqu	@XMM[4], 0x30($out)
1836	jmp	.Lcbc_dec_done
1837.align	16
1838.Lcbc_dec_three:
1839	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1840	call	_bsaes_decrypt8
1841	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1842	movdqu	0x00($inp), @XMM[8]	# re-load input
1843	movdqu	0x10($inp), @XMM[9]
1844	pxor	@XMM[8], @XMM[1]
1845	movdqu	0x20($inp), @XMM[15]	# IV
1846	pxor	@XMM[9], @XMM[6]
1847	movdqu	@XMM[0], 0x00($out)	# write output
1848	movdqu	@XMM[1], 0x10($out)
1849	movdqu	@XMM[6], 0x20($out)
1850	jmp	.Lcbc_dec_done
1851.align	16
1852.Lcbc_dec_two:
1853	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1854	call	_bsaes_decrypt8
1855	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1856	movdqu	0x00($inp), @XMM[8]	# re-load input
1857	movdqu	0x10($inp), @XMM[15]	# IV
1858	pxor	@XMM[8], @XMM[1]
1859	movdqu	@XMM[0], 0x00($out)	# write output
1860	movdqu	@XMM[1], 0x10($out)
1861	jmp	.Lcbc_dec_done
1862.align	16
1863.Lcbc_dec_one:
1864	lea	($inp), $arg1
1865	lea	0x20(%rbp), $arg2	# buffer output
1866	lea	($key), $arg3
1867	call	asm_AES_decrypt		# doesn't touch %xmm
1868	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1869	movdqu	@XMM[15], ($out)	# write output
1870	movdqa	@XMM[0], @XMM[15]	# IV
1871
1872.Lcbc_dec_done:
1873	movdqu	@XMM[15], (%rbx)	# return IV
1874	lea	(%rsp), %rax
1875	pxor	%xmm0, %xmm0
1876.Lcbc_dec_bzero:			# wipe key schedule [if any]
1877	movdqa	%xmm0, 0x00(%rax)
1878	movdqa	%xmm0, 0x10(%rax)
1879	lea	0x20(%rax), %rax
1880	cmp	%rax, %rbp
1881	ja	.Lcbc_dec_bzero
1882
1883	lea	0x78(%rbp),%rax
1884.cfi_def_cfa	%rax,8
1885___
# CBC epilogue (Win64 xmm restore + common GPR restore).  Note: the second
# heredoc also opens ossl_bsaes_ctr32_encrypt_blocks, so this span must be
# split at heredoc boundaries rather than at function boundaries.
1886$code.=<<___ if ($win64);
1887	movaps	0x40(%rbp), %xmm6
1888	movaps	0x50(%rbp), %xmm7
1889	movaps	0x60(%rbp), %xmm8
1890	movaps	0x70(%rbp), %xmm9
1891	movaps	0x80(%rbp), %xmm10
1892	movaps	0x90(%rbp), %xmm11
1893	movaps	0xa0(%rbp), %xmm12
1894	movaps	0xb0(%rbp), %xmm13
1895	movaps	0xc0(%rbp), %xmm14
1896	movaps	0xd0(%rbp), %xmm15
1897	lea	0xa0(%rax), %rax
1898.Lcbc_dec_tail:
1899___
1900$code.=<<___;
1901	mov	-48(%rax), %r15
1902.cfi_restore	%r15
1903	mov	-40(%rax), %r14
1904.cfi_restore	%r14
1905	mov	-32(%rax), %r13
1906.cfi_restore	%r13
1907	mov	-24(%rax), %r12
1908.cfi_restore	%r12
1909	mov	-16(%rax), %rbx
1910.cfi_restore	%rbx
1911	mov	-8(%rax), %rbp
1912.cfi_restore	%rbp
1913	lea	(%rax), %rsp		# restore %rsp
1914.cfi_def_cfa_register	%rsp
1915.Lcbc_dec_epilogue:
1916	ret
1917.cfi_endproc
1918.size	ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1919
1920.globl	ossl_bsaes_ctr32_encrypt_blocks
1921.type	ossl_bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1922.align	16
1923ossl_bsaes_ctr32_encrypt_blocks:
1924.cfi_startproc
1925	endbranch
1926	mov	%rsp, %rax
1927.Lctr_enc_prologue:
1928	push	%rbp
1929.cfi_push	%rbp
1930	push	%rbx
1931.cfi_push	%rbx
1932	push	%r12
1933.cfi_push	%r12
1934	push	%r13
1935.cfi_push	%r13
1936	push	%r14
1937.cfi_push	%r14
1938	push	%r15
1939.cfi_push	%r15
1940	lea	-0x48(%rsp), %rsp
1941.cfi_adjust_cfa_offset	0x48
1942___
# CTR32 body: keeps the 32-bit big-endian counter byte-swapped so it can be
# incremented with plain paddd (.LADD1..8), and folds the round-0 AddRoundKey
# plus the .LSWPUPM0SR shuffle into the loop before _bsaes_encrypt8_bitslice.
1943$code.=<<___ if ($win64);
1944	mov	0xa0(%rsp),$arg5	# pull ivp
1945	lea	-0xa0(%rsp), %rsp
1946	movaps	%xmm6, 0x40(%rsp)
1947	movaps	%xmm7, 0x50(%rsp)
1948	movaps	%xmm8, 0x60(%rsp)
1949	movaps	%xmm9, 0x70(%rsp)
1950	movaps	%xmm10, 0x80(%rsp)
1951	movaps	%xmm11, 0x90(%rsp)
1952	movaps	%xmm12, 0xa0(%rsp)
1953	movaps	%xmm13, 0xb0(%rsp)
1954	movaps	%xmm14, 0xc0(%rsp)
1955	movaps	%xmm15, 0xd0(%rsp)
1956.Lctr_enc_body:
1957___
1958$code.=<<___;
1959	mov	%rsp, %rbp		# backup %rsp
1960.cfi_def_cfa_register	%rbp
1961	movdqu	($arg5), %xmm0		# load counter
1962	mov	240($arg4), %eax	# rounds
1963	mov	$arg1, $inp		# backup arguments
1964	mov	$arg2, $out
1965	mov	$arg3, $len
1966	mov	$arg4, $key
1967	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1968	cmp	\$8, $arg3
1969	jb	.Lctr_enc_short
1970
1971	mov	%eax, %ebx		# rounds
1972	shl	\$7, %rax		# 128 bytes per inner round key
1973	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1974	sub	%rax, %rsp
1975
1976	mov	%rsp, %rax		# pass key schedule
1977	mov	$key, %rcx		# pass key
1978	mov	%ebx, %r10d		# pass rounds
1979	call	_bsaes_key_convert
1980	pxor	%xmm6,%xmm7		# fix up last round key
1981	movdqa	%xmm7,(%rax)		# save last round key
1982
1983	movdqa	(%rsp), @XMM[9]		# load round0 key
1984	lea	.LADD1(%rip), %r11
1985	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1986	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1987	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1988	pshufb	@XMM[8], @XMM[0]
1989	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1990	jmp	.Lctr_enc_loop
1991.align	16
1992.Lctr_enc_loop:
1993	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1994	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1995	movdqa	@XMM[0], @XMM[2]
1996	paddd	0x00(%r11), @XMM[1]	# .LADD1
1997	movdqa	@XMM[0], @XMM[3]
1998	paddd	0x10(%r11), @XMM[2]	# .LADD2
1999	movdqa	@XMM[0], @XMM[4]
2000	paddd	0x20(%r11), @XMM[3]	# .LADD3
2001	movdqa	@XMM[0], @XMM[5]
2002	paddd	0x30(%r11), @XMM[4]	# .LADD4
2003	movdqa	@XMM[0], @XMM[6]
2004	paddd	0x40(%r11), @XMM[5]	# .LADD5
2005	movdqa	@XMM[0], @XMM[7]
2006	paddd	0x50(%r11), @XMM[6]	# .LADD6
2007	paddd	0x60(%r11), @XMM[7]	# .LADD7
2008
2009	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
2010	# to flip byte order in 32-bit counter
2011	movdqa	(%rsp), @XMM[9]		# round 0 key
2012	lea	0x10(%rsp), %rax	# pass key schedule
2013	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
2014	pxor	@XMM[9], @XMM[0]	# xor with round0 key
2015	pxor	@XMM[9], @XMM[1]
2016	pxor	@XMM[9], @XMM[2]
2017	pxor	@XMM[9], @XMM[3]
2018	 pshufb	@XMM[8], @XMM[0]
2019	 pshufb	@XMM[8], @XMM[1]
2020	pxor	@XMM[9], @XMM[4]
2021	pxor	@XMM[9], @XMM[5]
2022	 pshufb	@XMM[8], @XMM[2]
2023	 pshufb	@XMM[8], @XMM[3]
2024	pxor	@XMM[9], @XMM[6]
2025	pxor	@XMM[9], @XMM[7]
2026	 pshufb	@XMM[8], @XMM[4]
2027	 pshufb	@XMM[8], @XMM[5]
2028	 pshufb	@XMM[8], @XMM[6]
2029	 pshufb	@XMM[8], @XMM[7]
2030	lea	.LBS0(%rip), %r11	# constants table
2031	mov	%ebx,%r10d		# pass rounds
2032
2033	call	_bsaes_encrypt8_bitslice
2034
2035	sub	\$8,$len
2036	jc	.Lctr_enc_loop_done
2037
2038	movdqu	0x00($inp), @XMM[8]	# load input
2039	movdqu	0x10($inp), @XMM[9]
2040	movdqu	0x20($inp), @XMM[10]
2041	movdqu	0x30($inp), @XMM[11]
2042	movdqu	0x40($inp), @XMM[12]
2043	movdqu	0x50($inp), @XMM[13]
2044	movdqu	0x60($inp), @XMM[14]
2045	movdqu	0x70($inp), @XMM[15]
2046	lea	0x80($inp),$inp
2047	pxor	@XMM[0], @XMM[8]
2048	movdqa	0x20(%rbp), @XMM[0]	# load counter
2049	pxor	@XMM[9], @XMM[1]
2050	movdqu	@XMM[8], 0x00($out)	# write output
2051	pxor	@XMM[10], @XMM[4]
2052	movdqu	@XMM[1], 0x10($out)
2053	pxor	@XMM[11], @XMM[6]
2054	movdqu	@XMM[4], 0x20($out)
2055	pxor	@XMM[12], @XMM[3]
2056	movdqu	@XMM[6], 0x30($out)
2057	pxor	@XMM[13], @XMM[7]
2058	movdqu	@XMM[3], 0x40($out)
2059	pxor	@XMM[14], @XMM[2]
2060	movdqu	@XMM[7], 0x50($out)
2061	pxor	@XMM[15], @XMM[5]
2062	movdqu	@XMM[2], 0x60($out)
2063	lea	.LADD1(%rip), %r11
2064	movdqu	@XMM[5], 0x70($out)
2065	lea	0x80($out), $out
2066	paddd	0x70(%r11), @XMM[0]	# .LADD8
2067	jnz	.Lctr_enc_loop
2068
2069	jmp	.Lctr_enc_done
2070.align	16
2071.Lctr_enc_loop_done:
2072	add	\$8, $len
2073	movdqu	0x00($inp), @XMM[8]	# load input
2074	pxor	@XMM[8], @XMM[0]
2075	movdqu	@XMM[0], 0x00($out)	# write output
2076	cmp	\$2,$len
2077	jb	.Lctr_enc_done
2078	movdqu	0x10($inp), @XMM[9]
2079	pxor	@XMM[9], @XMM[1]
2080	movdqu	@XMM[1], 0x10($out)
2081	je	.Lctr_enc_done
2082	movdqu	0x20($inp), @XMM[10]
2083	pxor	@XMM[10], @XMM[4]
2084	movdqu	@XMM[4], 0x20($out)
2085	cmp	\$4,$len
2086	jb	.Lctr_enc_done
2087	movdqu	0x30($inp), @XMM[11]
2088	pxor	@XMM[11], @XMM[6]
2089	movdqu	@XMM[6], 0x30($out)
2090	je	.Lctr_enc_done
2091	movdqu	0x40($inp), @XMM[12]
2092	pxor	@XMM[12], @XMM[3]
2093	movdqu	@XMM[3], 0x40($out)
2094	cmp	\$6,$len
2095	jb	.Lctr_enc_done
2096	movdqu	0x50($inp), @XMM[13]
2097	pxor	@XMM[13], @XMM[7]
2098	movdqu	@XMM[7], 0x50($out)
2099	je	.Lctr_enc_done
2100	movdqu	0x60($inp), @XMM[14]
2101	pxor	@XMM[14], @XMM[2]
2102	movdqu	@XMM[2], 0x60($out)
2103	jmp	.Lctr_enc_done
2104
2105.align	16
2106.Lctr_enc_short:
2107	lea	0x20(%rbp), $arg1
2108	lea	0x30(%rbp), $arg2
2109	lea	($key), $arg3
2110	call	asm_AES_encrypt
2111	movdqu	($inp), @XMM[1]
2112	lea	16($inp), $inp
2113	mov	0x2c(%rbp), %eax	# load 32-bit counter
2114	bswap	%eax
2115	pxor	0x30(%rbp), @XMM[1]
2116	inc	%eax			# increment
2117	movdqu	@XMM[1], ($out)
2118	bswap	%eax
2119	lea	16($out), $out
2120	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
2121	dec	$len
2122	jnz	.Lctr_enc_short
2123
2124.Lctr_enc_done:
2125	lea	(%rsp), %rax
2126	pxor	%xmm0, %xmm0
2127.Lctr_enc_bzero:			# wipe key schedule [if any]
2128	movdqa	%xmm0, 0x00(%rax)
2129	movdqa	%xmm0, 0x10(%rax)
2130	lea	0x20(%rax), %rax
2131	cmp	%rax, %rbp
2132	jb	.Lctr_enc_bzero
2133
2134	lea	0x78(%rbp),%rax
2135.cfi_def_cfa	%rax,8
2136___
# NOTE(review): .Lctr_enc_short reads the counter at 0x2c(%rbp) but writes it
# back to 0x2c(%rsp).  On that path the key-schedule allocation (sub %rax,%rsp)
# was skipped, so %rsp still equals %rbp and the two addresses alias — looks
# intentional but worth confirming against upstream.
# CTR epilogue: Win64 xmm restore, then the common GPR restore / return.
2137$code.=<<___ if ($win64);
2138	movaps	0x40(%rbp), %xmm6
2139	movaps	0x50(%rbp), %xmm7
2140	movaps	0x60(%rbp), %xmm8
2141	movaps	0x70(%rbp), %xmm9
2142	movaps	0x80(%rbp), %xmm10
2143	movaps	0x90(%rbp), %xmm11
2144	movaps	0xa0(%rbp), %xmm12
2145	movaps	0xb0(%rbp), %xmm13
2146	movaps	0xc0(%rbp), %xmm14
2147	movaps	0xd0(%rbp), %xmm15
2148	lea	0xa0(%rax), %rax
2149.Lctr_enc_tail:
2150___
2151$code.=<<___;
2152	mov	-48(%rax), %r15
2153.cfi_restore	%r15
2154	mov	-40(%rax), %r14
2155.cfi_restore	%r14
2156	mov	-32(%rax), %r13
2157.cfi_restore	%r13
2158	mov	-24(%rax), %r12
2159.cfi_restore	%r12
2160	mov	-16(%rax), %rbx
2161.cfi_restore	%rbx
2162	mov	-8(%rax), %rbp
2163.cfi_restore	%rbp
2164	lea	(%rax), %rsp		# restore %rsp
2165.cfi_def_cfa_register	%rsp
2166.Lctr_enc_epilogue:
2167	ret
2168.cfi_endproc
2169.size	ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
2170___
2171######################################################################
2172# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2173#	const AES_KEY *key1, const AES_KEY *key2,
2174#	const unsigned char iv[16]);
2175#
# Registers used for the XTS tweak computation (aliases of xmm13..15).
2176my ($twmask,$twres,$twtmp)=@XMM[13..15];
# XTS takes a 6th *pointer* argument; strip the trailing "d" so $arg6 names
# the full 64-bit register (%r11/%r9) instead of its 32-bit alias.
2177$arg6=~s/d$//;
2178
# ossl_bsaes_xts_encrypt: encrypt the IV with key2 (asm_AES_encrypt) to get
# the initial tweak, build the bit-sliced key1 schedule, then reserve 0x80
# bytes on the stack for the eight per-block tweak values.
2179$code.=<<___;
2180.globl	ossl_bsaes_xts_encrypt
2181.type	ossl_bsaes_xts_encrypt,\@abi-omnipotent
2182.align	16
2183ossl_bsaes_xts_encrypt:
2184.cfi_startproc
2185	endbranch
2186	mov	%rsp, %rax
2187.Lxts_enc_prologue:
2188	push	%rbp
2189.cfi_push	%rbp
2190	push	%rbx
2191.cfi_push	%rbx
2192	push	%r12
2193.cfi_push	%r12
2194	push	%r13
2195.cfi_push	%r13
2196	push	%r14
2197.cfi_push	%r14
2198	push	%r15
2199.cfi_push	%r15
2200	lea	-0x48(%rsp), %rsp
2201.cfi_adjust_cfa_offset	0x48
2202___
2203$code.=<<___ if ($win64);
2204	mov	0xa0(%rsp),$arg5	# pull key2
2205	mov	0xa8(%rsp),$arg6	# pull ivp
2206	lea	-0xa0(%rsp), %rsp
2207	movaps	%xmm6, 0x40(%rsp)
2208	movaps	%xmm7, 0x50(%rsp)
2209	movaps	%xmm8, 0x60(%rsp)
2210	movaps	%xmm9, 0x70(%rsp)
2211	movaps	%xmm10, 0x80(%rsp)
2212	movaps	%xmm11, 0x90(%rsp)
2213	movaps	%xmm12, 0xa0(%rsp)
2214	movaps	%xmm13, 0xb0(%rsp)
2215	movaps	%xmm14, 0xc0(%rsp)
2216	movaps	%xmm15, 0xd0(%rsp)
2217.Lxts_enc_body:
2218___
2219$code.=<<___;
2220	mov	%rsp, %rbp		# backup %rsp
2221.cfi_def_cfa_register	%rbp
2222	mov	$arg1, $inp		# backup arguments
2223	mov	$arg2, $out
2224	mov	$arg3, $len
2225	mov	$arg4, $key
2226
2227	lea	($arg6), $arg1
2228	lea	0x20(%rbp), $arg2
2229	lea	($arg5), $arg3
2230	call	asm_AES_encrypt		# generate initial tweak
2231
2232	mov	240($key), %eax		# rounds
2233	mov	$len, %rbx		# backup $len
2234
2235	mov	%eax, %edx		# rounds
2236	shl	\$7, %rax		# 128 bytes per inner round key
2237	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2238	sub	%rax, %rsp
2239
2240	mov	%rsp, %rax		# pass key schedule
2241	mov	$key, %rcx		# pass key
2242	mov	%edx, %r10d		# pass rounds
2243	call	_bsaes_key_convert
2244	pxor	%xmm6, %xmm7		# fix up last round key
2245	movdqa	%xmm7, (%rax)		# save last round key
2246
2247	and	\$-16, $len
2248	sub	\$0x80, %rsp		# place for tweak[8]
2249	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2250
2251	pxor	$twtmp, $twtmp
2252	movdqa	.Lxts_magic(%rip), $twmask
2253	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2254
2255	sub	\$0x80, $len
2256	jc	.Lxts_enc_short
2257	jmp	.Lxts_enc_loop
2258
2259.align	16
2260.Lxts_enc_loop:
2261___
# Generator loop: unrolls tweak[0..6] computation for the full 8-block path.
# Each iteration saves the current tweak, advances it via the GF(2^128)
# doubling (paddq + .Lxts_magic carry fold), and interleaves the load/XOR of
# the previous iteration's input block.
2262    for ($i=0;$i<7;$i++) {
2263    $code.=<<___;
2264	pshufd	\$0x13, $twtmp, $twres
2265	pxor	$twtmp, $twtmp
2266	movdqa	@XMM[7], @XMM[$i]
2267	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2268	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2269	pand	$twmask, $twres		# isolate carry and residue
2270	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2271	pxor	$twres, @XMM[7]
2272___
# From iteration 1 on: load input block $i-1 (one behind the tweak).
2273    $code.=<<___ if ($i>=1);
2274	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2275___
# From iteration 2 on: XOR input block $i-2 with its saved tweak.
2276    $code.=<<___ if ($i>=2);
2277	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2278___
2279    }
# Finish the 8-block batch: load/XOR the last two blocks, encrypt, then XOR
# each result with its saved tweak and store.  Output register order
# (0,1,4,6,3,7,2,5) matches _bsaes_encrypt8.  Falls through to the short
# path (1..7 remaining blocks) when the counter goes negative.
2280$code.=<<___;
2281	movdqu	0x60($inp), @XMM[8+6]
2282	pxor	@XMM[8+5], @XMM[5]
2283	movdqu	0x70($inp), @XMM[8+7]
2284	lea	0x80($inp), $inp
2285	movdqa	@XMM[7], 0x70(%rsp)
2286	pxor	@XMM[8+6], @XMM[6]
2287	lea	0x80(%rsp), %rax	# pass key schedule
2288	pxor	@XMM[8+7], @XMM[7]
2289	mov	%edx, %r10d		# pass rounds
2290
2291	call	_bsaes_encrypt8
2292
2293	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2294	pxor	0x10(%rsp), @XMM[1]
2295	movdqu	@XMM[0], 0x00($out)	# write output
2296	pxor	0x20(%rsp), @XMM[4]
2297	movdqu	@XMM[1], 0x10($out)
2298	pxor	0x30(%rsp), @XMM[6]
2299	movdqu	@XMM[4], 0x20($out)
2300	pxor	0x40(%rsp), @XMM[3]
2301	movdqu	@XMM[6], 0x30($out)
2302	pxor	0x50(%rsp), @XMM[7]
2303	movdqu	@XMM[3], 0x40($out)
2304	pxor	0x60(%rsp), @XMM[2]
2305	movdqu	@XMM[7], 0x50($out)
2306	pxor	0x70(%rsp), @XMM[5]
2307	movdqu	@XMM[2], 0x60($out)
2308	movdqu	@XMM[5], 0x70($out)
2309	lea	0x80($out), $out
2310
2311	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2312	pxor	$twtmp, $twtmp
2313	movdqa	.Lxts_magic(%rip), $twmask
2314	pcmpgtd	@XMM[7], $twtmp
2315	pshufd	\$0x13, $twtmp, $twres
2316	pxor	$twtmp, $twtmp
2317	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2318	pand	$twmask, $twres		# isolate carry and residue
2319	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2320	pxor	$twres, @XMM[7]
2321
2322	sub	\$0x80,$len
2323	jnc	.Lxts_enc_loop
2324
2325.Lxts_enc_short:
2326	add	\$0x80, $len
2327	jz	.Lxts_enc_done
2328___
# Short-path generator loop: same tweak doubling as the full-batch loop, but
# after loading block $i-1 it compares $len against 0x10*$i and dispatches to
# the matching .Lxts_enc_$i handler when exactly $i blocks remain.
2329    for ($i=0;$i<7;$i++) {
2330    $code.=<<___;
2331	pshufd	\$0x13, $twtmp, $twres
2332	pxor	$twtmp, $twtmp
2333	movdqa	@XMM[7], @XMM[$i]
2334	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2335	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2336	pand	$twmask, $twres		# isolate carry and residue
2337	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2338	pxor	$twres, @XMM[7]
2339___
2340    $code.=<<___ if ($i>=1);
2341	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2342	cmp	\$`0x10*$i`,$len
2343	je	.Lxts_enc_$i
2344___
2345    $code.=<<___ if ($i>=2);
2346	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2347___
2348    }
# Tail paths for 7 down to 1 remaining blocks.  The fall-through case and
# each .Lxts_enc_$N label XOR the last pending input block(s) with their
# tweaks, run the 8-block bit-sliced core _bsaes_encrypt8, XOR its outputs
# with the tweaks saved on the stack, store N blocks, and load tweak[N]
# from the stack as the tweak for what follows.  Note the output register
# order of _bsaes_encrypt8 (0,1,4,6,3,7,2,5), which is why stores are
# interleaved in that sequence.  .Lxts_enc_1 handles a single block via
# the table-based asm_AES_encrypt instead of the bit-sliced core (the
# commented-out lines show the bit-sliced alternative).  .Lxts_enc_steal
# implements XTS ciphertext stealing for a trailing partial block, and
# .Lxts_enc_bzero wipes the stack area holding key schedule and tweaks.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
# Win64 only: restore the non-volatile %xmm6-%xmm15 registers saved in the
# prologue and step %rax past their save area; .Lxts_enc_tail marks the
# point used by the SEH handler (HandlerData[2]) below.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_enc_tail:
___
# Epilogue of ossl_bsaes_xts_encrypt: restore callee-saved GPRs from below
# %rax (with matching CFI annotations), restore %rsp and return.  Then the
# prologue of ossl_bsaes_xts_decrypt: save callee-saved GPRs and open a
# 0x48-byte scratch area, mirroring the encrypt side.
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_enc_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

.globl	ossl_bsaes_xts_decrypt
.type	ossl_bsaes_xts_decrypt,\@abi-omnipotent
.align	16
ossl_bsaes_xts_decrypt:
.cfi_startproc
	endbranch
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-0x48(%rsp), %rsp
.cfi_adjust_cfa_offset	0x48
___
# Win64 only: pull the 5th and 6th arguments (key2, ivp) from the caller's
# stack, then save the non-volatile %xmm6-%xmm15 registers; .Lxts_dec_body
# marks the start of the body for the SEH handler.
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
# Decrypt-side setup: back up arguments, generate the initial tweak by
# encrypting the IV with key2 (asm_AES_encrypt into 0x20(%rbp)), convert
# the conventional key schedule to bit-sliced form on the stack, adjust
# $len so that a trailing partial block leaves one extra full block for
# ciphertext stealing, reserve 0x80 bytes for tweak[8], and enter the main
# 8-block loop (or the short path when fewer than 8 blocks remain).
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
    # Main-loop tweak generation: produce tweak[0..6] (tweak[7] is handled
    # in the straight-line code that follows), saving each to %xmm$i and to
    # the stack, then doubling the tweak in GF(2^128) via the .Lxts_magic
    # reduction.  Loads of input blocks and their pre-XOR with the tweaks
    # are interleaved one iteration behind to hide latency.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Main 8-block decrypt loop body: finish loading/XORing blocks 6 and 7,
# run _bsaes_decrypt8, XOR its outputs (register order 0,1,6,4,2,7,3,5)
# with the stacked tweaks, store eight blocks, then advance the tweak once
# more for the next iteration.  Loops while at least 8 blocks remain; on
# exit, .Lxts_dec_short re-biases $len and dispatches the leftover blocks.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    # Short-input path (decrypt): identical tweak generation to the encrypt
    # side — save tweak[$i], double the tweak in GF(2^128) — but dispatching
    # to the .Lxts_dec_$i tails.  From iteration 2 on, load input block
    # $i-1 and compare $len; from iteration 3 on, pre-XOR the previous
    # block with its tweak.
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
# Decrypt tail paths for 7 down to 1 remaining blocks, mirroring the
# encrypt tails but using _bsaes_decrypt8 (output register order
# 0,1,6,4,2,7,3,5) and asm_AES_decrypt for the single-block case.
# .Lxts_dec_done differs from the encrypt side: when a partial block
# trails, the *next* tweak must decrypt the final full block (kept in
# @XMM[6] is the current tweak for the stolen block), so the tweak is
# advanced once more, the last full block is decrypted with the advanced
# tweak, and .Lxts_dec_steal then swaps bytes with the partial block
# before the final asm_AES_decrypt using the saved tweak @XMM[6].
# .Lxts_dec_bzero wipes the stack-resident key schedule and tweaks.
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	0x78(%rbp),%rax
.cfi_def_cfa	%rax,8
___
# Win64 only: restore non-volatile %xmm6-%xmm15 and skip the save area;
# .Lxts_dec_tail marks the point used by the SEH handler below.
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rax), %rax
.Lxts_dec_tail:
___
# Epilogue of ossl_bsaes_xts_decrypt: restore callee-saved GPRs (with CFI
# annotations), restore %rsp and return.
$code.=<<___;
	mov	-48(%rax), %r15
.cfi_restore	%r15
	mov	-40(%rax), %r14
.cfi_restore	%r14
	mov	-32(%rax), %r13
.cfi_restore	%r13
	mov	-24(%rax), %r12
.cfi_restore	%r12
	mov	-16(%rax), %rbx
.cfi_restore	%rbx
	mov	-8(%rax), %rbp
.cfi_restore	%rbp
	lea	(%rax), %rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
___
3005}
# Read-only constant pool shared by all bit-sliced routines: pshufb masks
# for (inverse) ShiftRows and bit-interleaving, bit-slice bitmasks, CTR
# counter increments, the XTS GF(2^128) reduction constant 0x87, and the
# 0x63 AffineTransform constant used by key conversion.
$code.=<<___;
.type	_bsaes_const,\@object
.section .rodata align=64
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.align	64
.size	_bsaes_const,.-_bsaes_const
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
___
3064
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64 SEH handler arguments per the x64 calling convention.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler: shared structured-exception handler.  Compares the faulting
# Rip against three labels stored in HandlerData[] — prologue end, epilogue
# start, and "tail" (GPR-restore point) — and restores the saved %xmm and
# GPR state into *context accordingly before chaining to RtlVirtualUnwind.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<=prologue label
	jbe	.Lin_prologue

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=tail label
	jae	.Lin_tail

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer

.Lin_tail:
	mov	-48(%rax),%rbp
	mov	-40(%rax),%rbx
	mov	-32(%rax),%r12
	mov	-24(%rax),%r13
	mov	-16(%rax),%r14
	mov	-8(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
# .pdata: RUNTIME_FUNCTION entries (begin, end, unwind-info RVAs), one per
# exported routine; the ECB entries are emitted only when $ecb is enabled.
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
# .xdata: UNWIND_INFO records (flag byte 9 = UNW_FLAG_EHANDLER with
# handler RVA) pointing at se_handler, each followed by its HandlerData[]:
# body start, epilogue, and tail labels for the corresponding routine.
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
	.rva	.Lecb_enc_tail
	.long	0
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
	.rva	.Lecb_dec_tail
	.long	0
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
	.rva	.Lcbc_dec_tail
	.long	0
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
	.rva	.Lctr_enc_tail
	.long	0
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
	.rva	.Lxts_enc_tail
	.long	0
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
	.rva	.Lxts_dec_tail
	.long	0
___
}
3241
# Post-process the accumulated assembly: every back-ticked segment holds a
# Perl expression (e.g. address arithmetic such as `0x10*$i`) which is
# evaluated here and replaced by its value before the text is emitted.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Emit the finished assembly on standard output and make sure buffered
# writes actually reached their destination.
print STDOUT $code;

close(STDOUT) or die "error closing STDOUT: $!";
3247