xref: /openssl/crypto/aes/asm/aesni-x86.pl (revision 6ebf6d51)
1#! /usr/bin/env perl
2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31#	16-byte     64-byte     256-byte    1-KB        8-KB
32#	53-67%      67-84%      91-94%      95-98%      97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt.
56
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
61#		CBC en-/decrypt	CTR	XTS	ECB	OCB
62# Westmere	3.77/1.37	1.37	1.52	1.27
63# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
64# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
65# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
66# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
67# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
68# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
69
# ---------------------------------------------------------------------
# Generator configuration and perlasm bootstrap.
#
# Everything below is deliberately kept in package globals: the
# generator subroutines and the straight-line code further down refer
# to these names directly.
# ---------------------------------------------------------------------
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Directory part of the script path (with trailing separator), or an
# empty string when the script was started without one. Testing the
# match explicitly avoids reading a stale/undefined $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# Last command-line argument, when present, names the output file.
# Three-argument open keeps the file name from being parsed as a mode
# and a failure is reported instead of silently losing the output.
$output = pop;
if ($output)
{   open(STDOUT,">",$output) or die "can't open $output: $!";	}

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to fetch round keys. Both $PREFIX flavours use
# movups, so no branch on $PREFIX is required.
$movekey=\&movups;

# general-purpose register assignment
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register assignment; xmm5-xmm7 carry two names each
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";
105
# AESNI extension
#
# aeskeygenassist dst,src,imm: emitted as raw bytes
# 66 0f 3a df /r ib. Only xmm0-xmm7 register operands are recognised;
# for any other operand combination nothing is emitted.
sub aeskeygenassist
{ my ($dst,$src,$imm)=@_;
  my @regno=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    return unless (@regno);

    # register-direct operand byte: 11 dst src
    my $modrm=0xc0|($regno[0]<<3)|$regno[1];
    &data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm);
}
# Shared encoder for the two-operand AES-NI instructions: emits
# 66 0f 38 <opcodelet> /r as raw bytes. Only xmm0-xmm7 register
# operands are recognised; for anything else nothing is emitted.
sub aescommon
{ my ($opcodelet,$dst,$src)=@_;
  my @regno=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);

    return unless (@regno);

    # register-direct operand byte: 11 dst src
    my $modrm=0xc0|($regno[0]<<3)|$regno[1];
    &data_byte(0x66,0x0f,0x38,$opcodelet,$modrm);
}
# Per-instruction front ends: each one only supplies its opcode byte
# and hands the (dst,src) operands over to aescommon.
sub aesimc
{ my @operands=@_;	return &aescommon(0xdb,@operands);	}
sub aesenc
{ my @operands=@_;	return &aescommon(0xdc,@operands);	}
sub aesenclast
{ my @operands=@_;	return &aescommon(0xdd,@operands);	}
sub aesdec
{ my @operands=@_;	return &aescommon(0xde,@operands);	}
sub aesdeclast
{ my @operands=@_;	return &aescommon(0xdf,@operands);	}
122
# Inline version of internal aesni_[en|de]crypt1
#
#   $p      "enc" or "dec"; selects the aes${p}/aes${p}last emitters
#   $inout  XMM register holding the block (defaults to $inout0)
#   $ivec   optional XMM register that is xor-ed into $inout together
#           with the first round key; it is modified in the process
#
# The emitted code reads the key schedule through $key, advancing it
# as it goes, and counts $rounds down to zero, so both registers are
# consumed. Every expansion gets its own loop label.
{ my $sn;	# expansion counter, keeps the loop labels unique
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  # Resolve the emitters once through code references instead of
  # string eval: eval would swallow any failure in $@ and silently
  # drop the instruction. (\&{"name"} is permitted even under strict.)
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my $loop;

  $sn++;
  $loop="${p}1_loop_$sn";

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {	&xorps		($ivec,$rndkey0);
	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$ivec);		# inout^=ivec^key[0]
    }
    else
    {	&lea		($key,&DWP(32,$key));
	&xorps		($inout,$rndkey0);	# inout^=key[0]
    }
    &set_label($loop);
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label($loop));
    &$aeslast		($inout,$rndkey1);
}}
143
# Emit _aesni_[en|de]crypt1, a fully unrolled single-block routine.
#
#   $p      "enc" or "dec"; selects the aes${p}/aes${p}last emitters
#           and the name of the generated function
#   $inout  XMM register holding the block (defaults to $inout0)
#
# The generated code compares $rounds with 11: below it jumps to the
# "${p}128" entry (9 rounds + final one), equal it jumps to "${p}192"
# (2 more rounds), otherwise it falls through (2 more again). $key is
# advanced so that the shared tail can use fixed displacements.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  # Code references instead of string eval, so that a failing emitter
  # is reported rather than swallowed in $@.
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});

  # Two rounds: consume $rndkey1 then $rndkey0, reloading each from
  # the key schedule at $off and $off+0x10 right after use.
  my $double_round=sub
  { my $off=shift;
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP($off,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP($off+0x10,$key));
  };

    &function_begin_B("_aesni_${p}rypt1");
	&$movekey	($rndkey0,&QWP(0,$key));	# was a bare movups
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));		# lea leaves flags alone
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	$double_round->(-0x40);
    &set_label("${p}192");
	$double_round->(-0x20);
    &set_label("${p}128");
	foreach my $off (0,0x20,0x40,0x60)
	{   $double_round->($off);	}
	&$aes		($inout,$rndkey1);
    &$aeslast		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
189
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry point. The block is read through "eax", which is
# then reused for the output pointer; the round count is fetched from
# offset 240 of the key structure. Scratch XMM registers are wiped
# before returning.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if (!$inline)
	{   &call	("_aesni_encrypt1");	}
	else
	{   &aesni_inline_generate1("enc");	}
	foreach my $reg ($rndkey0,$rndkey1)	# clear register bank
	{   &pxor	($reg,$reg);	}
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");
208
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry point, mirror image of $PREFIX_encrypt. The block
# is read through "eax", which is then reused for the output pointer;
# the round count is fetched from offset 240 of the key structure.
# Scratch XMM registers are wiped before returning.
&aesni_generate1("dec") unless ($inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));		# inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# out
	if (!$inline)
	{   &call	("_aesni_decrypt1");	}
	else
	{   &aesni_inline_generate1("dec");	}
	foreach my $reg ($rndkey0,$rndkey1)	# clear register bank
	{   &pxor	($reg,$reg);	}
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");
227
228# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
229# factor. Why 3x subroutine were originally used in loops? Even though
230# aes[enc|dec] latency was originally 6, it could be scheduled only
231# every *2nd* cycle. Thus 3x interleave was the one providing optimal
232# utilization, i.e. when subroutine's throughput is virtually same as
233# of non-interleaved subroutine [for number of input blocks up to 3].
234# This is why it originally made no sense to implement 2x subroutine.
235# But times change and it became appropriate to spend extra 192 bytes
236# on 2x subroutine on Atom Silvermont account. For processors that
237# can schedule aes[enc|dec] every cycle optimal interleave factor
238# equals to corresponding instructions latency. 8x is optimal for
239# * Bridge, but it's unfeasible to accommodate such implementation
240# in XMM registers addressable in 32-bit mode and therefore maximum
241# of 6x is used instead...
242
# Emit _aesni_[en|de]crypt2: two blocks ($inout0,$inout1) processed in
# an interleaved fashion.
#
#   $p   "enc" or "dec"; selects the aes${p}/aes${p}last emitters and
#        the name of the generated function
#
# The generated code scales $rounds by 16, points $key past the key
# schedule and then walks it with a negative index in $rounds that
# runs up to zero; both registers are therefore modified.
sub aesni_generate2
{ my $p=shift;
  # Code references instead of string eval, so that a failing emitter
  # is reported rather than swallowed in $@.
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	foreach my $blk (@blocks)	{ &$aes($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));	# flags are from the add
    foreach my $blk (@blocks)	{ &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
273
# Emit _aesni_[en|de]crypt3: three blocks ($inout0..$inout2) processed
# in an interleaved fashion.
#
#   $p   "enc" or "dec"; selects the aes${p}/aes${p}last emitters and
#        the name of the generated function
#
# The generated code scales $rounds by 16, points $key past the key
# schedule and then walks it with a negative index in $rounds that
# runs up to zero; both registers are therefore modified.
sub aesni_generate3
{ my $p=shift;
  # Code references instead of string eval, so that a failing emitter
  # is reported rather than swallowed in $@.
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	foreach my $blk (@blocks)	{ &$aes($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));	# flags are from the add
    foreach my $blk (@blocks)	{ &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
309
310# 4x interleave is implemented to improve small block performance,
311# most notably [and naturally] 4 block by ~30%. One can argue that one
312# should have implemented 5x as well, but improvement  would be <20%,
313# so it's not worth it...
# Emit _aesni_[en|de]crypt4: four blocks ($inout0..$inout3) processed
# in an interleaved fashion.
#
#   $p   "enc" or "dec"; selects the aes${p}/aes${p}last emitters and
#        the name of the generated function
#
# The generated code scales $rounds by 16, points $key past the key
# schedule and then walks it with a negative index in $rounds that
# runs up to zero; both registers are therefore modified.
sub aesni_generate4
{ my $p=shift;
  # Code references instead of string eval, so that a failing emitter
  # is reported rather than swallowed in $@.
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @blocks=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# raw 4-byte filler, kept as is
	&add		($rounds,16);

    &set_label("${p}4_loop");
	foreach my $blk (@blocks)	{ &$aes($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@blocks)	{ &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));	# flags are from the add

    foreach my $blk (@blocks)	{ &$aes($blk,$rndkey1); }
    foreach my $blk (@blocks)	{ &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
356
# Emit _aesni_[en|de]crypt6: six blocks ($inout0..$inout5) processed
# in an interleaved fashion.
#
#   $p   "enc" or "dec"; selects the aes${p}/aes${p}last emitters and
#        the name of the generated function
#
# Besides the function itself a static label "_aesni_${p}rypt6_enter"
# is defined in the middle of the round loop, so that code which has
# already performed the prologue and the first round itself can join
# there. The prologue interleaves the initial xor with the first round
# and jumps into the loop body at "_aesni_${p}rypt6_inner".
sub aesni_generate6
{ my $p=shift;
  # Code references instead of string eval, so that a failing emitter
  # is reported rather than swallowed in $@.
  my ($aes,$aeslast)=(\&{"aes${p}"},\&{"aes${p}last"});
  my @lower=($inout0,$inout1,$inout2);
  my @upper=($inout3,$inout4,$inout5);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	foreach my $blk (@lower)	{ &$aes($blk,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
	foreach my $blk (@upper)	{ &$aes($blk,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk (@lower,@upper)	{ &$aes($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));	# flags are from the add

    foreach my $blk (@lower,@upper)	{ &$aes($blk,$rndkey1); }
    foreach my $blk (@lower,@upper)	{ &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the interleaved helpers in order of interleave factor.
# The decrypt flavour is always emitted, the encrypt one only when
# $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
		      \&aesni_generate4,\&aesni_generate6)
{   $generate->("enc") if ($PREFIX eq "aesni");
    $generate->("dec");
}
423
424if ($PREFIX eq "aesni") {
425######################################################################
426# void aesni_ecb_encrypt (const void *in, void *out,
427#                         size_t length, const AES_KEY *key,
428#                         int enc);
# Generated layout: length is rounded down to a multiple of 16 and a
# zero result returns at once. The fifth argument selects direction:
# zero jumps to "ecb_decrypt", anything else encrypts. Each direction
# processes 6 blocks per iteration while at least 0x60 bytes remain,
# then dispatches the 1..5 block tail to the matching helper. All XMM
# registers are wiped at "ecb_ret".
#
# The encrypt and decrypt halves differ only in label/helper names, in
# the aligned "ecb_decrypt" entry label, and in the very last jump
# (the decrypt half simply falls through to "ecb_ret"), so both are
# produced by one loop. Every emitter is called with the & sigil; the
# original had one bare "jmp(...)" call.
{ my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  # store the first $n blocks to consecutive 16-byte slots at $out
  my $store=sub
  { my $n=shift;
	foreach my $i (0..$n-1)
	{   &movups	(&QWP(16*$i,$out),$blk[$i]);	}
  };

&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# enc flag, for now
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

foreach my $mode ("enc","dec")
{ my $ecb="ecb_${mode}";		# label prefix
  my $crypt="_aesni_${mode}rypt";	# helper name prefix
  my $is_last=($mode eq "dec");

######################################################################
    &set_label("ecb_decrypt",16)	if ($is_last);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("${ecb}_tail"));

	foreach my $i (0..5)
	{   &movdqu	($blk[$i],&QWP(16*$i,$inp));	}
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("${ecb}_loop6_enter"));

    &set_label("${ecb}_loop6",16);
	# store previous results while loading the next six blocks
	foreach my $i (0..4)
	{   &movups	(&QWP(16*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(16*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("${ecb}_loop6_enter");

	&call	("${crypt}6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("${ecb}_loop6"));

	$store->(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

    &set_label("${ecb}_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("${ecb}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("${ecb}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("${ecb}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("${ecb}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# 6th lane is unused, zero it
	&call	("${crypt}6");
	$store->(5);
	&jmp	(&label("ecb_ret"));

    &set_label("${ecb}_one",16);
	if ($inline)
	{   &aesni_inline_generate1($mode);	}
	else
	{   &call	("${crypt}1");	}
	$store->(1);
	&jmp	(&label("ecb_ret"));

    &set_label("${ecb}_two",16);
	&call	("${crypt}2");
	$store->(2);
	&jmp	(&label("ecb_ret"));

    &set_label("${ecb}_three",16);
	&call	("${crypt}3");
	$store->(3);
	&jmp	(&label("ecb_ret"));

    &set_label("${ecb}_four",16);
	&call	("${crypt}4");
	$store->(4);
	# the decrypt half is last and falls through to ecb_ret
	&jmp	(&label("ecb_ret"))	unless ($is_last);
}

&set_label("ecb_ret");
	foreach my $i (0..7)		# clear register bank
	{   &pxor	("xmm$i","xmm$i");	}
&function_end("aesni_ecb_encrypt");
}
649
650######################################################################
651# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
652#                         size_t blocks, const AES_KEY *key,
653#                         const char *ivec,char *cmac);
654#
655# Handles only complete blocks, operates on 64-bit counter and
656# does not update *ivec! Nor does it finalize CMAC value
657# (see engine/eng_aesni.c for details)
658#
659{ my $cmac=$inout1;
# Generated layout: a 16-byte aligned scratch area is carved out of
# the stack, holding the pshufb byte-swap mask at 0, the counter
# increment at 16 and the caller's %esp at 48. Each outer iteration
# runs the counter block ($inout0) and the running CMAC ($cmac) through
# the cipher side by side, xors the key stream into the input block
# and stores it. The final CMAC is written back through the 6th
# argument and all XMM registers are wiped.
&function_begin("aesni_ccm64_encrypt_blocks");
	{ my @argreg=($inp,$out,$len,$key,$rounds_,$rounds);
	  # $rounds_ and $rounds temporarily carry the ivec and cmac pointers
	  foreach my $i (0..$#argreg)
	  {   &mov	($argreg[$i],&wparam($i));	}
	}
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	{ my $off=0;
	  foreach my $dword (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203)
	  {   &mov	(&DWP($off,"esp"),$dword);
	      $off+=4;
	  }
	}

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $off (20,24,28)
	{   &mov	(&DWP($off,"esp"),$key_);	}

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));	# byte-swap mask
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	foreach my $blk ($inout0,$cmac)	{ &aesenc($blk,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $blk ($inout0,$cmac)	{ &aesenc($blk,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	foreach my $blk ($inout0,$cmac)	{ &aesenc($blk,$rndkey1); }
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	foreach my $blk ($inout0,$cmac)	{ &aesenclast($blk,$rndkey0); }

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));	# flags are from the dec

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	foreach my $i (0..7)			# clear register bank
	{   &pxor	("xmm$i","xmm$i");	}
&function_end("aesni_ccm64_encrypt_blocks");
747
# Generated layout: same scratch area as the encrypt flavour (pshufb
# mask at 0, counter increment at 16, caller's %esp at 48). The first
# counter block is encrypted on its own; afterwards every iteration
# produces an output block, folds it into the CMAC and runs the next
# counter block and the CMAC through the cipher side by side. After
# the last block the CMAC absorbs that block and is encrypted once
# more, then written back through the 6th argument.
#
# Changes against the previous revision:
#  - with $inline off the final step called _aesni_encrypt1 with a
#    stray $cmac argument; that helper works on $inout0 and never
#    folded $in0 in, so the CMAC came out wrong. The non-inline branch
#    now does the xor itself and moves $cmac through $inout0.
#  - lea used QWP() for its operand; DWP() is used like everywhere
#    else with lea.
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($rounds,&wparam(5));		# cmac pointer, for now
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);		# save original %esp

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&pshufb	($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&DWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&DWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)
	{   # xors $in0 (last output) and key[0] into $cmac, then encrypts
	    &aesni_inline_generate1("enc",$cmac,$in0);
	}
	else
	{   # _aesni_encrypt1 operates on $inout0 only
	    &xorps	($cmac,$in0);		# cmac^=out
	    &movdqa	($inout0,$cmac);
	    &call	("_aesni_encrypt1");
	    &movdqa	($cmac,$inout0);
	}

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
856}
857
858######################################################################
859# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
860#                         size_t blocks, const AES_KEY *key,
861#                         const char *ivec);
862#
863# Handles only complete blocks, operates on 32-bit counter and
864# does not update *ivec! (see crypto/modes/ctr128.c for details)
865#
866# stack layout:
867#	0	pshufb mask
868#	16	vector addend: 0,6,6,6
869# 	32	counter-less ivec
870#	48	1st triplet of counter vector
871#	64	2nd triplet of counter vector
872#	80	saved %esp
873
874&function_begin("aesni_ctr32_encrypt_blocks");
875	&mov	($inp,&wparam(0));
876	&mov	($out,&wparam(1));
877	&mov	($len,&wparam(2));
878	&mov	($key,&wparam(3));
879	&mov	($rounds_,&wparam(4));
880	&mov	($key_,"esp");
881	&sub	("esp",88);
882	&and	("esp",-16);			# align stack
883	&mov	(&DWP(80,"esp"),$key_);
884
885	&cmp	($len,1);
886	&je	(&label("ctr32_one_shortcut"));
887
888	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
889
890	# compose byte-swap control mask for pshufb on stack
891	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
892	&mov	(&DWP(4,"esp"),0x08090a0b);
893	&mov	(&DWP(8,"esp"),0x04050607);
894	&mov	(&DWP(12,"esp"),0x00010203);
895
896	# compose counter increment vector on stack
897	&mov	($rounds,6);
898	&xor	($key_,$key_);
899	&mov	(&DWP(16,"esp"),$rounds);
900	&mov	(&DWP(20,"esp"),$rounds);
901	&mov	(&DWP(24,"esp"),$rounds);
902	&mov	(&DWP(28,"esp"),$key_);
903
904	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
905	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
906
907	&mov	($rounds,&DWP(240,$key));	# key->rounds
908
909	# compose 2 vectors of 3x32-bit counters
910	&bswap	($rounds_);
911	&pxor	($rndkey0,$rndkey0);
912	&pxor	($rndkey1,$rndkey1);
913	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
914	&pinsrd	($rndkey0,$rounds_,0);
915	&lea	($key_,&DWP(3,$rounds_));
916	&pinsrd	($rndkey1,$key_,0);
917	&inc	($rounds_);
918	&pinsrd	($rndkey0,$rounds_,1);
919	&inc	($key_);
920	&pinsrd	($rndkey1,$key_,1);
921	&inc	($rounds_);
922	&pinsrd	($rndkey0,$rounds_,2);
923	&inc	($key_);
924	&pinsrd	($rndkey1,$key_,2);
925	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
926	&pshufb	($rndkey0,$inout0);		# byte swap
927	&movdqu	($inout4,&QWP(0,$key));		# key[0]
928	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
929	&pshufb	($rndkey1,$inout0);		# byte swap
930
931	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
932	&pshufd	($inout1,$rndkey0,2<<6);
933	&cmp	($len,6);
934	&jb	(&label("ctr32_tail"));
935	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
936	&shl	($rounds,4);
937	&mov	($rounds_,16);
938	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
939	&mov	($key_,$key);			# backup $key
940	&sub	($rounds_,$rounds);		# backup twisted $rounds
941	&lea	($key,&DWP(32,$key,$rounds));
942	&sub	($len,6);
943	&jmp	(&label("ctr32_loop6"));
944
945&set_label("ctr32_loop6",16);
946	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
947	&pshufd	($inout2,$rndkey0,1<<6);
948	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
949	&pshufd	($inout3,$rndkey1,3<<6);
950	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
951	&pshufd	($inout4,$rndkey1,2<<6);
952	&pxor		($inout1,$rndkey0);
953	&pshufd	($inout5,$rndkey1,1<<6);
954	&$movekey	($rndkey1,&QWP(16,$key_));
955	&pxor		($inout2,$rndkey0);
956	&pxor		($inout3,$rndkey0);
957	&aesenc		($inout0,$rndkey1);
958	&pxor		($inout4,$rndkey0);
959	&pxor		($inout5,$rndkey0);
960	&aesenc		($inout1,$rndkey1);
961	&$movekey	($rndkey0,&QWP(32,$key_));
962	&mov		($rounds,$rounds_);
963	&aesenc		($inout2,$rndkey1);
964	&aesenc		($inout3,$rndkey1);
965	&aesenc		($inout4,$rndkey1);
966	&aesenc		($inout5,$rndkey1);
967
968	&call		(&label("_aesni_encrypt6_enter"));
969
970	&movups	($rndkey1,&QWP(0,$inp));
971	&movups	($rndkey0,&QWP(0x10,$inp));
972	&xorps	($inout0,$rndkey1);
973	&movups	($rndkey1,&QWP(0x20,$inp));
974	&xorps	($inout1,$rndkey0);
975	&movups	(&QWP(0,$out),$inout0);
976	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
977	&xorps	($inout2,$rndkey1);
978	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
979	&movups	(&QWP(0x10,$out),$inout1);
980	&movups	(&QWP(0x20,$out),$inout2);
981
982	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
983	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
984	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
985
986	&movups	($inout1,&QWP(0x30,$inp));
987	&movups	($inout2,&QWP(0x40,$inp));
988	&xorps	($inout3,$inout1);
989	&movups	($inout1,&QWP(0x50,$inp));
990	&lea	($inp,&DWP(0x60,$inp));
991	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
992	&pshufb	($rndkey0,$inout0);		# byte swap
993	&xorps	($inout4,$inout2);
994	&movups	(&QWP(0x30,$out),$inout3);
995	&xorps	($inout5,$inout1);
996	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
997	&pshufb	($rndkey1,$inout0);		# byte swap
998	&movups	(&QWP(0x40,$out),$inout4);
999	&pshufd	($inout0,$rndkey0,3<<6);
1000	&movups	(&QWP(0x50,$out),$inout5);
1001	&lea	($out,&DWP(0x60,$out));
1002
1003	&pshufd	($inout1,$rndkey0,2<<6);
1004	&sub	($len,6);
1005	&jnc	(&label("ctr32_loop6"));
1006
1007	&add	($len,6);
1008	&jz	(&label("ctr32_ret"));
1009	&movdqu	($inout5,&QWP(0,$key_));
1010	&mov	($key,$key_);
1011	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
1012	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1013
1014&set_label("ctr32_tail");
1015	&por	($inout0,$inout5);
1016	&cmp	($len,2);
1017	&jb	(&label("ctr32_one"));
1018
1019	&pshufd	($inout2,$rndkey0,1<<6);
1020	&por	($inout1,$inout5);
1021	&je	(&label("ctr32_two"));
1022
1023	&pshufd	($inout3,$rndkey1,3<<6);
1024	&por	($inout2,$inout5);
1025	&cmp	($len,4);
1026	&jb	(&label("ctr32_three"));
1027
1028	&pshufd	($inout4,$rndkey1,2<<6);
1029	&por	($inout3,$inout5);
1030	&je	(&label("ctr32_four"));
1031
1032	&por	($inout4,$inout5);
1033	&call	("_aesni_encrypt6");
1034	&movups	($rndkey1,&QWP(0,$inp));
1035	&movups	($rndkey0,&QWP(0x10,$inp));
1036	&xorps	($inout0,$rndkey1);
1037	&movups	($rndkey1,&QWP(0x20,$inp));
1038	&xorps	($inout1,$rndkey0);
1039	&movups	($rndkey0,&QWP(0x30,$inp));
1040	&xorps	($inout2,$rndkey1);
1041	&movups	($rndkey1,&QWP(0x40,$inp));
1042	&xorps	($inout3,$rndkey0);
1043	&movups	(&QWP(0,$out),$inout0);
1044	&xorps	($inout4,$rndkey1);
1045	&movups	(&QWP(0x10,$out),$inout1);
1046	&movups	(&QWP(0x20,$out),$inout2);
1047	&movups	(&QWP(0x30,$out),$inout3);
1048	&movups	(&QWP(0x40,$out),$inout4);
1049	&jmp	(&label("ctr32_ret"));
1050
1051&set_label("ctr32_one_shortcut",16);
1052	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
1053	&mov	($rounds,&DWP(240,$key));
1054
1055&set_label("ctr32_one");
1056	if ($inline)
1057	{   &aesni_inline_generate1("enc");	}
1058	else
1059	{   &call	("_aesni_encrypt1");	}
1060	&movups	($in0,&QWP(0,$inp));
1061	&xorps	($in0,$inout0);
1062	&movups	(&QWP(0,$out),$in0);
1063	&jmp	(&label("ctr32_ret"));
1064
1065&set_label("ctr32_two",16);
1066	&call	("_aesni_encrypt2");
1067	&movups	($inout3,&QWP(0,$inp));
1068	&movups	($inout4,&QWP(0x10,$inp));
1069	&xorps	($inout0,$inout3);
1070	&xorps	($inout1,$inout4);
1071	&movups	(&QWP(0,$out),$inout0);
1072	&movups	(&QWP(0x10,$out),$inout1);
1073	&jmp	(&label("ctr32_ret"));
1074
1075&set_label("ctr32_three",16);
1076	&call	("_aesni_encrypt3");
1077	&movups	($inout3,&QWP(0,$inp));
1078	&movups	($inout4,&QWP(0x10,$inp));
1079	&xorps	($inout0,$inout3);
1080	&movups	($inout5,&QWP(0x20,$inp));
1081	&xorps	($inout1,$inout4);
1082	&movups	(&QWP(0,$out),$inout0);
1083	&xorps	($inout2,$inout5);
1084	&movups	(&QWP(0x10,$out),$inout1);
1085	&movups	(&QWP(0x20,$out),$inout2);
1086	&jmp	(&label("ctr32_ret"));
1087
1088&set_label("ctr32_four",16);
1089	&call	("_aesni_encrypt4");
1090	&movups	($inout4,&QWP(0,$inp));
1091	&movups	($inout5,&QWP(0x10,$inp));
1092	&movups	($rndkey1,&QWP(0x20,$inp));
1093	&xorps	($inout0,$inout4);
1094	&movups	($rndkey0,&QWP(0x30,$inp));
1095	&xorps	($inout1,$inout5);
1096	&movups	(&QWP(0,$out),$inout0);
1097	&xorps	($inout2,$rndkey1);
1098	&movups	(&QWP(0x10,$out),$inout1);
1099	&xorps	($inout3,$rndkey0);
1100	&movups	(&QWP(0x20,$out),$inout2);
1101	&movups	(&QWP(0x30,$out),$inout3);
1102
1103&set_label("ctr32_ret");
1104	&pxor	("xmm0","xmm0");		# clear register bank
1105	&pxor	("xmm1","xmm1");
1106	&pxor	("xmm2","xmm2");
1107	&pxor	("xmm3","xmm3");
1108	&pxor	("xmm4","xmm4");
1109	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
1110	&pxor	("xmm5","xmm5");
1111	&movdqa	(&QWP(48,"esp"),"xmm0");
1112	&pxor	("xmm6","xmm6");
1113	&movdqa	(&QWP(64,"esp"),"xmm0");
1114	&pxor	("xmm7","xmm7");
1115	&mov	("esp",&DWP(80,"esp"));
1116&function_end("aesni_ctr32_encrypt_blocks");
1117
1118######################################################################
1119# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1120#	const AES_KEY *key1, const AES_KEY *key2,
1121#	const unsigned char iv[16]);
1122#
1123{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1124
1125&function_begin("aesni_xts_encrypt");
	# Emits aesni_xts_encrypt; the C prototype is in the comment block
	# that precedes this scope.  Outline of the generated code:
	#   1. the caller's 16-byte iv is run through a single-block
	#      encryption with key2 and becomes the initial tweak;
	#   2. whole blocks are processed with key1, six per loop iteration,
	#      followed by a remainder of one to five blocks;
	#   3. if len is not a multiple of 16, a byte-swapping "steal" loop
	#      and one more single-block encryption finish the job.
	# Stack frame after alignment (offsets from esp):
	#   16*0..16*5	per-block tweaks
	#   16*6	dwords {0x87,0,1,0}, the mask used to double the tweak
	#   16*7+0	len, later replaced by len%16
	#   16*7+4	esp as it was before the frame was allocated
	# $tweak,$twtmp,$twres,$twmask are aliases of $rndkey1,$rndkey0,
	# $inout0,$inout1 (see the "my" list that opens this scope), so the
	# tweak state is lost whenever those registers carry data or keys.
1126	&mov	($key,&wparam(4));		# key2
1127	&mov	($inp,&wparam(5));		# clear-text tweak
1128
1129	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1130	&movups	($inout0,&QWP(0,$inp));
	# single-block encryption under key2, inlined or called; the result
	# is picked up from $inout0 further down as the initial tweak
1131	if ($inline)
1132	{   &aesni_inline_generate1("enc");	}
1133	else
1134	{   &call	("_aesni_encrypt1");	}
1135
1136	&mov	($inp,&wparam(0));
1137	&mov	($out,&wparam(1));
1138	&mov	($len,&wparam(2));
1139	&mov	($key,&wparam(3));		# key1
1140
	# $key_ only carries the old esp until it is stored at 16*7+4
1141	&mov	($key_,"esp");
1142	&sub	("esp",16*7+8);
1143	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1144	&and	("esp",-16);			# align stack
1145
1146	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1147	&mov	(&DWP(16*6+4,"esp"),0);
1148	&mov	(&DWP(16*6+8,"esp"),1);
1149	&mov	(&DWP(16*6+12,"esp"),0);
1150	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1151	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1152
	# pcmpgtd against zero leaves all-ones in every dword of $twtmp whose
	# counterpart in $tweak has its top bit set
1153	&movdqa	($tweak,$inout0);
1154	&pxor	($twtmp,$twtmp);
1155	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1156	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1157
	# work on whole blocks only; a borrow here means fewer than six
1158	&and	($len,-16);
1159	&mov	($key_,$key);			# backup $key
1160	&mov	($rounds_,$rounds);		# backup $rounds
1161	&sub	($len,16*6);
1162	&jc	(&label("xts_enc_short"));
1163
	# "twisted" form used together with _aesni_encrypt6_enter:
	# $key = key1+32+16*rounds and $rounds_ = 16-16*rounds
1164	&shl	($rounds,4);
1165	&mov	($rounds_,16);
1166	&sub	($rounds_,$rounds);
1167	&lea	($key,&DWP(32,$key,$rounds));
1168	&jmp	(&label("xts_enc_loop6"));
1169
1170&set_label("xts_enc_loop6",16);
	# Each pass stores the current tweak in slot $i and derives the next:
	# paddq shifts both 64-bit halves left by one bit, while $twres (the
	# sign masks of $twtmp rearranged by pshufd 0x13 and ANDed with
	# {0x87,0,1,0}) puts the bit shifted out of the low half into the
	# high half and folds the bit shifted out of the high half back in
	# as 0x87.
1171	for ($i=0;$i<4;$i++) {
1172	    &pshufd	($twres,$twtmp,0x13);
1173	    &pxor	($twtmp,$twtmp);
1174	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1175	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1176	    &pand	($twres,$twmask);	# isolate carry and residue
1177	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1178	    &pxor	($tweak,$twres);
1179	}
	# Fifth tweak goes to slot 4 ($i becomes 5); the sixth is built
	# directly in $inout5.  $twres and $twmask (aliases of $inout0 and
	# $inout1) are overwritten by input blocks right after their last use.
1180	&pshufd	($inout5,$twtmp,0x13);
1181	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1182	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1183	 &$movekey	($rndkey0,&QWP(0,$key_));
1184	&pand	($inout5,$twmask);		# isolate carry and residue
1185	 &movups	($inout0,&QWP(0,$inp));	# load input
1186	&pxor	($inout5,$tweak);
1187
1188	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1189	&mov	($rounds,$rounds_);		# restore $rounds
1190	&movdqu	($inout1,&QWP(16*1,$inp));
1191	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1192	&movdqu	($inout2,&QWP(16*2,$inp));
1193	 &pxor		($inout1,$rndkey0);
1194	&movdqu	($inout3,&QWP(16*3,$inp));
1195	 &pxor		($inout2,$rndkey0);
1196	&movdqu	($inout4,&QWP(16*4,$inp));
1197	 &pxor		($inout3,$rndkey0);
	# sixth input block travels in $rndkey1, i.e. the $tweak register
1198	&movdqu	($rndkey1,&QWP(16*5,$inp));
1199	 &pxor		($inout4,$rndkey0);
1200	&lea	($inp,&DWP(16*6,$inp));
1201	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1202	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1203	&pxor	($inout5,$rndkey1);
1204
1205	 &$movekey	($rndkey1,&QWP(16,$key_));
1206	&pxor	($inout1,&QWP(16*1,"esp"));
1207	&pxor	($inout2,&QWP(16*2,"esp"));
1208	 &aesenc	($inout0,$rndkey1);
1209	&pxor	($inout3,&QWP(16*3,"esp"));
1210	&pxor	($inout4,&QWP(16*4,"esp"));
1211	 &aesenc	($inout1,$rndkey1);
1212	&pxor		($inout5,$rndkey0);
1213	 &$movekey	($rndkey0,&QWP(32,$key_));
1214	 &aesenc	($inout2,$rndkey1);
1215	 &aesenc	($inout3,$rndkey1);
1216	 &aesenc	($inout4,$rndkey1);
1217	 &aesenc	($inout5,$rndkey1);
	# label defined outside this section; presumably it completes the
	# remaining rounds for $inout0..$inout5 - confirm at its definition
1218	&call		(&label("_aesni_encrypt6_enter"));
1219
	# un-tweak and store the six results while the tweak for the next
	# iteration is rebuilt from slot 5
1220	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1221       &pxor	($twtmp,$twtmp);
1222	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1223       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1224	&xorps	($inout1,&QWP(16*1,"esp"));
1225	&movups	(&QWP(16*0,$out),$inout0);	# write output
1226	&xorps	($inout2,&QWP(16*2,"esp"));
1227	&movups	(&QWP(16*1,$out),$inout1);
1228	&xorps	($inout3,&QWP(16*3,"esp"));
1229	&movups	(&QWP(16*2,$out),$inout2);
1230	&xorps	($inout4,&QWP(16*4,"esp"));
1231	&movups	(&QWP(16*3,$out),$inout3);
1232	&xorps	($inout5,$tweak);
1233	&movups	(&QWP(16*4,$out),$inout4);
1234       &pshufd	($twres,$twtmp,0x13);
1235	&movups	(&QWP(16*5,$out),$inout5);
1236	&lea	($out,&DWP(16*6,$out));
1237       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1238
1239	&pxor	($twtmp,$twtmp);
1240	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1241	&pand	($twres,$twmask);		# isolate carry and residue
1242	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1243	&pxor	($tweak,$twres);
1244
1245	&sub	($len,16*6);
1246	&jnc	(&label("xts_enc_loop6"));
1247
	# leave the twisted form: plain key1 pointer and round count again
1248	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1249	&mov	($key,$key_);			# restore $key
1250	&mov	($rounds_,$rounds);
1251
1252&set_label("xts_enc_short");
	# $len becomes the number of remaining whole-block bytes (0..80).
	# The cmp results below are consumed by je/jb only after the tweak
	# update; the SSE instructions in between leave EFLAGS untouched.
1253	&add	($len,16*6);
1254	&jz	(&label("xts_enc_done6x"));
1255
1256	&movdqa	($inout3,$tweak);		# put aside previous tweak
1257	&cmp	($len,0x20);
1258	&jb	(&label("xts_enc_one"));
1259
1260	&pshufd	($twres,$twtmp,0x13);
1261	&pxor	($twtmp,$twtmp);
1262	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1263	&pand	($twres,$twmask);		# isolate carry and residue
1264	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1265	&pxor	($tweak,$twres);
1266	&je	(&label("xts_enc_two"));
1267
1268	&pshufd	($twres,$twtmp,0x13);
1269	&pxor	($twtmp,$twtmp);
1270	&movdqa	($inout4,$tweak);		# put aside previous tweak
1271	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1272	&pand	($twres,$twmask);		# isolate carry and residue
1273	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1274	&pxor	($tweak,$twres);
1275	&cmp	($len,0x40);
1276	&jb	(&label("xts_enc_three"));
1277
1278	&pshufd	($twres,$twtmp,0x13);
1279	&pxor	($twtmp,$twtmp);
1280	&movdqa	($inout5,$tweak);		# put aside previous tweak
1281	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1282	&pand	($twres,$twmask);		# isolate carry and residue
1283	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1284	&pxor	($tweak,$twres);
1285	&movdqa	(&QWP(16*0,"esp"),$inout3);
1286	&movdqa	(&QWP(16*1,"esp"),$inout4);
1287	&je	(&label("xts_enc_four"));
1288
	# five blocks left: tweaks go to slots 0..4, the fifth one is
	# assembled in $inout5
1289	&movdqa	(&QWP(16*2,"esp"),$inout5);
1290	&pshufd	($inout5,$twtmp,0x13);
1291	&movdqa	(&QWP(16*3,"esp"),$tweak);
1292	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1293	&pand	($inout5,$twmask);		# isolate carry and residue
1294	&pxor	($inout5,$tweak);
1295
1296	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1297	&movdqu	($inout1,&QWP(16*1,$inp));
1298	&movdqu	($inout2,&QWP(16*2,$inp));
1299	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1300	&movdqu	($inout3,&QWP(16*3,$inp));
1301	&pxor	($inout1,&QWP(16*1,"esp"));
1302	&movdqu	($inout4,&QWP(16*4,$inp));
1303	&pxor	($inout2,&QWP(16*2,"esp"));
1304	&lea	($inp,&DWP(16*5,$inp));
1305	&pxor	($inout3,&QWP(16*3,"esp"));
1306	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1307	&pxor	($inout4,$inout5);
1308
	# $inout5 goes in holding only the fifth tweak; whatever comes back
	# in it is never stored
1309	&call	("_aesni_encrypt6");
1310
1311	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1312	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1313	&xorps	($inout1,&QWP(16*1,"esp"));
1314	&xorps	($inout2,&QWP(16*2,"esp"));
1315	&movups	(&QWP(16*0,$out),$inout0);	# write output
1316	&xorps	($inout3,&QWP(16*3,"esp"));
1317	&movups	(&QWP(16*1,$out),$inout1);
1318	&xorps	($inout4,$tweak);
1319	&movups	(&QWP(16*2,$out),$inout2);
1320	&movups	(&QWP(16*3,$out),$inout3);
1321	&movups	(&QWP(16*4,$out),$inout4);
1322	&lea	($out,&DWP(16*5,$out));
1323	&jmp	(&label("xts_enc_done"));
1324
1325&set_label("xts_enc_one",16);
	# one block left, its tweak is in $inout3
1326	&movups	($inout0,&QWP(16*0,$inp));	# load input
1327	&lea	($inp,&DWP(16*1,$inp));
1328	&xorps	($inout0,$inout3);		# input^=tweak
1329	if ($inline)
1330	{   &aesni_inline_generate1("enc");	}
1331	else
1332	{   &call	("_aesni_encrypt1");	}
1333	&xorps	($inout0,$inout3);		# output^=tweak
1334	&movups	(&QWP(16*0,$out),$inout0);	# write output
1335	&lea	($out,&DWP(16*1,$out));
1336
1337	&movdqa	($tweak,$inout3);		# last tweak
1338	&jmp	(&label("xts_enc_done"));
1339
1340&set_label("xts_enc_two",16);
	# two blocks left, tweaks in $inout3 and $tweak
1341	&movaps	($inout4,$tweak);		# put aside last tweak
1342
1343	&movups	($inout0,&QWP(16*0,$inp));	# load input
1344	&movups	($inout1,&QWP(16*1,$inp));
1345	&lea	($inp,&DWP(16*2,$inp));
1346	&xorps	($inout0,$inout3);		# input^=tweak
1347	&xorps	($inout1,$inout4);
1348
1349	&call	("_aesni_encrypt2");
1350
1351	&xorps	($inout0,$inout3);		# output^=tweak
1352	&xorps	($inout1,$inout4);
1353	&movups	(&QWP(16*0,$out),$inout0);	# write output
1354	&movups	(&QWP(16*1,$out),$inout1);
1355	&lea	($out,&DWP(16*2,$out));
1356
1357	&movdqa	($tweak,$inout4);		# last tweak
1358	&jmp	(&label("xts_enc_done"));
1359
1360&set_label("xts_enc_three",16);
	# three blocks left, tweaks in $inout3, $inout4 and $tweak
1361	&movaps	($inout5,$tweak);		# put aside last tweak
1362	&movups	($inout0,&QWP(16*0,$inp));	# load input
1363	&movups	($inout1,&QWP(16*1,$inp));
1364	&movups	($inout2,&QWP(16*2,$inp));
1365	&lea	($inp,&DWP(16*3,$inp));
1366	&xorps	($inout0,$inout3);		# input^=tweak
1367	&xorps	($inout1,$inout4);
1368	&xorps	($inout2,$inout5);
1369
1370	&call	("_aesni_encrypt3");
1371
1372	&xorps	($inout0,$inout3);		# output^=tweak
1373	&xorps	($inout1,$inout4);
1374	&xorps	($inout2,$inout5);
1375	&movups	(&QWP(16*0,$out),$inout0);	# write output
1376	&movups	(&QWP(16*1,$out),$inout1);
1377	&movups	(&QWP(16*2,$out),$inout2);
1378	&lea	($out,&DWP(16*3,$out));
1379
1380	&movdqa	($tweak,$inout5);		# last tweak
1381	&jmp	(&label("xts_enc_done"));
1382
1383&set_label("xts_enc_four",16);
	# four blocks left: tweaks in slot 0, slot 1, $inout5 and $tweak;
	# $inout3 is needed for data, hence the first two come from the stack
1384	&movaps	($inout4,$tweak);		# put aside last tweak
1385
1386	&movups	($inout0,&QWP(16*0,$inp));	# load input
1387	&movups	($inout1,&QWP(16*1,$inp));
1388	&movups	($inout2,&QWP(16*2,$inp));
1389	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1390	&movups	($inout3,&QWP(16*3,$inp));
1391	&lea	($inp,&DWP(16*4,$inp));
1392	&xorps	($inout1,&QWP(16*1,"esp"));
1393	&xorps	($inout2,$inout5);
1394	&xorps	($inout3,$inout4);
1395
1396	&call	("_aesni_encrypt4");
1397
1398	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1399	&xorps	($inout1,&QWP(16*1,"esp"));
1400	&xorps	($inout2,$inout5);
1401	&movups	(&QWP(16*0,$out),$inout0);	# write output
1402	&xorps	($inout3,$inout4);
1403	&movups	(&QWP(16*1,$out),$inout1);
1404	&movups	(&QWP(16*2,$out),$inout2);
1405	&movups	(&QWP(16*3,$out),$inout3);
1406	&lea	($out,&DWP(16*4,$out));
1407
1408	&movdqa	($tweak,$inout4);		# last tweak
1409	&jmp	(&label("xts_enc_done"));
1410
1411&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
	# reached when no whole blocks remained after the six-block loop (or
	# there were none at all); $tweak is already the next unused tweak
1412	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1413	&and	($len,15);
1414	&jz	(&label("xts_enc_ret"));
1415	&movdqa	($inout3,$tweak);
1416	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1417	&jmp	(&label("xts_enc_steal"));
1418
1419&set_label("xts_enc_done",16);
	# reached from the 1..5 block paths with the last used tweak in
	# $tweak; if a partial block follows, the next tweak is derived into
	# $inout3 with the mask taken straight from the stack
1420	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1421	&pxor	($twtmp,$twtmp);
1422	&and	($len,15);
1423	&jz	(&label("xts_enc_ret"));
1424
1425	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1426	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1427	&pshufd	($inout3,$twtmp,0x13);
1428	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1429	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1430	&pxor	($inout3,$tweak);
1431
1432&set_label("xts_enc_steal");
	# For each of the len%16 trailing bytes: the input byte replaces a
	# byte of the last block already written (at -16 from $out) and the
	# byte it displaces is written out at $out.  $rounds and $key serve
	# as byte scratch here and are reloaded afterwards.
1433	&movz	($rounds,&BP(0,$inp));
1434	&movz	($key,&BP(-16,$out));
1435	&lea	($inp,&DWP(1,$inp));
1436	&mov	(&BP(-16,$out),&LB($rounds));
1437	&mov	(&BP(0,$out),&LB($key));
1438	&lea	($out,&DWP(1,$out));
1439	&sub	($len,1);
1440	&jnz	(&label("xts_enc_steal"));
1441
1442	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1443	&mov	($key,$key_);			# restore $key
1444	&mov	($rounds,$rounds_);		# restore $rounds
1445
	# encrypt the patched block in place with the tweak kept in $inout3
1446	&movups	($inout0,&QWP(-16,$out));	# load input
1447	&xorps	($inout0,$inout3);		# input^=tweak
1448	if ($inline)
1449	{   &aesni_inline_generate1("enc");	}
1450	else
1451	{   &call	("_aesni_encrypt1");	}
1452	&xorps	($inout0,$inout3);		# output^=tweak
1453	&movups	(&QWP(-16,$out),$inout0);	# write output
1454
1455&set_label("xts_enc_ret");
	# wipe xmm0-xmm7 and tweak slots 0..5; the slots at 16*6 and 16*7
	# (mask constant, length, saved esp) are not cleared
1456	&pxor	("xmm0","xmm0");		# clear register bank
1457	&pxor	("xmm1","xmm1");
1458	&pxor	("xmm2","xmm2");
1459	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1460	&pxor	("xmm3","xmm3");
1461	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1462	&pxor	("xmm4","xmm4");
1463	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1464	&pxor	("xmm5","xmm5");
1465	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1466	&pxor	("xmm6","xmm6");
1467	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1468	&pxor	("xmm7","xmm7");
1469	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1470	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1471&function_end("aesni_xts_encrypt");
1472
1473&function_begin("aesni_xts_decrypt");
	# Emits aesni_xts_decrypt; the C prototype is in the comment block
	# that precedes this scope.  Outline of the generated code:
	#   1. the caller's 16-byte iv is run through a single-block
	#      *encryption* with key2 and becomes the initial tweak;
	#   2. if len is not a multiple of 16, one whole block is held back
	#      for the stealing tail;
	#   3. the other whole blocks are decrypted with key1, six per loop
	#      iteration, followed by a remainder of one to five blocks;
	#   4. the tail decrypts the held-back block with the later of two
	#      tweaks, swaps bytes with the partial block, then decrypts the
	#      merged block with the earlier tweak.
	# Stack frame after alignment (offsets from esp):
	#   16*0..16*5	per-block tweaks
	#   16*6	dwords {0x87,0,1,0}, the mask used to double the tweak
	#   16*7+0	len, later replaced by len%16
	#   16*7+4	esp as it was before the frame was allocated
	# $tweak,$twtmp,$twres,$twmask are aliases of $rndkey1,$rndkey0,
	# $inout0,$inout1 (see the "my" list that opens this scope), so the
	# tweak state is lost whenever those registers carry data or keys.
1474	&mov	($key,&wparam(4));		# key2
1475	&mov	($inp,&wparam(5));		# clear-text tweak
1476
1477	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1478	&movups	($inout0,&QWP(0,$inp));
	# "enc" on purpose: the tweak is produced by encryption even here
1479	if ($inline)
1480	{   &aesni_inline_generate1("enc");	}
1481	else
1482	{   &call	("_aesni_encrypt1");	}
1483
1484	&mov	($inp,&wparam(0));
1485	&mov	($out,&wparam(1));
1486	&mov	($len,&wparam(2));
1487	&mov	($key,&wparam(3));		# key1
1488
	# $key_ only carries the old esp until it is stored at 16*7+4
1489	&mov	($key_,"esp");
1490	&sub	("esp",16*7+8);
1491	&and	("esp",-16);			# align stack
1492
	# branch-free: $rounds_ = (len&15) ? 16 : 0, then subtracted from len
1493	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1494	&test	($len,15);
1495	&setnz	(&LB($rounds_));
1496	&shl	($rounds_,4);
1497	&sub	($len,$rounds_);
1498
	# note: the $len stored below is the value after the adjustment
	# above; only its low four bits are read back later
1499	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1500	&mov	(&DWP(16*6+4,"esp"),0);
1501	&mov	(&DWP(16*6+8,"esp"),1);
1502	&mov	(&DWP(16*6+12,"esp"),0);
1503	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1504	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1505
1506	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1507	&mov	($key_,$key);			# backup $key
1508	&mov	($rounds_,$rounds);		# backup $rounds
1509
	# pcmpgtd against zero leaves all-ones in every dword of $twtmp whose
	# counterpart in $tweak has its top bit set
1510	&movdqa	($tweak,$inout0);
1511	&pxor	($twtmp,$twtmp);
1512	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1513	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1514
	# work on whole blocks only; a borrow here means fewer than six
1515	&and	($len,-16);
1516	&sub	($len,16*6);
1517	&jc	(&label("xts_dec_short"));
1518
	# "twisted" form used together with _aesni_decrypt6_enter:
	# $key = key1+32+16*rounds and $rounds_ = 16-16*rounds
1519	&shl	($rounds,4);
1520	&mov	($rounds_,16);
1521	&sub	($rounds_,$rounds);
1522	&lea	($key,&DWP(32,$key,$rounds));
1523	&jmp	(&label("xts_dec_loop6"));
1524
1525&set_label("xts_dec_loop6",16);
	# Each pass stores the current tweak in slot $i and derives the next:
	# paddq shifts both 64-bit halves left by one bit, while $twres (the
	# sign masks of $twtmp rearranged by pshufd 0x13 and ANDed with
	# {0x87,0,1,0}) puts the bit shifted out of the low half into the
	# high half and folds the bit shifted out of the high half back in
	# as 0x87.
1526	for ($i=0;$i<4;$i++) {
1527	    &pshufd	($twres,$twtmp,0x13);
1528	    &pxor	($twtmp,$twtmp);
1529	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1530	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1531	    &pand	($twres,$twmask);	# isolate carry and residue
1532	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1533	    &pxor	($tweak,$twres);
1534	}
	# Fifth tweak goes to slot 4 ($i becomes 5); the sixth is built
	# directly in $inout5.  $twres and $twmask (aliases of $inout0 and
	# $inout1) are overwritten by input blocks right after their last use.
1535	&pshufd	($inout5,$twtmp,0x13);
1536	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1537	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1538	 &$movekey	($rndkey0,&QWP(0,$key_));
1539	&pand	($inout5,$twmask);		# isolate carry and residue
1540	 &movups	($inout0,&QWP(0,$inp));	# load input
1541	&pxor	($inout5,$tweak);
1542
1543	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1544	&mov	($rounds,$rounds_);
1545	&movdqu	($inout1,&QWP(16*1,$inp));
1546	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1547	&movdqu	($inout2,&QWP(16*2,$inp));
1548	 &pxor		($inout1,$rndkey0);
1549	&movdqu	($inout3,&QWP(16*3,$inp));
1550	 &pxor		($inout2,$rndkey0);
1551	&movdqu	($inout4,&QWP(16*4,$inp));
1552	 &pxor		($inout3,$rndkey0);
	# sixth input block travels in $rndkey1, i.e. the $tweak register
1553	&movdqu	($rndkey1,&QWP(16*5,$inp));
1554	 &pxor		($inout4,$rndkey0);
1555	&lea	($inp,&DWP(16*6,$inp));
1556	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1557	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1558	&pxor	($inout5,$rndkey1);
1559
1560	 &$movekey	($rndkey1,&QWP(16,$key_));
1561	&pxor	($inout1,&QWP(16*1,"esp"));
1562	&pxor	($inout2,&QWP(16*2,"esp"));
1563	 &aesdec	($inout0,$rndkey1);
1564	&pxor	($inout3,&QWP(16*3,"esp"));
1565	&pxor	($inout4,&QWP(16*4,"esp"));
1566	 &aesdec	($inout1,$rndkey1);
1567	&pxor		($inout5,$rndkey0);
1568	 &$movekey	($rndkey0,&QWP(32,$key_));
1569	 &aesdec	($inout2,$rndkey1);
1570	 &aesdec	($inout3,$rndkey1);
1571	 &aesdec	($inout4,$rndkey1);
1572	 &aesdec	($inout5,$rndkey1);
	# label defined outside this section; presumably it completes the
	# remaining rounds for $inout0..$inout5 - confirm at its definition
1573	&call		(&label("_aesni_decrypt6_enter"));
1574
	# un-tweak and store the six results while the tweak for the next
	# iteration is rebuilt from slot 5
1575	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1576       &pxor	($twtmp,$twtmp);
1577	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1578       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1579	&xorps	($inout1,&QWP(16*1,"esp"));
1580	&movups	(&QWP(16*0,$out),$inout0);	# write output
1581	&xorps	($inout2,&QWP(16*2,"esp"));
1582	&movups	(&QWP(16*1,$out),$inout1);
1583	&xorps	($inout3,&QWP(16*3,"esp"));
1584	&movups	(&QWP(16*2,$out),$inout2);
1585	&xorps	($inout4,&QWP(16*4,"esp"));
1586	&movups	(&QWP(16*3,$out),$inout3);
1587	&xorps	($inout5,$tweak);
1588	&movups	(&QWP(16*4,$out),$inout4);
1589       &pshufd	($twres,$twtmp,0x13);
1590	&movups	(&QWP(16*5,$out),$inout5);
1591	&lea	($out,&DWP(16*6,$out));
1592       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1593
1594	&pxor	($twtmp,$twtmp);
1595	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1596	&pand	($twres,$twmask);		# isolate carry and residue
1597	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1598	&pxor	($tweak,$twres);
1599
1600	&sub	($len,16*6);
1601	&jnc	(&label("xts_dec_loop6"));
1602
	# leave the twisted form: plain key1 pointer and round count again
1603	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1604	&mov	($key,$key_);			# restore $key
1605	&mov	($rounds_,$rounds);
1606
1607&set_label("xts_dec_short");
	# $len becomes the number of remaining whole-block bytes (0..80).
	# The cmp results below are consumed by je/jb only after the tweak
	# update; the SSE instructions in between leave EFLAGS untouched.
1608	&add	($len,16*6);
1609	&jz	(&label("xts_dec_done6x"));
1610
1611	&movdqa	($inout3,$tweak);		# put aside previous tweak
1612	&cmp	($len,0x20);
1613	&jb	(&label("xts_dec_one"));
1614
1615	&pshufd	($twres,$twtmp,0x13);
1616	&pxor	($twtmp,$twtmp);
1617	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1618	&pand	($twres,$twmask);		# isolate carry and residue
1619	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1620	&pxor	($tweak,$twres);
1621	&je	(&label("xts_dec_two"));
1622
1623	&pshufd	($twres,$twtmp,0x13);
1624	&pxor	($twtmp,$twtmp);
1625	&movdqa	($inout4,$tweak);		# put aside previous tweak
1626	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1627	&pand	($twres,$twmask);		# isolate carry and residue
1628	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1629	&pxor	($tweak,$twres);
1630	&cmp	($len,0x40);
1631	&jb	(&label("xts_dec_three"));
1632
1633	&pshufd	($twres,$twtmp,0x13);
1634	&pxor	($twtmp,$twtmp);
1635	&movdqa	($inout5,$tweak);		# put aside previous tweak
1636	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1637	&pand	($twres,$twmask);		# isolate carry and residue
1638	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1639	&pxor	($tweak,$twres);
1640	&movdqa	(&QWP(16*0,"esp"),$inout3);
1641	&movdqa	(&QWP(16*1,"esp"),$inout4);
1642	&je	(&label("xts_dec_four"));
1643
	# five blocks left: tweaks go to slots 0..4, the fifth one is
	# assembled in $inout5
1644	&movdqa	(&QWP(16*2,"esp"),$inout5);
1645	&pshufd	($inout5,$twtmp,0x13);
1646	&movdqa	(&QWP(16*3,"esp"),$tweak);
1647	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1648	&pand	($inout5,$twmask);		# isolate carry and residue
1649	&pxor	($inout5,$tweak);
1650
1651	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1652	&movdqu	($inout1,&QWP(16*1,$inp));
1653	&movdqu	($inout2,&QWP(16*2,$inp));
1654	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1655	&movdqu	($inout3,&QWP(16*3,$inp));
1656	&pxor	($inout1,&QWP(16*1,"esp"));
1657	&movdqu	($inout4,&QWP(16*4,$inp));
1658	&pxor	($inout2,&QWP(16*2,"esp"));
1659	&lea	($inp,&DWP(16*5,$inp));
1660	&pxor	($inout3,&QWP(16*3,"esp"));
1661	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1662	&pxor	($inout4,$inout5);
1663
	# $inout5 goes in holding only the fifth tweak; whatever comes back
	# in it is never stored
1664	&call	("_aesni_decrypt6");
1665
1666	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1667	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1668	&xorps	($inout1,&QWP(16*1,"esp"));
1669	&xorps	($inout2,&QWP(16*2,"esp"));
1670	&movups	(&QWP(16*0,$out),$inout0);	# write output
1671	&xorps	($inout3,&QWP(16*3,"esp"));
1672	&movups	(&QWP(16*1,$out),$inout1);
1673	&xorps	($inout4,$tweak);
1674	&movups	(&QWP(16*2,$out),$inout2);
1675	&movups	(&QWP(16*3,$out),$inout3);
1676	&movups	(&QWP(16*4,$out),$inout4);
1677	&lea	($out,&DWP(16*5,$out));
1678	&jmp	(&label("xts_dec_done"));
1679
1680&set_label("xts_dec_one",16);
	# one block left, its tweak is in $inout3
1681	&movups	($inout0,&QWP(16*0,$inp));	# load input
1682	&lea	($inp,&DWP(16*1,$inp));
1683	&xorps	($inout0,$inout3);		# input^=tweak
1684	if ($inline)
1685	{   &aesni_inline_generate1("dec");	}
1686	else
1687	{   &call	("_aesni_decrypt1");	}
1688	&xorps	($inout0,$inout3);		# output^=tweak
1689	&movups	(&QWP(16*0,$out),$inout0);	# write output
1690	&lea	($out,&DWP(16*1,$out));
1691
1692	&movdqa	($tweak,$inout3);		# last tweak
1693	&jmp	(&label("xts_dec_done"));
1694
1695&set_label("xts_dec_two",16);
	# two blocks left, tweaks in $inout3 and $tweak
1696	&movaps	($inout4,$tweak);		# put aside last tweak
1697
1698	&movups	($inout0,&QWP(16*0,$inp));	# load input
1699	&movups	($inout1,&QWP(16*1,$inp));
1700	&lea	($inp,&DWP(16*2,$inp));
1701	&xorps	($inout0,$inout3);		# input^=tweak
1702	&xorps	($inout1,$inout4);
1703
1704	&call	("_aesni_decrypt2");
1705
1706	&xorps	($inout0,$inout3);		# output^=tweak
1707	&xorps	($inout1,$inout4);
1708	&movups	(&QWP(16*0,$out),$inout0);	# write output
1709	&movups	(&QWP(16*1,$out),$inout1);
1710	&lea	($out,&DWP(16*2,$out));
1711
1712	&movdqa	($tweak,$inout4);		# last tweak
1713	&jmp	(&label("xts_dec_done"));
1714
1715&set_label("xts_dec_three",16);
	# three blocks left, tweaks in $inout3, $inout4 and $tweak
1716	&movaps	($inout5,$tweak);		# put aside last tweak
1717	&movups	($inout0,&QWP(16*0,$inp));	# load input
1718	&movups	($inout1,&QWP(16*1,$inp));
1719	&movups	($inout2,&QWP(16*2,$inp));
1720	&lea	($inp,&DWP(16*3,$inp));
1721	&xorps	($inout0,$inout3);		# input^=tweak
1722	&xorps	($inout1,$inout4);
1723	&xorps	($inout2,$inout5);
1724
1725	&call	("_aesni_decrypt3");
1726
1727	&xorps	($inout0,$inout3);		# output^=tweak
1728	&xorps	($inout1,$inout4);
1729	&xorps	($inout2,$inout5);
1730	&movups	(&QWP(16*0,$out),$inout0);	# write output
1731	&movups	(&QWP(16*1,$out),$inout1);
1732	&movups	(&QWP(16*2,$out),$inout2);
1733	&lea	($out,&DWP(16*3,$out));
1734
1735	&movdqa	($tweak,$inout5);		# last tweak
1736	&jmp	(&label("xts_dec_done"));
1737
1738&set_label("xts_dec_four",16);
	# four blocks left: tweaks in slot 0, slot 1, $inout5 and $tweak;
	# $inout3 is needed for data, hence the first two come from the stack
1739	&movaps	($inout4,$tweak);		# put aside last tweak
1740
1741	&movups	($inout0,&QWP(16*0,$inp));	# load input
1742	&movups	($inout1,&QWP(16*1,$inp));
1743	&movups	($inout2,&QWP(16*2,$inp));
1744	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1745	&movups	($inout3,&QWP(16*3,$inp));
1746	&lea	($inp,&DWP(16*4,$inp));
1747	&xorps	($inout1,&QWP(16*1,"esp"));
1748	&xorps	($inout2,$inout5);
1749	&xorps	($inout3,$inout4);
1750
1751	&call	("_aesni_decrypt4");
1752
1753	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1754	&xorps	($inout1,&QWP(16*1,"esp"));
1755	&xorps	($inout2,$inout5);
1756	&movups	(&QWP(16*0,$out),$inout0);	# write output
1757	&xorps	($inout3,$inout4);
1758	&movups	(&QWP(16*1,$out),$inout1);
1759	&movups	(&QWP(16*2,$out),$inout2);
1760	&movups	(&QWP(16*3,$out),$inout3);
1761	&lea	($out,&DWP(16*4,$out));
1762
1763	&movdqa	($tweak,$inout4);		# last tweak
1764	&jmp	(&label("xts_dec_done"));
1765
1766&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
	# reached when no whole blocks remained after the six-block loop (or
	# there were none at all); $tweak is already the next unused tweak
	# and $twtmp/$twmask are still valid from the last tweak update
1767	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1768	&and	($len,15);
1769	&jz	(&label("xts_dec_ret"));
1770	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1771	&jmp	(&label("xts_dec_only_one_more"));
1772
1773&set_label("xts_dec_done",16);
	# reached from the 1..5 block paths with the last used tweak in
	# $tweak; if a partial block follows, advance $tweak by one step and
	# reload $twmask, which the data paths above may have overwritten
1774	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1775	&pxor	($twtmp,$twtmp);
1776	&and	($len,15);
1777	&jz	(&label("xts_dec_ret"));
1778
1779	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1780	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1781	&pshufd	($twres,$twtmp,0x13);
1782	&pxor	($twtmp,$twtmp);
1783	&movdqa	($twmask,&QWP(16*6,"esp"));
1784	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1785	&pand	($twres,$twmask);		# isolate carry and residue
1786	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1787	&pxor	($tweak,$twres);
1788
1789&set_label("xts_dec_only_one_more");
	# $inout4 keeps the current tweak, $inout3 receives the one after it.
	# The held-back whole block is decrypted with the later tweak
	# ($inout3); the earlier one is used after the byte swap below.
1790	&pshufd	($inout3,$twtmp,0x13);
1791	&movdqa	($inout4,$tweak);		# put aside previous tweak
1792	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1793	&pand	($inout3,$twmask);		# isolate carry and residue
1794	&pxor	($inout3,$tweak);
1795
1796	&mov	($key,$key_);			# restore $key
1797	&mov	($rounds,$rounds_);		# restore $rounds
1798
1799	&movups	($inout0,&QWP(0,$inp));		# load input
1800	&xorps	($inout0,$inout3);		# input^=tweak
1801	if ($inline)
1802	{   &aesni_inline_generate1("dec");	}
1803	else
1804	{   &call	("_aesni_decrypt1");	}
1805	&xorps	($inout0,$inout3);		# output^=tweak
1806	&movups	(&QWP(0,$out),$inout0);		# write output
1807
1808&set_label("xts_dec_steal");
	# $inp still points at the block just decrypted, so the partial
	# input starts at 16 from $inp.  For each of the len%16 bytes: the
	# input byte replaces a byte of the block just written at $out and
	# the byte it displaces is written at 16 from $out.  $rounds and
	# $key serve as byte scratch here and are reloaded afterwards.
1809	&movz	($rounds,&BP(16,$inp));
1810	&movz	($key,&BP(0,$out));
1811	&lea	($inp,&DWP(1,$inp));
1812	&mov	(&BP(0,$out),&LB($rounds));
1813	&mov	(&BP(16,$out),&LB($key));
1814	&lea	($out,&DWP(1,$out));
1815	&sub	($len,1);
1816	&jnz	(&label("xts_dec_steal"));
1817
1818	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1819	&mov	($key,$key_);			# restore $key
1820	&mov	($rounds,$rounds_);		# restore $rounds
1821
	# decrypt the patched block in place with the earlier tweak ($inout4)
1822	&movups	($inout0,&QWP(0,$out));		# load input
1823	&xorps	($inout0,$inout4);		# input^=tweak
1824	if ($inline)
1825	{   &aesni_inline_generate1("dec");	}
1826	else
1827	{   &call	("_aesni_decrypt1");	}
1828	&xorps	($inout0,$inout4);		# output^=tweak
1829	&movups	(&QWP(0,$out),$inout0);		# write output
1830
1831&set_label("xts_dec_ret");
	# wipe xmm0-xmm7 and tweak slots 0..5; the slots at 16*6 and 16*7
	# (mask constant, length, saved esp) are not cleared
1832	&pxor	("xmm0","xmm0");		# clear register bank
1833	&pxor	("xmm1","xmm1");
1834	&pxor	("xmm2","xmm2");
1835	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1836	&pxor	("xmm3","xmm3");
1837	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1838	&pxor	("xmm4","xmm4");
1839	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1840	&pxor	("xmm5","xmm5");
1841	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1842	&pxor	("xmm6","xmm6");
1843	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1844	&pxor	("xmm7","xmm7");
1845	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1846	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1847&function_end("aesni_xts_decrypt");
1848}
1849
1850######################################################################
1851# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1852#	const AES_KEY *key, unsigned int start_block_num,
1853#	unsigned char offset_i[16], const unsigned char L_[][16],
1854#	unsigned char checksum[16]);
1855#
1856{
1857# offsets within stack frame
1858my $checksum = 16*6;
1859my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1860
1861# reassigned registers
1862my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1863# $l_, $block, $inp, $key are permanently allocated in registers;
1864# the remaining non-volatile ones are offloaded to the stack, where
1865# they stay invariant once written.
1866
# aesni_ocb_encrypt: entry, stack frame setup and, when start_block_num is
# even, one leading single-block step.  After this section $block is odd,
# $rndkey0 carries the last offset and $rndkey1 the running checksum.
1867&function_begin("aesni_ocb_encrypt");
1868	&mov	($rounds,&wparam(5));		# &offset_i
1869	&mov	($rounds_,&wparam(7));		# &checksum
1870
1871	&mov	($inp,&wparam(0));
1872	&mov	($out,&wparam(1));
1873	&mov	($len,&wparam(2));
1874	&mov	($key,&wparam(3));
1875	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
1876	&mov	($block,&wparam(4));		# start_block_num
1877	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
1878	&mov	($l_,&wparam(6));		# L_
1879
	# $rounds temporarily carries the caller's %esp; it is written to
	# the $esp_off slot once the aligned frame exists.
1880	&mov	($rounds,"esp");
1881	&sub	("esp",$esp_off+4);		# alloca
1882	&and	("esp",-16);			# align stack
1883
	# $out becomes (out - inp), so output is addressed as ($out,$inp).
	# $len is converted from blocks to bytes and then to a pointer.
1884	&sub	($out,$inp);
1885	&shl	($len,4);
1886	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
1887	&mov	(&DWP($out_off,"esp"),$out);
1888	&mov	(&DWP($end_off,"esp"),$len);
1889	&mov	(&DWP($esp_off,"esp"),$rounds);
1890
1891	&mov	($rounds,&DWP(240,$key));
1892
1893	&test	($block,1);
1894	&jnz	(&label("odd"));
1895
	# Even $block: handle one block on its own.  bsf gives the index of
	# the lowest set bit of $block, shl 4 scales it to 16-byte L_ entries.
	# $i3 shares a register with $len, so $len is reloaded afterwards.
1896	&bsf		($i3,$block);
1897	&add		($block,1);
1898	&shl		($i3,4);
1899	&movdqu		($inout5,&QWP(0,$l_,$i3));
1900	&mov		($i3,$key);			# put aside key
1901
1902	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
1903	&lea		($inp,&DWP(16,$inp));
1904
1905	&pxor		($inout5,$rndkey0);		# ^ last offset_i
1906	&pxor		($rndkey1,$inout0);		# checksum
1907	&pxor		($inout0,$inout5);		# ^ offset_i
1908
	# The checksum is parked in $inout4 across the one-block encryption
	# and moved back to $rndkey1 afterwards.
1909	&movdqa		($inout4,$rndkey1);
1910	if ($inline)
1911	{   &aesni_inline_generate1("enc");	}
1912	else
1913	{   &call	("_aesni_encrypt1");	}
1914
1915	&xorps		($inout0,$inout5);		# ^ offset_i
1916	&movdqa		($rndkey0,$inout5);		# pass last offset_i
1917	&movdqa		($rndkey1,$inout4);		# pass the checksum
1918
1919	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output
1920
1921	&mov		($rounds,&DWP(240,$i3));
1922	&mov		($key,$i3);			# restore key
1923	&mov		($len,&DWP($end_off,"esp"));
1924
# Prepare the key addressing used by the multi-block code: save the
# original key pointer, point $key 32 bytes past key+16*rounds and store
# the "twisted" value 16-16*rounds, so that (-48,$key,$rounds) with that
# value in $rounds addresses the start of the key schedule.
1925&set_label("odd");
1926	&shl		($rounds,4);
1927	&mov		($out,16);
1928	&sub		($out,$rounds);			# twisted rounds
1929	&mov		(&DWP($key_off,"esp"),$key);
1930	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
1931	&mov		(&DWP($rounds_off,"esp"),$out);
1932
	# $len is "end of input - 16*6": fewer than six blocks left -> tail.
1933	&cmp		($inp,$len);
1934	&ja		(&label("short"));
1935	&jmp		(&label("grandloop"));
1936
# Main loop: six blocks per iteration.  $block is odd here, so the blocks
# numbered $block, $block+2 and $block+4 take L_[0], while the other
# three take the L_ entry selected by bsf of $block+1, +3 and +5.  Each
# offset is the previous one XORed with its L_ entry; all six are parked
# in 16*0..16*5(%esp).  $i1/$i3/$i5 share registers with
# $rounds/$len/$out, which is why those three are reloaded from the frame.
1937&set_label("grandloop",32);
1938	&lea		($i1,&DWP(1,$block));
1939	&lea		($i3,&DWP(3,$block));
1940	&lea		($i5,&DWP(5,$block));
1941	&add		($block,6);
1942	&bsf		($i1,$i1);
1943	&bsf		($i3,$i3);
1944	&bsf		($i5,$i5);
1945	&shl		($i1,4);
1946	&shl		($i3,4);
1947	&shl		($i5,4);
1948	&movdqu		($inout0,&QWP(0,$l_));
1949	&movdqu		($inout1,&QWP(0,$l_,$i1));
1950	&mov		($rounds,&DWP($rounds_off,"esp"));
1951	&movdqa		($inout2,$inout0);
1952	&movdqu		($inout3,&QWP(0,$l_,$i3));
1953	&movdqa		($inout4,$inout0);
1954	&movdqu		($inout5,&QWP(0,$l_,$i5));
1955
1956	&pxor		($inout0,$rndkey0);		# ^ last offset_i
1957	&pxor		($inout1,$inout0);
1958	&movdqa		(&QWP(16*0,"esp"),$inout0);
1959	&pxor		($inout2,$inout1);
1960	&movdqa		(&QWP(16*1,"esp"),$inout1);
1961	&pxor		($inout3,$inout2);
1962	&movdqa		(&QWP(16*2,"esp"),$inout2);
1963	&pxor		($inout4,$inout3);
1964	&movdqa		(&QWP(16*3,"esp"),$inout3);
1965	&pxor		($inout5,$inout4);
1966	&movdqa		(&QWP(16*4,"esp"),$inout4);
1967	&movdqa		(&QWP(16*5,"esp"),$inout5);
1968
	# With the twisted $rounds and biased $key set up at "odd",
	# -48/-32/-16($key,$rounds) are the first three 16-byte round keys.
1969	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
1970	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
1971	&movdqu		($inout1,&QWP(16*1,$inp));
1972	&movdqu		($inout2,&QWP(16*2,$inp));
1973	&movdqu		($inout3,&QWP(16*3,$inp));
1974	&movdqu		($inout4,&QWP(16*4,$inp));
1975	&movdqu		($inout5,&QWP(16*5,$inp));
1976	&lea		($inp,&DWP(16*6,$inp));
1977
	# The checksum accumulates the input blocks before any whitening;
	# it is spilled to the frame because $rndkey1 is reused as a round key.
1978	&pxor		($rndkey1,$inout0);		# checksum
1979	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
1980	&pxor		($rndkey1,$inout1);
1981	&pxor		($inout1,$rndkey0);
1982	&pxor		($rndkey1,$inout2);
1983	&pxor		($inout2,$rndkey0);
1984	&pxor		($rndkey1,$inout3);
1985	&pxor		($inout3,$rndkey0);
1986	&pxor		($rndkey1,$inout4);
1987	&pxor		($inout4,$rndkey0);
1988	&pxor		($rndkey1,$inout5);
1989	&pxor		($inout5,$rndkey0);
1990	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
1991
1992	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
1993	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
1994	&pxor		($inout1,&QWP(16*1,"esp"));
1995	&pxor		($inout2,&QWP(16*2,"esp"));
1996	&pxor		($inout3,&QWP(16*3,"esp"));
1997	&pxor		($inout4,&QWP(16*4,"esp"));
1998	&pxor		($inout5,&QWP(16*5,"esp"));
1999
	# First round is issued here; _aesni_encrypt6_enter is defined
	# outside this section and presumably performs the remaining rounds
	# starting from $rndkey0 -- TODO confirm against its definition.
2000	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2001	&aesenc		($inout0,$rndkey1);
2002	&aesenc		($inout1,$rndkey1);
2003	&aesenc		($inout2,$rndkey1);
2004	&aesenc		($inout3,$rndkey1);
2005	&aesenc		($inout4,$rndkey1);
2006	&aesenc		($inout5,$rndkey1);
2007
2008	&mov		($out,&DWP($out_off,"esp"));
2009	&mov		($len,&DWP($end_off,"esp"));
2010	&call		("_aesni_encrypt6_enter");
2011
2012	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
2013	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2014	&pxor		($inout1,&QWP(16*1,"esp"));
2015	&pxor		($inout2,&QWP(16*2,"esp"));
2016	&pxor		($inout3,&QWP(16*3,"esp"));
2017	&pxor		($inout4,&QWP(16*4,"esp"));
2018	&pxor		($inout5,$rndkey0);
2019	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2020
	# $inp was already advanced by 16*6, hence the negative displacements.
2021	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
2022	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
2023	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
2024	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
2025	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
2026	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
2027	&cmp		($inp,$len);			# done yet?
2028	&jbe		(&label("grandloop"));
2029
# Tail dispatch.  $len (end of input - 16*6) is turned back into the
# number of bytes left: zero -> done, below 32 -> one, 32 -> two,
# below 64 -> three, 64 -> four.  Anything larger falls through to the
# five-block code, which mirrors "grandloop" with a zeroed sixth lane.
2030&set_label("short");
2031	&add		($len,16*6);
2032	&sub		($len,$inp);
2033	&jz		(&label("done"));
2034
2035	&cmp		($len,16*2);
2036	&jb		(&label("one"));
2037	&je		(&label("two"));
2038
2039	&cmp		($len,16*4);
2040	&jb		(&label("three"));
2041	&je		(&label("four"));
2042
2043	&lea		($i1,&DWP(1,$block));
2044	&lea		($i3,&DWP(3,$block));
2045	&bsf		($i1,$i1);
2046	&bsf		($i3,$i3);
2047	&shl		($i1,4);
2048	&shl		($i3,4);
2049	&movdqu		($inout0,&QWP(0,$l_));
2050	&movdqu		($inout1,&QWP(0,$l_,$i1));
2051	&mov		($rounds,&DWP($rounds_off,"esp"));
2052	&movdqa		($inout2,$inout0);
2053	&movdqu		($inout3,&QWP(0,$l_,$i3));
2054	&movdqa		($inout4,$inout0);
2055
2056	&pxor		($inout0,$rndkey0);		# ^ last offset_i
2057	&pxor		($inout1,$inout0);
2058	&movdqa		(&QWP(16*0,"esp"),$inout0);
2059	&pxor		($inout2,$inout1);
2060	&movdqa		(&QWP(16*1,"esp"),$inout1);
2061	&pxor		($inout3,$inout2);
2062	&movdqa		(&QWP(16*2,"esp"),$inout2);
2063	&pxor		($inout4,$inout3);
2064	&movdqa		(&QWP(16*3,"esp"),$inout3);
	# No L_ entry was loaded into $inout5 on this path; the result of
	# the next pxor is discarded when $inout5 is zeroed further down.
2065	&pxor		($inout5,$inout4);
2066	&movdqa		(&QWP(16*4,"esp"),$inout4);
2067
2068	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
2069	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2070	&movdqu		($inout1,&QWP(16*1,$inp));
2071	&movdqu		($inout2,&QWP(16*2,$inp));
2072	&movdqu		($inout3,&QWP(16*3,$inp));
2073	&movdqu		($inout4,&QWP(16*4,$inp));
2074	&pxor		($inout5,$inout5);
2075
2076	&pxor		($rndkey1,$inout0);		# checksum
2077	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
2078	&pxor		($rndkey1,$inout1);
2079	&pxor		($inout1,$rndkey0);
2080	&pxor		($rndkey1,$inout2);
2081	&pxor		($inout2,$rndkey0);
2082	&pxor		($rndkey1,$inout3);
2083	&pxor		($inout3,$rndkey0);
2084	&pxor		($rndkey1,$inout4);
2085	&pxor		($inout4,$rndkey0);
2086	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2087
2088	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
2089	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2090	&pxor		($inout1,&QWP(16*1,"esp"));
2091	&pxor		($inout2,&QWP(16*2,"esp"));
2092	&pxor		($inout3,&QWP(16*3,"esp"));
2093	&pxor		($inout4,&QWP(16*4,"esp"));
2094
2095	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2096	&aesenc		($inout0,$rndkey1);
2097	&aesenc		($inout1,$rndkey1);
2098	&aesenc		($inout2,$rndkey1);
2099	&aesenc		($inout3,$rndkey1);
2100	&aesenc		($inout4,$rndkey1);
2101	&aesenc		($inout5,$rndkey1);
2102
2103	&mov		($out,&DWP($out_off,"esp"));
2104	&call		("_aesni_encrypt6_enter");
2105
2106	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
2107	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2108	&pxor		($inout1,&QWP(16*1,"esp"));
2109	&pxor		($inout2,&QWP(16*2,"esp"));
2110	&pxor		($inout3,&QWP(16*3,"esp"));
2111	&pxor		($inout4,$rndkey0);
2112	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2113
	# $inp is not advanced in the tail, so displacements are positive.
2114	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
2115	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
2116	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
2117	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
2118	&movdqu		(&QWP(16*4,$out,$inp),$inout4);
2119
2120	&jmp		(&label("done"));
2121
# Tail: one block left.  The offset is L_[0] ^ last offset; the original
# key pointer and the round count at key+240 are reloaded because the
# multi-block code runs with a biased $key and twisted $rounds.
2122&set_label("one",16);
2123	&movdqu		($inout5,&QWP(0,$l_));
2124	&mov		($key,&DWP($key_off,"esp"));	# restore key
2125
2126	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2127	&mov		($rounds,&DWP(240,$key));
2128
2129	&pxor		($inout5,$rndkey0);		# ^ last offset_i
2130	&pxor		($rndkey1,$inout0);		# checksum
2131	&pxor		($inout0,$inout5);		# ^ offset_i
2132
	# Checksum is parked in $inout4 across the one-block encryption.
2133	&movdqa		($inout4,$rndkey1);
2134	&mov		($out,&DWP($out_off,"esp"));
2135	if ($inline)
2136	{   &aesni_inline_generate1("enc");	}
2137	else
2138	{   &call	("_aesni_encrypt1");	}
2139
2140	&xorps		($inout0,$inout5);		# ^ offset_i
2141	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2142	&movdqa		($rndkey1,$inout4);		# pass the checksum
2143	&movups		(&QWP(0,$out,$inp),$inout0);
2144
2145	&jmp		(&label("done"));
2146
# Tail: two blocks left.  Offsets are L_[0] ^ last offset for the first
# block and that value ^ L_[bsf($block+1)] for the second.  The original
# key pointer and the round count at key+240 are reloaded for
# _aesni_encrypt2, which is defined outside this section.
2147&set_label("two",16);
2148	&lea		($i1,&DWP(1,$block));
2149	&mov		($key,&DWP($key_off,"esp"));	# restore key
2150	&bsf		($i1,$i1);
2151	&shl		($i1,4);
2152	&movdqu		($inout4,&QWP(0,$l_));
2153	&movdqu		($inout5,&QWP(0,$l_,$i1));
2154
2155	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2156	&movdqu		($inout1,&QWP(16*1,$inp));
2157	&mov		($rounds,&DWP(240,$key));
2158
2159	&pxor		($inout4,$rndkey0);		# ^ last offset_i
2160	&pxor		($inout5,$inout4);
2161
2162	&pxor		($rndkey1,$inout0);		# checksum
2163	&pxor		($inout0,$inout4);		# ^ offset_i
2164	&pxor		($rndkey1,$inout1);
2165	&pxor		($inout1,$inout5);
2166
	# Park the checksum in $inout3 across the call.  The statement
	# terminator matters: without it Perl parses this line and the next
	# as a single bitwise-and expression, movdqa(...) & mov(...).
2167	&movdqa		($inout3,$rndkey1);
2168	&mov		($out,&DWP($out_off,"esp"));
2169	&call		("_aesni_encrypt2");
2170
2171	&xorps		($inout0,$inout4);		# ^ offset_i
2172	&xorps		($inout1,$inout5);
2173	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2174	&movdqa		($rndkey1,$inout3);		# pass the checksum
2175	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2176	&movups		(&QWP(16*1,$out,$inp),$inout1);
2177
2178	&jmp		(&label("done"));
2179
# Tail: three blocks left.  Offsets chain as last^L_[0],
# ^L_[bsf($block+1)], ^L_[0] and stay in $inout3..5 across the call;
# the checksum is spilled to the frame slot at $checksum meanwhile.
2180&set_label("three",16);
2181	&lea		($i1,&DWP(1,$block));
2182	&mov		($key,&DWP($key_off,"esp"));	# restore key
2183	&bsf		($i1,$i1);
2184	&shl		($i1,4);
2185	&movdqu		($inout3,&QWP(0,$l_));
2186	&movdqu		($inout4,&QWP(0,$l_,$i1));
2187	&movdqa		($inout5,$inout3);
2188
2189	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2190	&movdqu		($inout1,&QWP(16*1,$inp));
2191	&movdqu		($inout2,&QWP(16*2,$inp));
2192	&mov		($rounds,&DWP(240,$key));
2193
2194	&pxor		($inout3,$rndkey0);		# ^ last offset_i
2195	&pxor		($inout4,$inout3);
2196	&pxor		($inout5,$inout4);
2197
2198	&pxor		($rndkey1,$inout0);		# checksum
2199	&pxor		($inout0,$inout3);		# ^ offset_i
2200	&pxor		($rndkey1,$inout1);
2201	&pxor		($inout1,$inout4);
2202	&pxor		($rndkey1,$inout2);
2203	&pxor		($inout2,$inout5);
2204
2205	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2206	&mov		($out,&DWP($out_off,"esp"));
2207	&call		("_aesni_encrypt3");
2208
2209	&xorps		($inout0,$inout3);		# ^ offset_i
2210	&xorps		($inout1,$inout4);
2211	&xorps		($inout2,$inout5);
2212	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2213	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2214	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2215	&movups		(&QWP(16*1,$out,$inp),$inout1);
2216	&movups		(&QWP(16*2,$out,$inp),$inout2);
2217
2218	&jmp		(&label("done"));
2219
# Tail: four blocks left.  The four chained offsets are built in
# $inout2..5; the first two are spilled to 16*0/16*1(%esp) because
# $inout2/$inout3 are then reused for input blocks, the last two stay in
# $inout4/$inout5.  Falls through to "done".  _aesni_encrypt4 is defined
# outside this section.
2220&set_label("four",16);
2221	&lea		($i1,&DWP(1,$block));
2222	&lea		($i3,&DWP(3,$block));
2223	&bsf		($i1,$i1);
2224	&bsf		($i3,$i3);
2225	&mov		($key,&DWP($key_off,"esp"));	# restore key
2226	&shl		($i1,4);
2227	&shl		($i3,4);
2228	&movdqu		($inout2,&QWP(0,$l_));
2229	&movdqu		($inout3,&QWP(0,$l_,$i1));
2230	&movdqa		($inout4,$inout2);
2231	&movdqu		($inout5,&QWP(0,$l_,$i3));
2232
2233	&pxor		($inout2,$rndkey0);		# ^ last offset_i
2234	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2235	&pxor		($inout3,$inout2);
2236	&movdqu		($inout1,&QWP(16*1,$inp));
2237	&pxor		($inout4,$inout3);
2238	&movdqa		(&QWP(16*0,"esp"),$inout2);
2239	&pxor		($inout5,$inout4);
2240	&movdqa		(&QWP(16*1,"esp"),$inout3);
2241	&movdqu		($inout2,&QWP(16*2,$inp));
2242	&movdqu		($inout3,&QWP(16*3,$inp));
2243	&mov		($rounds,&DWP(240,$key));
2244
2245	&pxor		($rndkey1,$inout0);		# checksum
2246	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2247	&pxor		($rndkey1,$inout1);
2248	&pxor		($inout1,&QWP(16*1,"esp"));
2249	&pxor		($rndkey1,$inout2);
2250	&pxor		($inout2,$inout4);
2251	&pxor		($rndkey1,$inout3);
2252	&pxor		($inout3,$inout5);
2253
	# Spill the checksum across the call.  The statement terminator
	# matters: without it Perl parses this line and the next as a single
	# bitwise-and expression, movdqa(...) & mov(...).
2254	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2255	&mov		($out,&DWP($out_off,"esp"));
2256	&call		("_aesni_encrypt4");
2257
2258	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2259	&xorps		($inout1,&QWP(16*1,"esp"));
2260	&xorps		($inout2,$inout4);
2261	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2262	&xorps		($inout3,$inout5);
2263	&movups		(&QWP(16*1,$out,$inp),$inout1);
2264	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2265	&movups		(&QWP(16*2,$out,$inp),$inout2);
2266	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2267	&movups		(&QWP(16*3,$out,$inp),$inout3);
2268
# Common exit: zero $inout0..5 and the seven 16-byte frame slots
# (six offsets plus the checksum slot), restore the caller's %esp, then
# write the final offset ($rndkey0) and checksum ($rndkey1) back through
# the caller's pointers and clear those two registers as well.
2269&set_label("done");
2270	&mov	($key,&DWP($esp_off,"esp"));
2271	&pxor	($inout0,$inout0);		# clear register bank
2272	&pxor	($inout1,$inout1);
2273	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
2274	&pxor	($inout2,$inout2);
2275	&movdqa	(&QWP(16*1,"esp"),$inout0);
2276	&pxor	($inout3,$inout3);
2277	&movdqa	(&QWP(16*2,"esp"),$inout0);
2278	&pxor	($inout4,$inout4);
2279	&movdqa	(&QWP(16*3,"esp"),$inout0);
2280	&pxor	($inout5,$inout5);
2281	&movdqa	(&QWP(16*4,"esp"),$inout0);
2282	&movdqa	(&QWP(16*5,"esp"),$inout0);
2283	&movdqa	(&QWP(16*6,"esp"),$inout0);
2284
	# %esp is restored before the argument reloads; presumably &wparam
	# addresses arguments relative to the original %esp -- TODO confirm.
2285	&lea	("esp",&DWP(0,$key));
2286	&mov	($rounds,&wparam(5));		# &offset_i
2287	&mov	($rounds_,&wparam(7));		# &checksum
2288	&movdqu	(&QWP(0,$rounds),$rndkey0);
2289	&pxor	($rndkey0,$rndkey0);
2290	&movdqu	(&QWP(0,$rounds_),$rndkey1);
2291	&pxor	($rndkey1,$rndkey1);
2292&function_end("aesni_ocb_encrypt");
2293
# aesni_ocb_decrypt: entry, stack frame setup and, when start_block_num is
# even, one leading single-block step.  Same layout as the encrypt
# routine, except that the checksum is updated with the decrypted output
# rather than with the input.
2294&function_begin("aesni_ocb_decrypt");
2295	&mov	($rounds,&wparam(5));		# &offset_i
2296	&mov	($rounds_,&wparam(7));		# &checksum
2297
2298	&mov	($inp,&wparam(0));
2299	&mov	($out,&wparam(1));
2300	&mov	($len,&wparam(2));
2301	&mov	($key,&wparam(3));
2302	&movdqu	($rndkey0,&QWP(0,$rounds));	# load offset_i
2303	&mov	($block,&wparam(4));		# start_block_num
2304	&movdqu	($rndkey1,&QWP(0,$rounds_));	# load checksum
2305	&mov	($l_,&wparam(6));		# L_
2306
	# $rounds temporarily carries the caller's %esp; it is written to
	# the $esp_off slot once the aligned frame exists.
2307	&mov	($rounds,"esp");
2308	&sub	("esp",$esp_off+4);		# alloca
2309	&and	("esp",-16);			# align stack
2310
	# $out becomes (out - inp), so output is addressed as ($out,$inp).
	# $len is converted from blocks to bytes and then to a pointer.
2311	&sub	($out,$inp);
2312	&shl	($len,4);
2313	&lea	($len,&DWP(-16*6,$inp,$len));	# end of input - 16*6
2314	&mov	(&DWP($out_off,"esp"),$out);
2315	&mov	(&DWP($end_off,"esp"),$len);
2316	&mov	(&DWP($esp_off,"esp"),$rounds);
2317
2318	&mov	($rounds,&DWP(240,$key));
2319
2320	&test	($block,1);
2321	&jnz	(&label("odd"));
2322
	# Even $block: handle one block on its own.  bsf gives the index of
	# the lowest set bit of $block, shl 4 scales it to 16-byte L_ entries.
	# $i3 shares a register with $len, so $len is reloaded afterwards.
2323	&bsf		($i3,$block);
2324	&add		($block,1);
2325	&shl		($i3,4);
2326	&movdqu		($inout5,&QWP(0,$l_,$i3));
2327	&mov		($i3,$key);			# put aside key
2328
2329	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2330	&lea		($inp,&DWP(16,$inp));
2331
2332	&pxor		($inout5,$rndkey0);		# ^ last offset_i
2333	&pxor		($inout0,$inout5);		# ^ offset_i
2334
	# The checksum is parked in $inout4 across the one-block decryption.
2335	&movdqa		($inout4,$rndkey1);
2336	if ($inline)
2337	{   &aesni_inline_generate1("dec");	}
2338	else
2339	{   &call	("_aesni_decrypt1");	}
2340
2341	&xorps		($inout0,$inout5);		# ^ offset_i
2342	&movaps		($rndkey1,$inout4);		# pass the checksum
2343	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2344	&xorps		($rndkey1,$inout0);		# checksum
2345	&movups		(&QWP(-16,$out,$inp),$inout0);	# store output
2346
2347	&mov		($rounds,&DWP(240,$i3));
2348	&mov		($key,$i3);			# restore key
2349	&mov		($len,&DWP($end_off,"esp"));
2350
# Prepare the key addressing used by the multi-block code: save the
# original key pointer, point $key 32 bytes past key+16*rounds and store
# the "twisted" value 16-16*rounds, so that (-48,$key,$rounds) with that
# value in $rounds addresses the start of the key schedule.
2351&set_label("odd");
2352	&shl		($rounds,4);
2353	&mov		($out,16);
2354	&sub		($out,$rounds);			# twisted rounds
2355	&mov		(&DWP($key_off,"esp"),$key);
2356	&lea		($key,&DWP(32,$key,$rounds));	# end of key schedule
2357	&mov		(&DWP($rounds_off,"esp"),$out);
2358
	# $len is "end of input - 16*6": fewer than six blocks left -> tail.
2359	&cmp		($inp,$len);
2360	&ja		(&label("short"));
2361	&jmp		(&label("grandloop"));
2362
# Main loop: six blocks per iteration.  $block is odd here, so the blocks
# numbered $block, $block+2 and $block+4 take L_[0], while the other
# three take the L_ entry selected by bsf of $block+1, +3 and +5.  Each
# offset is the previous one XORed with its L_ entry; all six are parked
# in 16*0..16*5(%esp).  $i1/$i3/$i5 share registers with
# $rounds/$len/$out, which is why those three are reloaded from the frame.
2363&set_label("grandloop",32);
2364	&lea		($i1,&DWP(1,$block));
2365	&lea		($i3,&DWP(3,$block));
2366	&lea		($i5,&DWP(5,$block));
2367	&add		($block,6);
2368	&bsf		($i1,$i1);
2369	&bsf		($i3,$i3);
2370	&bsf		($i5,$i5);
2371	&shl		($i1,4);
2372	&shl		($i3,4);
2373	&shl		($i5,4);
2374	&movdqu		($inout0,&QWP(0,$l_));
2375	&movdqu		($inout1,&QWP(0,$l_,$i1));
2376	&mov		($rounds,&DWP($rounds_off,"esp"));
2377	&movdqa		($inout2,$inout0);
2378	&movdqu		($inout3,&QWP(0,$l_,$i3));
2379	&movdqa		($inout4,$inout0);
2380	&movdqu		($inout5,&QWP(0,$l_,$i5));
2381
2382	&pxor		($inout0,$rndkey0);		# ^ last offset_i
2383	&pxor		($inout1,$inout0);
2384	&movdqa		(&QWP(16*0,"esp"),$inout0);
2385	&pxor		($inout2,$inout1);
2386	&movdqa		(&QWP(16*1,"esp"),$inout1);
2387	&pxor		($inout3,$inout2);
2388	&movdqa		(&QWP(16*2,"esp"),$inout2);
2389	&pxor		($inout4,$inout3);
2390	&movdqa		(&QWP(16*3,"esp"),$inout3);
2391	&pxor		($inout5,$inout4);
2392	&movdqa		(&QWP(16*4,"esp"),$inout4);
2393	&movdqa		(&QWP(16*5,"esp"),$inout5);
2394
	# With the twisted $rounds and biased $key set up at "odd",
	# -48/-32/-16($key,$rounds) are the first three 16-byte round keys.
2395	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
2396	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2397	&movdqu		($inout1,&QWP(16*1,$inp));
2398	&movdqu		($inout2,&QWP(16*2,$inp));
2399	&movdqu		($inout3,&QWP(16*3,$inp));
2400	&movdqu		($inout4,&QWP(16*4,$inp));
2401	&movdqu		($inout5,&QWP(16*5,$inp));
2402	&lea		($inp,&DWP(16*6,$inp));
2403
	# The checksum is spilled unchanged because $rndkey1 is reused as a
	# round key; it is updated with the outputs after the call.
2404	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2405	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
2406	&pxor		($inout1,$rndkey0);
2407	&pxor		($inout2,$rndkey0);
2408	&pxor		($inout3,$rndkey0);
2409	&pxor		($inout4,$rndkey0);
2410	&pxor		($inout5,$rndkey0);
2411
2412	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
2413	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2414	&pxor		($inout1,&QWP(16*1,"esp"));
2415	&pxor		($inout2,&QWP(16*2,"esp"));
2416	&pxor		($inout3,&QWP(16*3,"esp"));
2417	&pxor		($inout4,&QWP(16*4,"esp"));
2418	&pxor		($inout5,&QWP(16*5,"esp"));
2419
	# First round is issued here; _aesni_decrypt6_enter is defined
	# outside this section and presumably performs the remaining rounds
	# starting from $rndkey0 -- TODO confirm against its definition.
2420	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2421	&aesdec		($inout0,$rndkey1);
2422	&aesdec		($inout1,$rndkey1);
2423	&aesdec		($inout2,$rndkey1);
2424	&aesdec		($inout3,$rndkey1);
2425	&aesdec		($inout4,$rndkey1);
2426	&aesdec		($inout5,$rndkey1);
2427
2428	&mov		($out,&DWP($out_off,"esp"));
2429	&mov		($len,&DWP($end_off,"esp"));
2430	&call		("_aesni_decrypt6_enter");
2431
2432	&movdqa		($rndkey0,&QWP(16*5,"esp"));	# pass last offset_i
2433	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2434	&movdqa		($rndkey1,&QWP($checksum,"esp"));
2435	&pxor		($inout1,&QWP(16*1,"esp"));
2436	&pxor		($inout2,&QWP(16*2,"esp"));
2437	&pxor		($inout3,&QWP(16*3,"esp"));
2438	&pxor		($inout4,&QWP(16*4,"esp"));
2439	&pxor		($inout5,$rndkey0);
2440
	# $inp was already advanced by 16*6, hence the negative displacements.
2441	&pxor		($rndkey1,$inout0);		# checksum
2442	&movdqu		(&QWP(-16*6,$out,$inp),$inout0);# store output
2443	&pxor		($rndkey1,$inout1);
2444	&movdqu		(&QWP(-16*5,$out,$inp),$inout1);
2445	&pxor		($rndkey1,$inout2);
2446	&movdqu		(&QWP(-16*4,$out,$inp),$inout2);
2447	&pxor		($rndkey1,$inout3);
2448	&movdqu		(&QWP(-16*3,$out,$inp),$inout3);
2449	&pxor		($rndkey1,$inout4);
2450	&movdqu		(&QWP(-16*2,$out,$inp),$inout4);
2451	&pxor		($rndkey1,$inout5);
2452	&movdqu		(&QWP(-16*1,$out,$inp),$inout5);
2453	&cmp		($inp,$len);			# done yet?
2454	&jbe		(&label("grandloop"));
2455
# Tail dispatch.  $len (end of input - 16*6) is turned back into the
# number of bytes left: zero -> done, below 32 -> one, 32 -> two,
# below 64 -> three, 64 -> four.  Anything larger falls through to the
# five-block code, which mirrors "grandloop" with a zeroed sixth lane.
2456&set_label("short");
2457	&add		($len,16*6);
2458	&sub		($len,$inp);
2459	&jz		(&label("done"));
2460
2461	&cmp		($len,16*2);
2462	&jb		(&label("one"));
2463	&je		(&label("two"));
2464
2465	&cmp		($len,16*4);
2466	&jb		(&label("three"));
2467	&je		(&label("four"));
2468
2469	&lea		($i1,&DWP(1,$block));
2470	&lea		($i3,&DWP(3,$block));
2471	&bsf		($i1,$i1);
2472	&bsf		($i3,$i3);
2473	&shl		($i1,4);
2474	&shl		($i3,4);
2475	&movdqu		($inout0,&QWP(0,$l_));
2476	&movdqu		($inout1,&QWP(0,$l_,$i1));
2477	&mov		($rounds,&DWP($rounds_off,"esp"));
2478	&movdqa		($inout2,$inout0);
2479	&movdqu		($inout3,&QWP(0,$l_,$i3));
2480	&movdqa		($inout4,$inout0);
2481
2482	&pxor		($inout0,$rndkey0);		# ^ last offset_i
2483	&pxor		($inout1,$inout0);
2484	&movdqa		(&QWP(16*0,"esp"),$inout0);
2485	&pxor		($inout2,$inout1);
2486	&movdqa		(&QWP(16*1,"esp"),$inout1);
2487	&pxor		($inout3,$inout2);
2488	&movdqa		(&QWP(16*2,"esp"),$inout2);
2489	&pxor		($inout4,$inout3);
2490	&movdqa		(&QWP(16*3,"esp"),$inout3);
	# No L_ entry was loaded into $inout5 on this path; the result of
	# the next pxor is discarded when $inout5 is zeroed further down.
2491	&pxor		($inout5,$inout4);
2492	&movdqa		(&QWP(16*4,"esp"),$inout4);
2493
2494	&$movekey	($rndkey0,&QWP(-48,$key,$rounds));
2495	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2496	&movdqu		($inout1,&QWP(16*1,$inp));
2497	&movdqu		($inout2,&QWP(16*2,$inp));
2498	&movdqu		($inout3,&QWP(16*3,$inp));
2499	&movdqu		($inout4,&QWP(16*4,$inp));
2500	&pxor		($inout5,$inout5);
2501
2502	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2503	&pxor		($inout0,$rndkey0);		# ^ roundkey[0]
2504	&pxor		($inout1,$rndkey0);
2505	&pxor		($inout2,$rndkey0);
2506	&pxor		($inout3,$rndkey0);
2507	&pxor		($inout4,$rndkey0);
2508
2509	&$movekey	($rndkey1,&QWP(-32,$key,$rounds));
2510	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2511	&pxor		($inout1,&QWP(16*1,"esp"));
2512	&pxor		($inout2,&QWP(16*2,"esp"));
2513	&pxor		($inout3,&QWP(16*3,"esp"));
2514	&pxor		($inout4,&QWP(16*4,"esp"));
2515
2516	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
2517	&aesdec		($inout0,$rndkey1);
2518	&aesdec		($inout1,$rndkey1);
2519	&aesdec		($inout2,$rndkey1);
2520	&aesdec		($inout3,$rndkey1);
2521	&aesdec		($inout4,$rndkey1);
2522	&aesdec		($inout5,$rndkey1);
2523
2524	&mov		($out,&DWP($out_off,"esp"));
2525	&call		("_aesni_decrypt6_enter");
2526
2527	&movdqa		($rndkey0,&QWP(16*4,"esp"));	# pass last offset_i
2528	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2529	&movdqa		($rndkey1,&QWP($checksum,"esp"));
2530	&pxor		($inout1,&QWP(16*1,"esp"));
2531	&pxor		($inout2,&QWP(16*2,"esp"));
2532	&pxor		($inout3,&QWP(16*3,"esp"));
2533	&pxor		($inout4,$rndkey0);
2534
	# $inp is not advanced in the tail, so displacements are positive.
2535	&pxor		($rndkey1,$inout0);		# checksum
2536	&movdqu		(&QWP(16*0,$out,$inp),$inout0);	# store output
2537	&pxor		($rndkey1,$inout1);
2538	&movdqu		(&QWP(16*1,$out,$inp),$inout1);
2539	&pxor		($rndkey1,$inout2);
2540	&movdqu		(&QWP(16*2,$out,$inp),$inout2);
2541	&pxor		($rndkey1,$inout3);
2542	&movdqu		(&QWP(16*3,$out,$inp),$inout3);
2543	&pxor		($rndkey1,$inout4);
2544	&movdqu		(&QWP(16*4,$out,$inp),$inout4);
2545
2546	&jmp		(&label("done"));
2547
# Tail: one block left.  The offset is L_[0] ^ last offset; the original
# key pointer and the round count at key+240 are reloaded because the
# multi-block code runs with a biased $key and twisted $rounds.
2548&set_label("one",16);
2549	&movdqu		($inout5,&QWP(0,$l_));
2550	&mov		($key,&DWP($key_off,"esp"));	# restore key
2551
2552	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2553	&mov		($rounds,&DWP(240,$key));
2554
2555	&pxor		($inout5,$rndkey0);		# ^ last offset_i
2556	&pxor		($inout0,$inout5);		# ^ offset_i
2557
	# Checksum is parked in $inout4 across the one-block decryption and
	# updated with the decrypted block afterwards.
2558	&movdqa		($inout4,$rndkey1);
2559	&mov		($out,&DWP($out_off,"esp"));
2560	if ($inline)
2561	{   &aesni_inline_generate1("dec");	}
2562	else
2563	{   &call	("_aesni_decrypt1");	}
2564
2565	&xorps		($inout0,$inout5);		# ^ offset_i
2566	&movaps		($rndkey1,$inout4);		# pass the checksum
2567	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2568	&xorps		($rndkey1,$inout0);		# checksum
2569	&movups		(&QWP(0,$out,$inp),$inout0);
2570
2571	&jmp		(&label("done"));
2572
# Tail: two blocks left.  Offsets are L_[0] ^ last offset for the first
# block and that value ^ L_[bsf($block+1)] for the second.  The checksum
# rides in $inout3 across the call and is then updated with both
# decrypted blocks before being handed back in $rndkey1.
2573&set_label("two",16);
2574	&lea		($i1,&DWP(1,$block));
2575	&mov		($key,&DWP($key_off,"esp"));	# restore key
2576	&bsf		($i1,$i1);
2577	&shl		($i1,4);
2578	&movdqu		($inout4,&QWP(0,$l_));
2579	&movdqu		($inout5,&QWP(0,$l_,$i1));
2580
2581	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2582	&movdqu		($inout1,&QWP(16*1,$inp));
2583	&mov		($rounds,&DWP(240,$key));
2584
2585	&movdqa		($inout3,$rndkey1);
2586	&pxor		($inout4,$rndkey0);		# ^ last offset_i
2587	&pxor		($inout5,$inout4);
2588
2589	&pxor		($inout0,$inout4);		# ^ offset_i
2590	&pxor		($inout1,$inout5);
2591
2592	&mov		($out,&DWP($out_off,"esp"));
2593	&call		("_aesni_decrypt2");
2594
2595	&xorps		($inout0,$inout4);		# ^ offset_i
2596	&xorps		($inout1,$inout5);
2597	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2598	&xorps		($inout3,$inout0);		# checksum
2599	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2600	&xorps		($inout3,$inout1);
2601	&movups		(&QWP(16*1,$out,$inp),$inout1);
2602	&movaps		($rndkey1,$inout3);		# pass the checksum
2603
2604	&jmp		(&label("done"));
2605
# Tail: three blocks left.  Offsets chain as last^L_[0],
# ^L_[bsf($block+1)], ^L_[0] and stay in $inout3..5 across the call;
# the checksum is spilled to the frame slot at $checksum and updated
# with the three decrypted blocks afterwards.
2606&set_label("three",16);
2607	&lea		($i1,&DWP(1,$block));
2608	&mov		($key,&DWP($key_off,"esp"));	# restore key
2609	&bsf		($i1,$i1);
2610	&shl		($i1,4);
2611	&movdqu		($inout3,&QWP(0,$l_));
2612	&movdqu		($inout4,&QWP(0,$l_,$i1));
2613	&movdqa		($inout5,$inout3);
2614
2615	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2616	&movdqu		($inout1,&QWP(16*1,$inp));
2617	&movdqu		($inout2,&QWP(16*2,$inp));
2618	&mov		($rounds,&DWP(240,$key));
2619
2620	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2621	&pxor		($inout3,$rndkey0);		# ^ last offset_i
2622	&pxor		($inout4,$inout3);
2623	&pxor		($inout5,$inout4);
2624
2625	&pxor		($inout0,$inout3);		# ^ offset_i
2626	&pxor		($inout1,$inout4);
2627	&pxor		($inout2,$inout5);
2628
2629	&mov		($out,&DWP($out_off,"esp"));
2630	&call		("_aesni_decrypt3");
2631
2632	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2633	&xorps		($inout0,$inout3);		# ^ offset_i
2634	&xorps		($inout1,$inout4);
2635	&xorps		($inout2,$inout5);
2636	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2637	&pxor		($rndkey1,$inout0);		# checksum
2638	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2639	&movups		(&QWP(16*1,$out,$inp),$inout1);
2640	&pxor		($rndkey1,$inout1);
2641	&movups		(&QWP(16*2,$out,$inp),$inout2);
2642	&pxor		($rndkey1,$inout2);
2643
2644	&jmp		(&label("done"));
2645
# Tail: four blocks left.  The four chained offsets are built in
# $inout2..5; the first two are spilled to 16*0/16*1(%esp) because
# $inout2/$inout3 are then reused for input blocks, the last two stay in
# $inout4/$inout5.  The checksum is spilled across the call and updated
# with the four decrypted blocks afterwards.  Falls through to "done".
2646&set_label("four",16);
2647	&lea		($i1,&DWP(1,$block));
2648	&lea		($i3,&DWP(3,$block));
2649	&bsf		($i1,$i1);
2650	&bsf		($i3,$i3);
2651	&mov		($key,&DWP($key_off,"esp"));	# restore key
2652	&shl		($i1,4);
2653	&shl		($i3,4);
2654	&movdqu		($inout2,&QWP(0,$l_));
2655	&movdqu		($inout3,&QWP(0,$l_,$i1));
2656	&movdqa		($inout4,$inout2);
2657	&movdqu		($inout5,&QWP(0,$l_,$i3));
2658
2659	&pxor		($inout2,$rndkey0);		# ^ last offset_i
2660	&movdqu		($inout0,&QWP(16*0,$inp));	# load input
2661	&pxor		($inout3,$inout2);
2662	&movdqu		($inout1,&QWP(16*1,$inp));
2663	&pxor		($inout4,$inout3);
2664	&movdqa		(&QWP(16*0,"esp"),$inout2);
2665	&pxor		($inout5,$inout4);
2666	&movdqa		(&QWP(16*1,"esp"),$inout3);
2667	&movdqu		($inout2,&QWP(16*2,$inp));
2668	&movdqu		($inout3,&QWP(16*3,$inp));
2669	&mov		($rounds,&DWP(240,$key));
2670
2671	&movdqa		(&QWP($checksum,"esp"),$rndkey1);
2672	&pxor		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2673	&pxor		($inout1,&QWP(16*1,"esp"));
2674	&pxor		($inout2,$inout4);
2675	&pxor		($inout3,$inout5);
2676
2677	&mov		($out,&DWP($out_off,"esp"));
2678	&call		("_aesni_decrypt4");
2679
2680	&movdqa		($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2681	&xorps		($inout0,&QWP(16*0,"esp"));	# ^ offset_i
2682	&xorps		($inout1,&QWP(16*1,"esp"));
2683	&xorps		($inout2,$inout4);
2684	&movups		(&QWP(16*0,$out,$inp),$inout0);	# store output
2685	&pxor		($rndkey1,$inout0);		# checksum
2686	&xorps		($inout3,$inout5);
2687	&movups		(&QWP(16*1,$out,$inp),$inout1);
2688	&pxor		($rndkey1,$inout1);
2689	&movdqa		($rndkey0,$inout5);		# pass last offset_i
2690	&movups		(&QWP(16*2,$out,$inp),$inout2);
2691	&pxor		($rndkey1,$inout2);
2692	&movups		(&QWP(16*3,$out,$inp),$inout3);
2693	&pxor		($rndkey1,$inout3);
2694
# Common exit: zero $inout0..5 and the seven 16-byte frame slots
# (six offsets plus the checksum slot), restore the caller's %esp, then
# write the final offset ($rndkey0) and checksum ($rndkey1) back through
# the caller's pointers and clear those two registers as well.
2695&set_label("done");
2696	&mov	($key,&DWP($esp_off,"esp"));
2697	&pxor	($inout0,$inout0);		# clear register bank
2698	&pxor	($inout1,$inout1);
2699	&movdqa	(&QWP(16*0,"esp"),$inout0);	# clear stack
2700	&pxor	($inout2,$inout2);
2701	&movdqa	(&QWP(16*1,"esp"),$inout0);
2702	&pxor	($inout3,$inout3);
2703	&movdqa	(&QWP(16*2,"esp"),$inout0);
2704	&pxor	($inout4,$inout4);
2705	&movdqa	(&QWP(16*3,"esp"),$inout0);
2706	&pxor	($inout5,$inout5);
2707	&movdqa	(&QWP(16*4,"esp"),$inout0);
2708	&movdqa	(&QWP(16*5,"esp"),$inout0);
2709	&movdqa	(&QWP(16*6,"esp"),$inout0);
2710
	# %esp is restored before the argument reloads; presumably &wparam
	# addresses arguments relative to the original %esp -- TODO confirm.
2711	&lea	("esp",&DWP(0,$key));
2712	&mov	($rounds,&wparam(5));		# &offset_i
2713	&mov	($rounds_,&wparam(7));		# &checksum
2714	&movdqu	(&QWP(0,$rounds),$rndkey0);
2715	&pxor	($rndkey0,$rndkey0);
2716	&movdqu	(&QWP(0,$rounds_),$rndkey1);
2717	&pxor	($rndkey1,$rndkey1);
2718&function_end("aesni_ocb_decrypt");
2719}
2720}
2721
2722######################################################################
2723# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2724#                           size_t length, const AES_KEY *key,
2725#                           unsigned char *ivp,const int enc);
2726&function_begin("${PREFIX}_cbc_encrypt");
2727	&mov	($inp,&wparam(0));
2728	&mov	($rounds_,"esp");
2729	&mov	($out,&wparam(1));
2730	&sub	($rounds_,24);
2731	&mov	($len,&wparam(2));
2732	&and	($rounds_,-16);
2733	&mov	($key,&wparam(3));
2734	&mov	($key_,&wparam(4));
2735	&test	($len,$len);
2736	&jz	(&label("cbc_abort"));
2737
2738	&cmp	(&wparam(5),0);
2739	&xchg	($rounds_,"esp");		# alloca
2740	&movups	($ivec,&QWP(0,$key_));		# load IV
2741	&mov	($rounds,&DWP(240,$key));
2742	&mov	($key_,$key);			# backup $key
2743	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
2744	&mov	($rounds_,$rounds);		# backup $rounds
2745	&je	(&label("cbc_decrypt"));
2746
2747	&movaps	($inout0,$ivec);
2748	&cmp	($len,16);
2749	&jb	(&label("cbc_enc_tail"));
2750	&sub	($len,16);
2751	&jmp	(&label("cbc_enc_loop"));
2752
2753&set_label("cbc_enc_loop",16);
2754	&movups	($ivec,&QWP(0,$inp));		# input actually
2755	&lea	($inp,&DWP(16,$inp));
2756	if ($inline)
2757	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
2758	else
2759	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
2760	&mov	($rounds,$rounds_);	# restore $rounds
2761	&mov	($key,$key_);		# restore $key
2762	&movups	(&QWP(0,$out),$inout0);	# store output
2763	&lea	($out,&DWP(16,$out));
2764	&sub	($len,16);
2765	&jnc	(&label("cbc_enc_loop"));
2766	&add	($len,16);
2767	&jnz	(&label("cbc_enc_tail"));
2768	&movaps	($ivec,$inout0);
2769	&pxor	($inout0,$inout0);
2770	&jmp	(&label("cbc_ret"));
2771
2772&set_label("cbc_enc_tail");
2773	&mov	("ecx",$len);		# zaps $rounds
2774	&data_word(0xA4F3F689);		# rep movsb
2775	&mov	("ecx",16);		# zero tail
2776	&sub	("ecx",$len);
2777	&xor	("eax","eax");		# zaps $len
2778	&data_word(0xAAF3F689);		# rep stosb
2779	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
2780	&mov	($rounds,$rounds_);	# restore $rounds
2781	&mov	($inp,$out);		# $inp and $out are the same
2782	&mov	($key,$key_);		# restore $key
2783	&jmp	(&label("cbc_enc_loop"));
2784######################################################################
2785&set_label("cbc_decrypt",16);
2786	&cmp	($len,0x50);
2787	&jbe	(&label("cbc_dec_tail"));
2788	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
2789	&sub	($len,0x50);
2790	&jmp	(&label("cbc_dec_loop6_enter"));
2791
2792&set_label("cbc_dec_loop6",16);
2793	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
2794	&movups	(&QWP(0,$out),$inout5);
2795	&lea	($out,&DWP(0x10,$out));
2796&set_label("cbc_dec_loop6_enter");
2797	&movdqu	($inout0,&QWP(0,$inp));
2798	&movdqu	($inout1,&QWP(0x10,$inp));
2799	&movdqu	($inout2,&QWP(0x20,$inp));
2800	&movdqu	($inout3,&QWP(0x30,$inp));
2801	&movdqu	($inout4,&QWP(0x40,$inp));
2802	&movdqu	($inout5,&QWP(0x50,$inp));
2803
2804	&call	("_aesni_decrypt6");
2805
2806	&movups	($rndkey1,&QWP(0,$inp));
2807	&movups	($rndkey0,&QWP(0x10,$inp));
2808	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
2809	&xorps	($inout1,$rndkey1);
2810	&movups	($rndkey1,&QWP(0x20,$inp));
2811	&xorps	($inout2,$rndkey0);
2812	&movups	($rndkey0,&QWP(0x30,$inp));
2813	&xorps	($inout3,$rndkey1);
2814	&movups	($rndkey1,&QWP(0x40,$inp));
2815	&xorps	($inout4,$rndkey0);
2816	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
2817	&xorps	($inout5,$rndkey1);
2818	&movups	(&QWP(0,$out),$inout0);
2819	&movups	(&QWP(0x10,$out),$inout1);
2820	&lea	($inp,&DWP(0x60,$inp));
2821	&movups	(&QWP(0x20,$out),$inout2);
2822	&mov	($rounds,$rounds_);		# restore $rounds
2823	&movups	(&QWP(0x30,$out),$inout3);
2824	&mov	($key,$key_);			# restore $key
2825	&movups	(&QWP(0x40,$out),$inout4);
2826	&lea	($out,&DWP(0x50,$out));
2827	&sub	($len,0x60);
2828	&ja	(&label("cbc_dec_loop6"));
2829
2830	&movaps	($inout0,$inout5);
2831	&movaps	($ivec,$rndkey0);
2832	&add	($len,0x50);
2833	&jle	(&label("cbc_dec_clear_tail_collected"));
2834	&movups	(&QWP(0,$out),$inout0);
2835	&lea	($out,&DWP(0x10,$out));
2836&set_label("cbc_dec_tail");
2837	&movups	($inout0,&QWP(0,$inp));
2838	&movaps	($in0,$inout0);
2839	&cmp	($len,0x10);
2840	&jbe	(&label("cbc_dec_one"));
2841
2842	&movups	($inout1,&QWP(0x10,$inp));
2843	&movaps	($in1,$inout1);
2844	&cmp	($len,0x20);
2845	&jbe	(&label("cbc_dec_two"));
2846
2847	&movups	($inout2,&QWP(0x20,$inp));
2848	&cmp	($len,0x30);
2849	&jbe	(&label("cbc_dec_three"));
2850
2851	&movups	($inout3,&QWP(0x30,$inp));
2852	&cmp	($len,0x40);
2853	&jbe	(&label("cbc_dec_four"));
2854
2855	&movups	($inout4,&QWP(0x40,$inp));
2856	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
2857	&movups	($inout0,&QWP(0,$inp));
2858	&xorps	($inout5,$inout5);
2859	&call	("_aesni_decrypt6");
2860	&movups	($rndkey1,&QWP(0,$inp));
2861	&movups	($rndkey0,&QWP(0x10,$inp));
2862	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
2863	&xorps	($inout1,$rndkey1);
2864	&movups	($rndkey1,&QWP(0x20,$inp));
2865	&xorps	($inout2,$rndkey0);
2866	&movups	($rndkey0,&QWP(0x30,$inp));
2867	&xorps	($inout3,$rndkey1);
2868	&movups	($ivec,&QWP(0x40,$inp));	# IV
2869	&xorps	($inout4,$rndkey0);
2870	&movups	(&QWP(0,$out),$inout0);
2871	&movups	(&QWP(0x10,$out),$inout1);
2872	&pxor	($inout1,$inout1);
2873	&movups	(&QWP(0x20,$out),$inout2);
2874	&pxor	($inout2,$inout2);
2875	&movups	(&QWP(0x30,$out),$inout3);
2876	&pxor	($inout3,$inout3);
2877	&lea	($out,&DWP(0x40,$out));
2878	&movaps	($inout0,$inout4);
2879	&pxor	($inout4,$inout4);
2880	&sub	($len,0x50);
2881	&jmp	(&label("cbc_dec_tail_collected"));
2882
2883&set_label("cbc_dec_one",16);
2884	if ($inline)
2885	{   &aesni_inline_generate1("dec");	}
2886	else
2887	{   &call	("_aesni_decrypt1");	}
2888	&xorps	($inout0,$ivec);
2889	&movaps	($ivec,$in0);
2890	&sub	($len,0x10);
2891	&jmp	(&label("cbc_dec_tail_collected"));
2892
2893&set_label("cbc_dec_two",16);
2894	&call	("_aesni_decrypt2");
2895	&xorps	($inout0,$ivec);
2896	&xorps	($inout1,$in0);
2897	&movups	(&QWP(0,$out),$inout0);
2898	&movaps	($inout0,$inout1);
2899	&pxor	($inout1,$inout1);
2900	&lea	($out,&DWP(0x10,$out));
2901	&movaps	($ivec,$in1);
2902	&sub	($len,0x20);
2903	&jmp	(&label("cbc_dec_tail_collected"));
2904
2905&set_label("cbc_dec_three",16);
2906	&call	("_aesni_decrypt3");
2907	&xorps	($inout0,$ivec);
2908	&xorps	($inout1,$in0);
2909	&xorps	($inout2,$in1);
2910	&movups	(&QWP(0,$out),$inout0);
2911	&movaps	($inout0,$inout2);
2912	&pxor	($inout2,$inout2);
2913	&movups	(&QWP(0x10,$out),$inout1);
2914	&pxor	($inout1,$inout1);
2915	&lea	($out,&DWP(0x20,$out));
2916	&movups	($ivec,&QWP(0x20,$inp));
2917	&sub	($len,0x30);
2918	&jmp	(&label("cbc_dec_tail_collected"));
2919
2920&set_label("cbc_dec_four",16);
2921	&call	("_aesni_decrypt4");
2922	&movups	($rndkey1,&QWP(0x10,$inp));
2923	&movups	($rndkey0,&QWP(0x20,$inp));
2924	&xorps	($inout0,$ivec);
2925	&movups	($ivec,&QWP(0x30,$inp));
2926	&xorps	($inout1,$in0);
2927	&movups	(&QWP(0,$out),$inout0);
2928	&xorps	($inout2,$rndkey1);
2929	&movups	(&QWP(0x10,$out),$inout1);
2930	&pxor	($inout1,$inout1);
2931	&xorps	($inout3,$rndkey0);
2932	&movups	(&QWP(0x20,$out),$inout2);
2933	&pxor	($inout2,$inout2);
2934	&lea	($out,&DWP(0x30,$out));
2935	&movaps	($inout0,$inout3);
2936	&pxor	($inout3,$inout3);
2937	&sub	($len,0x40);
2938	&jmp	(&label("cbc_dec_tail_collected"));
2939
2940&set_label("cbc_dec_clear_tail_collected",16);
2941	&pxor	($inout1,$inout1);
2942	&pxor	($inout2,$inout2);
2943	&pxor	($inout3,$inout3);
2944	&pxor	($inout4,$inout4);
2945&set_label("cbc_dec_tail_collected");
2946	&and	($len,15);
2947	&jnz	(&label("cbc_dec_tail_partial"));
2948	&movups	(&QWP(0,$out),$inout0);
2949	&pxor	($rndkey0,$rndkey0);
2950	&jmp	(&label("cbc_ret"));
2951
2952&set_label("cbc_dec_tail_partial",16);
2953	&movaps	(&QWP(0,"esp"),$inout0);
2954	&pxor	($rndkey0,$rndkey0);
2955	&mov	("ecx",16);
2956	&mov	($inp,"esp");
2957	&sub	("ecx",$len);
2958	&data_word(0xA4F3F689);		# rep movsb
2959	&movdqa	(&QWP(0,"esp"),$inout0);
2960
2961&set_label("cbc_ret");
2962	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
2963	&mov	($key_,&wparam(4));
2964	&pxor	($inout0,$inout0);
2965	&pxor	($rndkey1,$rndkey1);
2966	&movups	(&QWP(0,$key_),$ivec);	# output IV
2967	&pxor	($ivec,$ivec);
2968&set_label("cbc_abort");
2969&function_end("${PREFIX}_cbc_encrypt");
2970
2971######################################################################
2972# Mechanical port from aesni-x86_64.pl.
2973#
2974# _aesni_set_encrypt_key is private interface,
2975# input:
2976#	"eax"	const unsigned char *userKey
2977#	$rounds	int bits
2978#	$key	AES_KEY *key
2979# output:
2980#	"eax"	return code
2981#	$rounds	rounds-1 on success (9, 11 or 13, the value stored in the schedule)
2982
# Private key-schedule generator, entered with "eax" = userKey, $rounds =
# key length in bits and $key = AES_KEY to fill.  This prologue saves
# ebp/ebx (every exit path pops them again), rejects NULL pointers, makes
# ebx address the key_const table, samples the CPU capability word and
# dispatches on the key length.
2983&function_begin_B("_aesni_set_encrypt_key");
2984	&push	("ebp");
2985	&push	("ebx");
2986	&test	("eax","eax");		# NULL userKey?
2987	&jz	(&label("bad_pointer"));
2988	&test	($key,$key);		# NULL key schedule?
2989	&jz	(&label("bad_pointer"));
2990
# Position-independent addressing: the call pushes the address of "pic",
# blindpop presumably pops it into ebx (helper defined outside this
# section), and the lea rebases ebx from "pic" to key_const.
2991	&call	(&label("pic"));
2992&set_label("pic");
2993	&blindpop("ebx");
2994	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2995
# NOTE(review): picmeup is expected to leave the address of
# OPENSSL_ia32cap_P in ebp -- confirm against the perlasm helper.
2996	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2997	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
2998	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
2999	&mov	("ebp",&DWP(4,"ebp"));	# dword at offset 4 of the capability vector
3000	&lea	($key,&DWP(16,$key));	# round 0 is stored at a negative offset later
3001	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
# Dispatch on the key length: 256 and 192 branch away, 128 falls through
# into "10rounds", anything else is rejected via "bad_keybits".
3002	&cmp	($rounds,256);
3003	&je	(&label("14rounds"));
3004	&cmp	($rounds,192);
3005	&je	(&label("12rounds"));
3006	&cmp	($rounds,128);
3007	&jne	(&label("bad_keybits"));
3008
# AES-128 schedule built with aeskeygenassist.  Skipped in favour of
# "10rounds_alt" when the masked capability word equals 1<<28 exactly,
# i.e. bit 28 set and bit 11 clear (AVX without XOP, going by the mask
# comment at the dispatch site).
# Round 0 is the user key itself.  Each aeskeygenassist/call pair derives
# the next round key in xmm0; the immediate is the round constant
# (0x01 ... 0x80, 0x1b, 0x36).  The first call enters at key_128_cold
# because round 0 has already been stored.
# $key was advanced by 16 before this point and key_128 advances it by 16
# on each of its nine calls, so the final store at 80($key) lands 240
# bytes past the start of the schedule.
3009&set_label("10rounds",16);
3010	&cmp		("ebp",1<<28);
3011	&je		(&label("10rounds_alt"));
3012
3013	&mov		($rounds,9);
3014	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
3015	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
3016	&call		(&label("key_128_cold"));
3017	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
3018	&call		(&label("key_128"));
3019	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
3020	&call		(&label("key_128"));
3021	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
3022	&call		(&label("key_128"));
3023	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
3024	&call		(&label("key_128"));
3025	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
3026	&call		(&label("key_128"));
3027	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
3028	&call		(&label("key_128"));
3029	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
3030	&call		(&label("key_128"));
3031	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
3032	&call		(&label("key_128"));
3033	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
3034	&call		(&label("key_128"));
3035	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
3036	&mov		(&DWP(80,$key),$rounds);	# store 9 behind the schedule
3037
3038	&jmp	(&label("good_key"));
3039
# Local subroutine for the AES-128 path (reached via call, left via ret).
#   key_128      - first stores the current round key (xmm0) at 0($key)
#                  and advances $key by 16, then falls into the body;
#   key_128_cold - body only, used for round 1 where nothing is pending.
# In:  xmm0 = previous round key, xmm1 = aeskeygenassist result,
#      xmm4 = scratch whose low dword must be zero.
# Out: xmm0 = next round key.
# The two shufps/xorps pairs turn xmm0 into the running XOR of its four
# dwords (dword i becomes k0^...^ki); the low dword of xmm4 stays zero
# throughout.  The last shufps broadcasts dword 3 of xmm1, which is then
# XORed into every dword of xmm0.
3040&set_label("key_128",16);
3041	&$movekey	(&QWP(0,$key),"xmm0");
3042	&lea		($key,&DWP(16,$key));
3043&set_label("key_128_cold");
3044	&shufps		("xmm4","xmm0",0b00010000);
3045	&xorps		("xmm0","xmm4");
3046	&shufps		("xmm4","xmm0",0b10001100);
3047	&xorps		("xmm0","xmm4");
3048	&shufps		("xmm1","xmm1",0b11111111);	# critical path
3049	&xorps		("xmm0","xmm1");
3050	&ret();
3051
# Alternative AES-128 schedule that avoids aeskeygenassist.
# Register roles:
#   xmm5 = byte-shuffle mask from key_const+0x00,
#   xmm4 = round constant, starting with the 1,1,1,1 vector at
#          key_const+0x20 and doubled with pslld after each use,
#   xmm2 = copy of the previous round key,
#   xmm0 = working value / new round key.
# Per round: pshufb replicates the byte-rotated last dword of the previous
# key into all four dwords, aesenclast substitutes it and XORs in the
# round constant.  The three pslldq/pxor steps leave in xmm2 the previous
# key XORed with itself shifted left by 4, 8 and 12 bytes, and the final
# pxor combines both parts.
# The loop produces rounds 1..8.  Rounds 9 and 10 are unrolled because
# their constants (0x1b, then 0x1b<<1) come from key_const+0x30.
3052&set_label("10rounds_alt",16);
3053	&movdqa		("xmm5",&QWP(0x00,"ebx"));
3054	&mov		($rounds,8);
3055	&movdqa		("xmm4",&QWP(0x20,"ebx"));
3056	&movdqa		("xmm2","xmm0");
3057	&movdqu		(&QWP(-16,$key),"xmm0");	# round 0 is the user key
3058
3059&set_label("loop_key128");
3060	&pshufb		("xmm0","xmm5");
3061	&aesenclast	("xmm0","xmm4");
3062	&pslld		("xmm4",1);			# next round constant
3063	&lea		($key,&DWP(16,$key));
3064
3065	&movdqa		("xmm3","xmm2");
3066	&pslldq		("xmm2",4);
3067	&pxor		("xmm3","xmm2");
3068	&pslldq		("xmm2",4);
3069	&pxor		("xmm3","xmm2");
3070	&pslldq		("xmm2",4);
3071	&pxor		("xmm2","xmm3");
3072
3073	&pxor		("xmm0","xmm2");
3074	&movdqu		(&QWP(-16,$key),"xmm0");	# $key was already advanced
3075	&movdqa		("xmm2","xmm0");
3076
3077	&dec		($rounds);
3078	&jnz		(&label("loop_key128"));
3079
3080	&movdqa		("xmm4",&QWP(0x30,"ebx"));	# switch to the 0x1b constant
3081
# round 9
3082	&pshufb		("xmm0","xmm5");
3083	&aesenclast	("xmm0","xmm4");
3084	&pslld		("xmm4",1);
3085
3086	&movdqa		("xmm3","xmm2");
3087	&pslldq		("xmm2",4);
3088	&pxor		("xmm3","xmm2");
3089	&pslldq		("xmm2",4);
3090	&pxor		("xmm3","xmm2");
3091	&pslldq		("xmm2",4);
3092	&pxor		("xmm2","xmm3");
3093
3094	&pxor		("xmm0","xmm2");
3095	&movdqu		(&QWP(0,$key),"xmm0");
3096
# round 10
3097	&movdqa		("xmm2","xmm0");
3098	&pshufb		("xmm0","xmm5");
3099	&aesenclast	("xmm0","xmm4");
3100
3101	&movdqa		("xmm3","xmm2");
3102	&pslldq		("xmm2",4);
3103	&pxor		("xmm3","xmm2");
3104	&pslldq		("xmm2",4);
3105	&pxor		("xmm3","xmm2");
3106	&pslldq		("xmm2",4);
3107	&pxor		("xmm2","xmm3");
3108
3109	&pxor		("xmm0","xmm2");
3110	&movdqu		(&QWP(16,$key),"xmm0");
3111
# $key was advanced by 16 at entry and by 16 in each of the 8 loop passes,
# so 96($key) is 240 bytes past the start of the schedule.
3112	&mov		($rounds,9);
3113	&mov		(&DWP(96,$key),$rounds);
3114
3115	&jmp	(&label("good_key"));
3116
# AES-192 schedule built with aeskeygenassist.  xmm0 already holds the
# first 16 key bytes; the remaining 8 are loaded into the low half of
# xmm2.  As on the 128-bit path, a masked capability word equal to 1<<28
# selects the "_alt" variant instead.
# The 192-bit key does not align with 16-byte round keys, so the helpers
# alternate: key_192a stores one 16-byte round key, key_192b stores two.
# The first call enters at key_192a_cold because round 0 has already been
# stored.  $key advances by 3*16 + 4*32 = 176 over these calls, on top of
# the initial 16, so the store at 48($key) lands 240 bytes past the start
# of the schedule.
3117&set_label("12rounds",16);
3118	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
3119	&cmp		("ebp",1<<28);
3120	&je		(&label("12rounds_alt"));
3121
3122	&mov		($rounds,11);
3123	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
3124	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
3125	&call		(&label("key_192a_cold"));
3126	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
3127	&call		(&label("key_192b"));
3128	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
3129	&call		(&label("key_192a"));
3130	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
3131	&call		(&label("key_192b"));
3132	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
3133	&call		(&label("key_192a"));
3134	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
3135	&call		(&label("key_192b"));
3136	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
3137	&call		(&label("key_192a"));
3138	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
3139	&call		(&label("key_192b"));
3140	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
3141	&mov		(&DWP(48,$key),$rounds);	# store 11 behind the schedule
3142
3143	&jmp	(&label("good_key"));
3144
# Local subroutines for the AES-192 path (reached via call, left via ret).
#   key_192a      - stores xmm0 at 0($key), advances $key by 16, then
#                   falls through;
#   key_192a_cold - saves xmm2 in xmm5 for the next key_192b call, then
#                   falls through;
#   key_192b_warm - shared body that updates xmm0 and xmm2;
#   key_192b      - stores two round keys, advances $key by 32 and jumps
#                   to the shared body.
# Shared body: the two shufps/xorps pairs turn xmm0 into the running XOR
# of its four dwords (xmm4 is scratch with a zero low dword).  pshufd
# 0b01010101 broadcasts dword 1 of the aeskeygenassist result in xmm1,
# which is XORed into xmm0.  xmm2 is XORed with itself shifted left by 4
# bytes and then with dword 3 of the new xmm0 broadcast to every lane.
3145&set_label("key_192a",16);
3146	&$movekey	(&QWP(0,$key),"xmm0");
3147	&lea		($key,&DWP(16,$key));
3148&set_label("key_192a_cold",16);
3149	&movaps		("xmm5","xmm2");
3150&set_label("key_192b_warm");
3151	&shufps		("xmm4","xmm0",0b00010000);
3152	&movdqa		("xmm3","xmm2");
3153	&xorps		("xmm0","xmm4");
3154	&shufps		("xmm4","xmm0",0b10001100);
3155	&pslldq		("xmm3",4);
3156	&xorps		("xmm0","xmm4");
3157	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
3158	&pxor		("xmm2","xmm3");
3159	&pxor		("xmm0","xmm1");
3160	&pshufd		("xmm3","xmm0",0b11111111);
3161	&pxor		("xmm2","xmm3");
3162	&ret();
3163
# key_192b assembles two 16-byte round keys from three registers:
#   first  = low qword of xmm5 (xmm2 as saved by key_192a_cold), followed
#            by the low qword of xmm0;
#   second = high qword of xmm0, followed by the low qword of xmm2.
3164&set_label("key_192b",16);
3165	&movaps		("xmm3","xmm0");
3166	&shufps		("xmm5","xmm0",0b01000100);
3167	&$movekey	(&QWP(0,$key),"xmm5");
3168	&shufps		("xmm3","xmm2",0b01001110);
3169	&$movekey	(&QWP(16,$key),"xmm3");
3170	&lea		($key,&DWP(32,$key));
3171	&jmp		(&label("key_192b_warm"));
3172
# Alternative AES-192 schedule that avoids aeskeygenassist.
# Register roles:
#   xmm5 = byte-shuffle mask from key_const+0x10,
#   xmm4 = round constant, starting at 1 and doubled after each pass,
#   xmm0 = first 16 bytes of the current 24-byte key block,
#   xmm2 = last 8 bytes of that block in its low qword.
# Every pass writes 24 bytes of schedule: the low qword of xmm2 at
# 0($key), then (after $key += 24) the new xmm0 at -16($key), which is
# directly behind those 8 bytes.
# Eight passes advance $key by 192 on top of the initial 16, so the store
# at 32($key) lands 240 bytes past the start of the schedule.
3173&set_label("12rounds_alt",16);
3174	&movdqa		("xmm5",&QWP(0x10,"ebx"));
3175	&movdqa		("xmm4",&QWP(0x20,"ebx"));
3176	&mov		($rounds,8);
3177	&movdqu		(&QWP(-16,$key),"xmm0");	# round 0 is the user key
3178
3179&set_label("loop_key192");
3180	&movq		(&QWP(0,$key),"xmm2");		# flush the pending 8 bytes
3181	&movdqa		("xmm1","xmm2");
3182	&pshufb		("xmm2","xmm5");
3183	&aesenclast	("xmm2","xmm4");
3184	&pslld		("xmm4",1);			# next round constant
3185	&lea		($key,&DWP(24,$key));
3186
# xmm0 becomes the XOR of itself shifted left by 0, 4, 8 and 12 bytes
3187	&movdqa		("xmm3","xmm0");
3188	&pslldq		("xmm0",4);
3189	&pxor		("xmm3","xmm0");
3190	&pslldq		("xmm0",4);
3191	&pxor		("xmm3","xmm0");
3192	&pslldq		("xmm0",4);
3193	&pxor		("xmm0","xmm3");
3194
# xmm3 = dword 3 of that value broadcast, XORed with the saved tail (xmm1)
# and with the tail shifted left by 4 bytes
3195	&pshufd		("xmm3","xmm0",0xff);
3196	&pxor		("xmm3","xmm1");
3197	&pslldq		("xmm1",4);
3198	&pxor		("xmm3","xmm1");
3199
3200	&pxor		("xmm0","xmm2");
3201	&pxor		("xmm2","xmm3");
3202	&movdqu		(&QWP(-16,$key),"xmm0");
3203
3204	&dec		($rounds);
3205	&jnz		(&label("loop_key192"));
3206
3207	&mov	($rounds,11);
3208	&mov	(&DWP(32,$key),$rounds);
3209
3210	&jmp	(&label("good_key"));
3211
# AES-256 schedule built with aeskeygenassist.  xmm0 holds the first half
# of the user key and xmm2 the second half; they are stored unchanged as
# rounds 0 and 1.  $key is advanced by a further 16 here, which is why
# those two stores use offsets -32 and -16.  A masked capability word
# equal to 1<<28 selects "14rounds_alt" instead.
# Even rounds are derived into xmm0 by key_256a, odd rounds into xmm2 by
# key_256b, and each pair of calls shares one round constant.
# The 12 storing calls advance $key by 192 on top of the initial 32, so
# the store at 16($key) lands 240 bytes past the start of the schedule.
3212&set_label("14rounds",16);
3213	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
3214	&lea		($key,&DWP(16,$key));
3215	&cmp		("ebp",1<<28);
3216	&je		(&label("14rounds_alt"));
3217
3218	&mov		($rounds,13);
3219	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
3220	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
3221	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
3222	&call		(&label("key_256a_cold"));
3223	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
3224	&call		(&label("key_256b"));
3225	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
3226	&call		(&label("key_256a"));
3227	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
3228	&call		(&label("key_256b"));
3229	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
3230	&call		(&label("key_256a"));
3231	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
3232	&call		(&label("key_256b"));
3233	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
3234	&call		(&label("key_256a"));
3235	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
3236	&call		(&label("key_256b"));
3237	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
3238	&call		(&label("key_256a"));
3239	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
3240	&call		(&label("key_256b"));
3241	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
3242	&call		(&label("key_256a"));
3243	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
3244	&call		(&label("key_256b"));
3245	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
3246	&call		(&label("key_256a"));
3247	&$movekey	(&QWP(0,$key),"xmm0");		# last round key
3248	&mov		(&DWP(16,$key),$rounds);	# store 13 behind the schedule
3249	&xor		("eax","eax");			# good_key zeroes eax again
3250
3251	&jmp	(&label("good_key"));
3252
# Local subroutines for the AES-256 path (reached via call, left via ret).
#   key_256a      - stores the pending odd round key (xmm2) at 0($key),
#                   advances $key by 16, then derives the next even round
#                   key into xmm0;
#   key_256a_cold - same derivation without the store, used for round 2;
#   key_256b      - stores the pending even round key (xmm0), advances
#                   $key by 16, then derives the next odd round key into
#                   xmm2.
# Both derivations first turn the target register into the running XOR of
# its four dwords (xmm4 is scratch with a zero low dword).  They then XOR
# in one dword of the aeskeygenassist result in xmm1, broadcast to every
# lane: dword 3 in key_256a, dword 2 in key_256b.
3253&set_label("key_256a",16);
3254	&$movekey	(&QWP(0,$key),"xmm2");
3255	&lea		($key,&DWP(16,$key));
3256&set_label("key_256a_cold");
3257	&shufps		("xmm4","xmm0",0b00010000);
3258	&xorps		("xmm0","xmm4");
3259	&shufps		("xmm4","xmm0",0b10001100);
3260	&xorps		("xmm0","xmm4");
3261	&shufps		("xmm1","xmm1",0b11111111);	# critical path
3262	&xorps		("xmm0","xmm1");
3263	&ret();
3264
3265&set_label("key_256b",16);
3266	&$movekey	(&QWP(0,$key),"xmm0");
3267	&lea		($key,&DWP(16,$key));
3268
3269	&shufps		("xmm4","xmm2",0b00010000);
3270	&xorps		("xmm2","xmm4");
3271	&shufps		("xmm4","xmm2",0b10001100);
3272	&xorps		("xmm2","xmm4");
3273	&shufps		("xmm1","xmm1",0b10101010);	# critical path
3274	&xorps		("xmm2","xmm1");
3275	&ret();
3276
# Alternative AES-256 schedule that avoids aeskeygenassist.
# Register roles:
#   xmm5 = byte-shuffle mask from key_const+0x00,
#   xmm4 = round constant, starting at 1 and doubled once per pass,
#   xmm0 / xmm1 = the two most recent round keys (even / odd),
#   xmm2 = working register.
# Each pass has two halves.  The first derives an even round key with
# pshufb + aesenclast against the round constant.  The second derives an
# odd round key from dword 3 of the new even key, using aesenclast against
# an all-zero key, so no round constant is applied.
# The counter starts at 7 and the loop exits after the first half of the
# seventh pass, giving 7 even and 6 odd keys on top of rounds 0 and 1.
# $key has then advanced by 6*32 on top of the initial 32, so the store at
# 16($key) lands 240 bytes past the start of the schedule.
3277&set_label("14rounds_alt",16);
3278	&movdqa		("xmm5",&QWP(0x00,"ebx"));
3279	&movdqa		("xmm4",&QWP(0x20,"ebx"));
3280	&mov		($rounds,7);
3281	&movdqu		(&QWP(-32,$key),"xmm0");	# round 0
3282	&movdqa		("xmm1","xmm2");
3283	&movdqu		(&QWP(-16,$key),"xmm2");	# round 1
3284
3285&set_label("loop_key256");
3286	&pshufb		("xmm2","xmm5");
3287	&aesenclast	("xmm2","xmm4");
3288
# xmm0 becomes the XOR of itself shifted left by 0, 4, 8 and 12 bytes
3289	&movdqa		("xmm3","xmm0");
3290	&pslldq		("xmm0",4);
3291	&pxor		("xmm3","xmm0");
3292	&pslldq		("xmm0",4);
3293	&pxor		("xmm3","xmm0");
3294	&pslldq		("xmm0",4);
3295	&pxor		("xmm0","xmm3");
3296	&pslld		("xmm4",1);			# next round constant
3297
3298	&pxor		("xmm0","xmm2");
3299	&movdqu		(&QWP(0,$key),"xmm0");		# even round key
3300
3301	&dec		($rounds);
3302	&jz		(&label("done_key256"));
3303
3304	&pshufd		("xmm2","xmm0",0xff);		# broadcast dword 3
3305	&pxor		("xmm3","xmm3");		# zero key for aesenclast
3306	&aesenclast	("xmm2","xmm3");
3307
# same shift-and-XOR fold, applied to the previous odd key in xmm1
3308	&movdqa		("xmm3","xmm1");
3309	&pslldq		("xmm1",4);
3310	&pxor		("xmm3","xmm1");
3311	&pslldq		("xmm1",4);
3312	&pxor		("xmm3","xmm1");
3313	&pslldq		("xmm1",4);
3314	&pxor		("xmm1","xmm3");
3315
3316	&pxor		("xmm2","xmm1");
3317	&movdqu		(&QWP(16,$key),"xmm2");		# odd round key
3318	&lea		($key,&DWP(32,$key));
3319	&movdqa		("xmm1","xmm2");
3320	&jmp		(&label("loop_key256"));
3321
# falls through into "good_key"
3322&set_label("done_key256");
3323	&mov		($rounds,13);
3324	&mov		(&DWP(16,$key),$rounds);
3325
# Common exits of _aesni_set_encrypt_key.  Each restores ebx and ebp in
# the reverse order of the pushes at entry and returns a status in eax:
#    0  good_key    - schedule written; xmm0..xmm5, which held key
#                     material and intermediates, are zeroed first;
#   -1  bad_pointer - userKey or key was NULL; taken before any xmm
#                     register was loaded, so nothing needs clearing;
#   -2  bad_keybits - key length was not 128, 192 or 256; xmm0 is zeroed
#                     because it already holds the first 16 bytes of the
#                     user key.
3326&set_label("good_key");
3327	&pxor	("xmm0","xmm0");
3328	&pxor	("xmm1","xmm1");
3329	&pxor	("xmm2","xmm2");
3330	&pxor	("xmm3","xmm3");
3331	&pxor	("xmm4","xmm4");
3332	&pxor	("xmm5","xmm5");
3333	&xor	("eax","eax");
3334	&pop	("ebx");
3335	&pop	("ebp");
3336	&ret	();
3337
3338&set_label("bad_pointer",4);
3339	&mov	("eax",-1);
3340	&pop	("ebx");
3341	&pop	("ebp");
3342	&ret	();
3343&set_label("bad_keybits",4);
3344	&pxor	("xmm0","xmm0");
3345	&mov	("eax",-2);
3346	&pop	("ebx");
3347	&pop	("ebp");
3348	&ret	();
3349&function_end_B("_aesni_set_encrypt_key");
3350
3351# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3352#                              AES_KEY *key)
#
# Public wrapper: moves the three stack arguments into the registers the
# private routine expects ("eax" = userKey, $rounds = bits, $key = key)
# and returns that routine's status in eax unchanged: 0 on success, -1
# for a NULL pointer, -2 for an unsupported key length.
3353&function_begin_B("${PREFIX}_set_encrypt_key");
3354	&mov	("eax",&wparam(0));
3355	&mov	($rounds,&wparam(1));
3356	&mov	($key,&wparam(2));
3357	&call	("_aesni_set_encrypt_key");
3358	&ret	();
3359&function_end_B("${PREFIX}_set_encrypt_key");
3360
3361# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3362#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and converts
# it to a decryption schedule in place.  The round keys are reversed in
# order, and every key except the first and last is passed through
# aesimc.  A non-zero status from the private routine is returned
# unchanged and the schedule is left untouched; success returns 0.
3363&function_begin_B("${PREFIX}_set_decrypt_key");
3364	&mov	("eax",&wparam(0));
3365	&mov	($rounds,&wparam(1));
3366	&mov	($key,&wparam(2));
3367	&call	("_aesni_set_encrypt_key");
3368	&mov	($key,&wparam(2));	# reload: the callee advanced $key
3369	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
3370	&test	("eax","eax");	# flags come from this test, not from the shl
3371	&jnz	(&label("dec_key_ret"));
# $rounds now holds 16*(rounds-1), so eax points at the last round key
3372	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
3373
3374	&$movekey	("xmm0",&QWP(0,$key));	# just swap
3375	&$movekey	("xmm1",&QWP(0,"eax"));
3376	&$movekey	(&QWP(0,"eax"),"xmm0");
3377	&$movekey	(&QWP(0,$key),"xmm1");
3378	&lea		($key,&DWP(16,$key));
3379	&lea		("eax",&DWP(-16,"eax"));
3380
# $key walks up from the front and eax walks down from the back; the loop
# runs while eax is still above $key and stops once the two meet.
3381&set_label("dec_key_inverse");
3382	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
3383	&$movekey	("xmm1",&QWP(0,"eax"));
3384	&aesimc		("xmm0","xmm0");
3385	&aesimc		("xmm1","xmm1");
3386	&lea		($key,&DWP(16,$key));
3387	&lea		("eax",&DWP(-16,"eax"));
3388	&$movekey	(&QWP(16,"eax"),"xmm0");	# offsets undo the pointer updates above
3389	&$movekey	(&QWP(-16,$key),"xmm1");
3390	&cmp		("eax",$key);
3391	&ja		(&label("dec_key_inverse"));
3392
3393	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
3394	&aesimc		("xmm0","xmm0");
3395	&$movekey	(&QWP(0,$key),"xmm0");
3396
3397	&pxor		("xmm0","xmm0");	# wipe key material from registers
3398	&pxor		("xmm1","xmm1");
3399	&xor		("eax","eax");		# return success
3400&set_label("dec_key_ret");
3401	&ret	();
3402&function_end_B("${PREFIX}_set_decrypt_key");
3403
# Constant table addressed through ebx by the "_alt" key-schedule paths.
# Each data_word line holds four words; the users below address the lines
# at 16-byte steps:
#   +0x00  pshufb mask used by 10rounds_alt and 14rounds_alt,
#   +0x10  pshufb mask used by 12rounds_alt,
#   +0x20  initial round constant 1 in every dword,
#   +0x30  round constant 0x1b in every dword (AES-128 rounds 9 and 10).
# The table is followed by a NUL-terminated identification string.
# NOTE(review): the second set_label argument is presumably the alignment
# in bytes (64 here) -- confirm against the perlasm helper.
3404&set_label("key_const",64);
3405&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
3406&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
3407&data_word(1,1,1,1);
3408&data_word(0x1b,0x1b,0x1b,0x1b);
3409&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
3410
# Finish code generation.
# NOTE(review): asm_finish is presumably the perlasm call that writes out
# the accumulated assembly; it is defined outside this file section.
3411&asm_finish();
3412
# Output goes to STDOUT, so a failed close is fatal rather than ignored.
3413close STDOUT or die "error closing STDOUT: $!";
3414