xref: /openssl/crypto/camellia/asm/cmll-x86.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2008-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12#
13# This module may be used under the terms of either the GNU General
14# Public License version 2 or later, the GNU Lesser General Public
15# License version 2.1 or later, the Mozilla Public License version
16# 1.1 or the BSD License. The exact terms of either license are
17# distributed along with this module. For further details see
18# http://www.openssl.org/~appro/camellia/.
19# ====================================================================
20
21# Performance in cycles per processed byte (less is better) in
22# 'openssl speed ...' benchmark:
23#
24#			AMD K8	Core2	PIII	P4
25# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
26# + over gcc 3.4.6	+90/11% +70/10%	+53/4%	+160/64%
27# + over icc 8.0	+48/19% +21/15%	+21/17%	+55/37%
28#
29# camellia-128-cbc	17.3	21.1	23.9	25.9
30#
31# 128-bit key setup	196	280	256	240	cycles/key
32# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
33# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
34#
35# Pairs of numbers in "+" rows represent performance improvement over
36# compiler generated position-independent code, PIC, and non-PIC
37# respectively. PIC results are of greater relevance, as this module
38# is position-independent, i.e. suitable for a shared library or PIE.
39# Position independence "costs" one register, which is why compilers
40# are so close with non-PIC results, they have an extra register to
41# spare. CBC results are better than ECB ones thanks to "zero-copy"
42# private _x86_* interface, and are ~30-40% better than with compiler
43# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
44# same CPU (where applicable).
45
# Find the directory holding this script and add it, plus ../../perlasm
# relative to it, to @INC so that x86asm.pl can be required.
46$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47push(@INC,"${dir}","${dir}../../perlasm");
48require "x86asm.pl";
49
# Tested by the "if ($OPENSSL)" guards around Camellia_encrypt,
# Camellia_decrypt and Camellia_set_key.
50$OPENSSL=1;
51
# The last command-line argument is popped and, when it is a true value,
# STDOUT is reopened onto the file it names.
# NOTE(review): two-argument open whose result is not checked.
52$output = pop and open STDOUT,">$output";
53
# asm_init receives the first remaining argument and a flag that is true
# when the last remaining argument is the string "386".
54&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
55
# Register roles used throughout this file.
56@T=("eax","ebx","ecx","edx");	# four 32-bit state words
57$idx="esi";			# table index / prefetched key word
58$key="edi";			# pointer into the key schedule
59$Tbl="ebp";			# pointer to Camellia_SBOX
60
61# stack frame layout in _x86_Camellia_* routines, frame is allocated
62# by caller
63$__ra=&DWP(0,"esp");	# return address
64$__s0=&DWP(4,"esp");	# s0 backing store
65$__s1=&DWP(8,"esp");	# s1 backing store
66$__s2=&DWP(12,"esp");	# s2 backing store
67$__s3=&DWP(16,"esp");	# s3 backing store
68$__end=&DWP(20,"esp");	# pointer to end/start of key schedule
69
70# stack frame layout in Camellia_[en|de]crypt routines, which differs from
71# above by 4 and overlaps by pointer to end/start of key schedule
# ($_end at 16(%esp) here is the same slot as $__end at 20(%esp) once the
# call has pushed a return address)
72$_end=&DWP(16,"esp");
73$_esp=&DWP(20,"esp");
74
75# const unsigned int Camellia_SBOX[4][256];
76# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
77# and [2][] - with [3][]. This is done to optimize code size.
# The values below are byte offsets from the Camellia_SBOX label: each
# interleaved pair occupies 256 entries of 8 bytes (2048 bytes), and the
# second table of a pair starts 4 bytes into the first entry.
78$SBOX1_1110=0;		# Camellia_SBOX[0]
79$SBOX4_4404=4;		# Camellia_SBOX[1]
80$SBOX2_0222=2048;	# Camellia_SBOX[2]
81$SBOX3_3033=2052;	# Camellia_SBOX[3]
82&static_label("Camellia_SIGMA");
83&static_label("Camellia_SBOX");
84
# Emit the instructions for one Camellia Feistel round.
#
# Positional arguments:
#   $i     - round index. Its parity selects which register pair of @T is
#            the round input ($t0,$t1) and which is the output ($t2,$t3);
#            it also scales the key-schedule displacement.
#   $seed  - base displacement into the key schedule (default 0). A
#            negative seed steps through the schedule by -8 bytes per
#            round instead of +8.
#   $frame - displacement of the s0..s3 backing store relative to %esp
#            (default 0).
#
# Register contract of the emitted code: $idx holds the first key word of
# this round on entry and the first key word of the next round on exit;
# $key and $Tbl are read only; the two output words are written to their
# backing-store slots and the two input registers are reloaded from the
# backing store of the words they will represent in the following round.
sub Camellia_Feistel {
my ($i,$seed,$frame)=@_;
	$seed=0		unless defined($seed);
	$frame=0	unless defined($frame);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# Parenthesised so that all four names are lexical: "my $a=..,$b=.." only
# declares $a and assigns the remaining names as package globals.
my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3);

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}
124
125# void Camellia_EncryptBlock_Rounds(
126#		int grandRounds,
127#		const Byte plaintext[],
128#		const KEY_TABLE_TYPE keyTable,
129#		Byte ciphertext[])
#
# Encrypts one 16-byte block: builds the aligned frame addressed through
# $_end/$_esp, points $Tbl at Camellia_SBOX, loads the four input words
# byte-swapped into @T, calls _x86_Camellia_encrypt and stores the
# byte-swapped result through the ciphertext pointer.
130&function_begin("Camellia_EncryptBlock_Rounds");
131	&mov	("eax",&wparam(0));	# load grandRounds
132	&mov	($idx,&wparam(1));	# load plaintext pointer
133	&mov	($key,&wparam(2));	# load key schedule pointer
134
135	&mov	("ebx","esp");		# remember caller's %esp
136	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
137	&and	("esp",-64);		# align down to a 64-byte boundary
138
139	# place stack frame just "above mod 1024" the key schedule
140	# this ensures that cache associativity of 2 suffices
141	&lea	("ecx",&DWP(-64-63,$key));
142	&sub	("ecx","esp");
143	&neg	("ecx");
144	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
145	&sub	("esp","ecx");
146	&add	("esp",4);	# 4 is reserved for callee's return address
147
148	&shl	("eax",6);			# grandRounds*64 bytes
149	&lea	("eax",&DWP(0,$key,"eax"));	# keyTable+grandRounds*64
150	&mov	($_esp,"ebx");	# save %esp
151	&mov	($_end,"eax");	# save keyEnd
152
	# call/pop yields the address of pic_point, from which the address
	# of Camellia_SBOX is derived position-independently
153	&call	(&label("pic_point"));
154	&set_label("pic_point");
155	&blindpop($Tbl);
156	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
157
158	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
159	&mov	(@T[1],&DWP(4,$idx));
160	&mov	(@T[2],&DWP(8,$idx));
161	&bswap	(@T[0]);
162	&mov	(@T[3],&DWP(12,$idx));
163	&bswap	(@T[1]);
164	&bswap	(@T[2]);
165	&bswap	(@T[3]);
166
167	&call	("_x86_Camellia_encrypt");
168
169	&mov	("esp",$_esp);		# back to caller's %esp saved above
170	&bswap	(@T[0]);
171	&mov	($idx,&wparam(3));	# load ciphertext pointer
172	&bswap	(@T[1]);
173	&bswap	(@T[2]);
174	&bswap	(@T[3]);
175	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
176	&mov	(&DWP(4,$idx),@T[1]);
177	&mov	(&DWP(8,$idx),@T[2]);
178	&mov	(&DWP(12,$idx),@T[3]);
179&function_end("Camellia_EncryptBlock_Rounds");
180# V1.x API
# Takes keyBitLength as its first argument instead of grandRounds: the
# argument slot is overwritten in place with 3 or 4 and control jumps to
# Camellia_EncryptBlock_Rounds, which then sees the usual argument list.
181&function_begin_B("Camellia_EncryptBlock");
182	&mov	("eax",128);
183	&sub	("eax",&wparam(0));	# load keyBitLength
					# carry is set when keyBitLength>128
184	&mov	("eax",3);		# mov leaves the carry flag untouched
185	&adc	("eax",0);		# keyBitLength==128?3:4
186	&mov	(&wparam(0),"eax");
187	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
188&function_end_B("Camellia_EncryptBlock");
189
190if ($OPENSSL) {
191# void Camellia_encrypt(
192#		const unsigned char *in,
193#		unsigned char *out,
194#		const CAMELLIA_KEY *key)
#
# Same sequence as Camellia_EncryptBlock_Rounds, except that the
# grandRounds counter is read from offset 272 of the key structure and
# the pointers arrive as (in, out, key).
195&function_begin("Camellia_encrypt");
196	&mov	($idx,&wparam(0));	# load plaintext pointer
197	&mov	($key,&wparam(2));	# load key schedule pointer
198
199	&mov	("ebx","esp");		# remember caller's %esp
200	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
201	&and	("esp",-64);		# align down to a 64-byte boundary
202	&mov	("eax",&DWP(272,$key));	# load grandRounds counter
203
204	# place stack frame just "above mod 1024" the key schedule
205	# this ensures that cache associativity of 2 suffices
206	&lea	("ecx",&DWP(-64-63,$key));
207	&sub	("ecx","esp");
208	&neg	("ecx");
209	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
210	&sub	("esp","ecx");
211	&add	("esp",4);	# 4 is reserved for callee's return address
212
213	&shl	("eax",6);			# grandRounds*64 bytes
214	&lea	("eax",&DWP(0,$key,"eax"));	# key+grandRounds*64
215	&mov	($_esp,"ebx");	# save %esp
216	&mov	($_end,"eax");	# save keyEnd
217
	# call/pop yields the address of pic_point, from which the address
	# of Camellia_SBOX is derived position-independently
218	&call	(&label("pic_point"));
219	&set_label("pic_point");
220	&blindpop($Tbl);
221	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
222
223	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
224	&mov	(@T[1],&DWP(4,$idx));
225	&mov	(@T[2],&DWP(8,$idx));
226	&bswap	(@T[0]);
227	&mov	(@T[3],&DWP(12,$idx));
228	&bswap	(@T[1]);
229	&bswap	(@T[2]);
230	&bswap	(@T[3]);
231
232	&call	("_x86_Camellia_encrypt");
233
234	&mov	("esp",$_esp);		# back to caller's %esp saved above
235	&bswap	(@T[0]);
236	&mov	($idx,&wparam(1));	# load ciphertext pointer
237	&bswap	(@T[1]);
238	&bswap	(@T[2]);
239	&bswap	(@T[3]);
240	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
241	&mov	(&DWP(4,$idx),@T[1]);
242	&mov	(&DWP(8,$idx),@T[2]);
243	&mov	(&DWP(12,$idx),@T[3]);
244&function_end("Camellia_encrypt");
245}
246
# Internal encryption core, reached by "call" from the public wrappers.
# Expects: @T = the four block words, $key = start of the key schedule,
# $Tbl = Camellia_SBOX, and a caller-allocated frame whose $__end slot
# holds the address at which to stop.
# Leaves the result in @T; $idx is clobbered and $key is advanced by 64
# bytes per pass through the loop.
247&function_begin_B("_x86_Camellia_encrypt");
248	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
249	&xor	(@T[1],&DWP(4,$key));
250	&xor	(@T[2],&DWP(8,$key));
251	&xor	(@T[3],&DWP(12,$key));
252	&mov	($idx,&DWP(16,$key));	# prefetch key[4]
253
254	&mov	($__s0,@T[0]);		# save s[0-3]
255	&mov	($__s1,@T[1]);
256	&mov	($__s2,@T[2]);
257	&mov	($__s3,@T[3]);
258
259&set_label("loop",16);
	# six unrolled Feistel rounds; key words start at 16($key) and the
	# s0..s3 backing store sits 4 bytes above %esp (return address at 0)
260	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
261
262	&add	($key,16*4);
263	&cmp	($key,$__end);
264	&je	(&label("done"));
265
	# FL on (s0,s1) and FL^-1 on (s2,s3), interleaved instruction by
	# instruction (the two strands are distinguished by indentation)
266	# @T[0-1] are preloaded, $idx is preloaded with key[0]
267	&and	($idx,@T[0]);
268	 &mov	 (@T[3],$__s3);
269	&rotl	($idx,1);
270	 &mov	 (@T[2],@T[3]);
271	&xor	(@T[1],$idx);
272	 &or	 (@T[2],&DWP(12,$key));
273	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
274	 &xor	 (@T[2],$__s2);
275
276	&mov	($idx,&DWP(4,$key));
277	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
278	&or	($idx,@T[1]);
279	 &and	 (@T[2],&DWP(8,$key));
280	&xor	(@T[0],$idx);
281	 &rotl	 (@T[2],1);
282	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
283	 &xor	 (@T[3],@T[2]);
284	&mov	($idx,&DWP(16,$key));		# prefetch key[4]
285	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
286	&jmp	(&label("loop"));
287
288&set_label("done",8);
289	&mov	(@T[2],@T[0]);		# SwapHalf
290	&mov	(@T[3],@T[1]);
291	&mov	(@T[0],$__s2);
292	&mov	(@T[1],$__s3);
293	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
294	&xor	(@T[1],&DWP(4,$key));
295	&xor	(@T[2],&DWP(8,$key));
296	&xor	(@T[3],&DWP(12,$key));
297	&ret	();
298&function_end_B("_x86_Camellia_encrypt");
299
300# void Camellia_DecryptBlock_Rounds(
301#		int grandRounds,
302#		const Byte ciphertext[],
303#		const KEY_TABLE_TYPE keyTable,
304#		Byte plaintext[])
#
# Decrypts one 16-byte block. Mirrors Camellia_EncryptBlock_Rounds, but
# $key is advanced to keyTable+grandRounds*64 before the call and the
# frame slot at 16(%esp) receives the start of the key schedule, which is
# where _x86_Camellia_decrypt stops.
305&function_begin("Camellia_DecryptBlock_Rounds");
306	&mov	("eax",&wparam(0));	# load grandRounds
307	&mov	($idx,&wparam(1));	# load ciphertext pointer
308	&mov	($key,&wparam(2));	# load key schedule pointer
309
310	&mov	("ebx","esp");		# remember caller's %esp
311	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
312	&and	("esp",-64);		# align down to a 64-byte boundary
313
314	# place stack frame just "above mod 1024" the key schedule
315	# this ensures that cache associativity of 2 suffices
316	&lea	("ecx",&DWP(-64-63,$key));
317	&sub	("ecx","esp");
318	&neg	("ecx");
319	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
320	&sub	("esp","ecx");
321	&add	("esp",4);	# 4 is reserved for callee's return address
322
323	&shl	("eax",6);			# grandRounds*64 bytes
324	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
325	&lea	($key,&DWP(0,$key,"eax"));	# start from the far end
326	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp
327
	# call/pop yields the address of pic_point, from which the address
	# of Camellia_SBOX is derived position-independently
328	&call	(&label("pic_point"));
329	&set_label("pic_point");
330	&blindpop($Tbl);
331	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
332
333	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
334	&mov	(@T[1],&DWP(4,$idx));
335	&mov	(@T[2],&DWP(8,$idx));
336	&bswap	(@T[0]);
337	&mov	(@T[3],&DWP(12,$idx));
338	&bswap	(@T[1]);
339	&bswap	(@T[2]);
340	&bswap	(@T[3]);
341
342	&call	("_x86_Camellia_decrypt");
343
344	&mov	("esp",&DWP(5*4,"esp"));	# back to caller's %esp
345	&bswap	(@T[0]);
346	&mov	($idx,&wparam(3));	# load plaintext pointer
347	&bswap	(@T[1]);
348	&bswap	(@T[2]);
349	&bswap	(@T[3]);
350	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
351	&mov	(&DWP(4,$idx),@T[1]);
352	&mov	(&DWP(8,$idx),@T[2]);
353	&mov	(&DWP(12,$idx),@T[3]);
354&function_end("Camellia_DecryptBlock_Rounds");
355# V1.x API
# Takes keyBitLength as its first argument instead of grandRounds: the
# argument slot is overwritten in place with 3 or 4 and control jumps to
# Camellia_DecryptBlock_Rounds, which then sees the usual argument list.
356&function_begin_B("Camellia_DecryptBlock");
357	&mov	("eax",128);
358	&sub	("eax",&wparam(0));	# load keyBitLength
					# carry is set when keyBitLength>128
359	&mov	("eax",3);		# mov leaves the carry flag untouched
360	&adc	("eax",0);		# keyBitLength==128?3:4
361	&mov	(&wparam(0),"eax");
362	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
363&function_end_B("Camellia_DecryptBlock");
364
365if ($OPENSSL) {
366# void Camellia_decrypt(
367#		const unsigned char *in,
368#		unsigned char *out,
369#		const CAMELLIA_KEY *key)
#
# Same sequence as Camellia_DecryptBlock_Rounds, except that the
# grandRounds counter is read from offset 272 of the key structure and
# the pointers arrive as (in, out, key).
370&function_begin("Camellia_decrypt");
371	&mov	($idx,&wparam(0));	# load ciphertext pointer
372	&mov	($key,&wparam(2));	# load key schedule pointer
373
374	&mov	("ebx","esp");		# remember caller's %esp
375	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
376	&and	("esp",-64);		# align down to a 64-byte boundary
377	&mov	("eax",&DWP(272,$key));	# load grandRounds counter
378
379	# place stack frame just "above mod 1024" the key schedule
380	# this ensures that cache associativity of 2 suffices
381	&lea	("ecx",&DWP(-64-63,$key));
382	&sub	("ecx","esp");
383	&neg	("ecx");
384	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
385	&sub	("esp","ecx");
386	&add	("esp",4);	# 4 is reserved for callee's return address
387
388	&shl	("eax",6);			# grandRounds*64 bytes
389	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
390	&lea	($key,&DWP(0,$key,"eax"));	# start from the far end
391	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp
392
	# call/pop yields the address of pic_point, from which the address
	# of Camellia_SBOX is derived position-independently
393	&call	(&label("pic_point"));
394	&set_label("pic_point");
395	&blindpop($Tbl);
396	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
397
398	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
399	&mov	(@T[1],&DWP(4,$idx));
400	&mov	(@T[2],&DWP(8,$idx));
401	&bswap	(@T[0]);
402	&mov	(@T[3],&DWP(12,$idx));
403	&bswap	(@T[1]);
404	&bswap	(@T[2]);
405	&bswap	(@T[3]);
406
407	&call	("_x86_Camellia_decrypt");
408
409	&mov	("esp",&DWP(5*4,"esp"));	# back to caller's %esp
410	&bswap	(@T[0]);
411	&mov	($idx,&wparam(1));	# load plaintext pointer
412	&bswap	(@T[1]);
413	&bswap	(@T[2]);
414	&bswap	(@T[3]);
415	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
416	&mov	(&DWP(4,$idx),@T[1]);
417	&mov	(&DWP(8,$idx),@T[2]);
418	&mov	(&DWP(12,$idx),@T[3]);
419&function_end("Camellia_decrypt");
420}
421
# Internal decryption core, reached by "call" from the public wrappers.
# Expects: @T = the four block words, $key = far end of the key schedule,
# $Tbl = Camellia_SBOX, and a caller-allocated frame whose $__end slot
# holds the start of the key schedule (where the walk stops).
# Leaves the result in @T; $idx is clobbered and $key is moved back by 64
# bytes per pass through the loop.
422&function_begin_B("_x86_Camellia_decrypt");
423	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
424	&xor	(@T[1],&DWP(4,$key));
425	&xor	(@T[2],&DWP(8,$key));
426	&xor	(@T[3],&DWP(12,$key));
427	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
428
429	&mov	($__s0,@T[0]);		# save s[0-3]
430	&mov	($__s1,@T[1]);
431	&mov	($__s2,@T[2]);
432	&mov	($__s3,@T[3]);
433
434&set_label("loop",16);
	# six unrolled Feistel rounds; a negative seed makes Camellia_Feistel
	# step backwards through the key schedule, starting at -8($key)
435	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
436
437	&sub	($key,16*4);
438	&cmp	($key,$__end);
439	&je	(&label("done"));
440
	# same FL / FL^-1 mixing as in the encrypt path, but with the key
	# words taken in the order [2],[1],[3],[0] relative to the new $key
441	# @T[0-1] are preloaded, $idx is preloaded with key[2]
442	&and	($idx,@T[0]);
443	 &mov	 (@T[3],$__s3);
444	&rotl	($idx,1);
445	 &mov	 (@T[2],@T[3]);
446	&xor	(@T[1],$idx);
447	 &or	 (@T[2],&DWP(4,$key));
448	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[2],1);
449	 &xor	 (@T[2],$__s2);
450
451	&mov	($idx,&DWP(12,$key));
452	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[1];
453	&or	($idx,@T[1]);
454	 &and	 (@T[2],&DWP(0,$key));
455	&xor	(@T[0],$idx);
456	 &rotl	 (@T[2],1);
457	&mov	($__s0,@T[0]);		# s0^=s1|key[3];
458	 &xor	 (@T[3],@T[2]);
459	&mov	($idx,&DWP(-8,$key));	# prefetch key[-2]
460	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[0],1);
461	&jmp	(&label("loop"));
462
463&set_label("done",8);
464	&mov	(@T[2],@T[0]);		# SwapHalf
465	&mov	(@T[3],@T[1]);
466	&mov	(@T[0],$__s2);
467	&mov	(@T[1],$__s3);
468	&xor	(@T[2],$idx);		# $idx is preloaded with key[2]
469	&xor	(@T[3],&DWP(12,$key));
470	&xor	(@T[0],&DWP(0,$key));
471	&xor	(@T[1],&DWP(4,$key));
472	&ret	();
473&function_end_B("_x86_Camellia_decrypt");
474
475# shld is very slow on Intel P4 family. Even on AMD it limits
476# instruction decode rate [because it's VectorPath] and consequently
477# performance. PIII, PM and Core[2] seem to be the only ones which
478# execute this code ~7% faster...
#
# shld-based 128-bit left rotate; nothing in the visible part of this
# file calls it (Camellia_Ekeygen uses _rotl128 instead).
#   $i0..$i3 - registers holding the value, most significant word first
#   $rot     - rotate count; 0 emits no rotate, only the stores
#   $rnd     - destination slot in 8-byte units; stores go to
#              -128+8*$rnd($key) onwards, one dword per stored register
#   @T       - registers to store (local list, shadows the file-level @T);
#              $i0..$i3 are visited in order and each one that equals the
#              current head of the list is stored and removed from it
# Clobbers $idx when $rot is non-zero.
479sub __rotl128 {
480  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
481
482    $rnd *= 2;				# 8-byte slots -> dword index
483    if ($rot) {
484	&mov	($idx,$i0);		# keep old top word for the wrap-around
485	&shld	($i0,$i1,$rot);
486	&shld	($i1,$i2,$rot);
487	&shld	($i2,$i3,$rot);
488	&shld	($i3,$idx,$rot);
489    }
490    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
491    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
492    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
493    &mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
494}
495
496# ... Implementing 128-bit rotate without shld gives >3x performance
497# improvement on P4, only ~7% degradation on other Intel CPUs and
498# not worse performance on AMD. This is therefore preferred.
#
# 128-bit left rotate built from shl/shr/or, with the stores interleaved
# into the rotate sequence.
#   $i0..$i3 - registers holding the value, most significant word first
#   $rot     - rotate count; 0 emits no rotate, only the stores
#   $rnd     - destination slot in 8-byte units; stores go to
#              -128+8*$rnd($key) onwards, one dword per stored register
#   @T       - registers to store (local list, shadows the file-level @T);
#              $i0..$i3 are visited in order and each one that equals the
#              current head of the list is stored and removed from it
# Clobbers $idx and $Tbl when $rot is non-zero.
499sub _rotl128 {
500  my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
501
502    $rnd *= 2;				# 8-byte slots -> dword index
503    if ($rot) {
504	&mov	($Tbl,$i0);		# old top word, wraps into $i3 below
505	&shl	($i0,$rot);
506	&mov	($idx,$i1);
507	&shr	($idx,32-$rot);
508	&shl	($i1,$rot);
509	&or	($i0,$idx);
510	&mov	($idx,$i2);
511	&shl	($i2,$rot);
512	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
513	&shr	($idx,32-$rot);
514	&or	($i1,$idx);
515	&shr	($Tbl,32-$rot);
516	&mov	($idx,$i3);
517	&shr	($idx,32-$rot);
518	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
519	&shl	($i3,$rot);
520	&or	($i2,$idx);
521	&or	($i3,$Tbl);
522	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
523	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
524    } else {
525	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i0 eq @T[0]);
526	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i1 eq @T[0]);
527	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i2 eq @T[0]);
528	&mov	(&DWP(-128+4*$rnd++,$key),shift(@T))	if ($i3 eq @T[0]);
529    }
530}
531
# Store up to four registers into consecutive dwords of a key-schedule
# slot.
#   _saveround($rnd,$key,[$bias,]@regs)
#   $rnd  - slot number in 8-byte units
#   $key  - base register (local, shadows the file-level $key)
#   $bias - optional numeric displacement; recognised because int() of a
#           register name is 0 while int() of the bias is non-zero
#   @regs - registers to store, at most the first four are used
sub _saveround {
my $rnd=shift;
my $key=shift;
my @regs=@_;
my $bias=0;

	$bias=shift(@regs)	if (int($regs[0]));

	my $top=($#regs>3)?3:$#regs;
	$top=0			if ($top<0);	# first store is unconditional
	for my $n (0..$top) {
		&mov	(&DWP($bias+$rnd*8+4*$n,$key),$regs[$n]);
	}
}
541
# Load up to four registers from consecutive dwords of a key-schedule
# slot.
#   _loadround($rnd,$key,[$bias,]@regs)
#   $rnd  - slot number in 8-byte units
#   $key  - base register (local, shadows the file-level $key)
#   $bias - optional numeric displacement; recognised because int() of a
#           register name is 0 while int() of the bias is non-zero
#   @regs - registers to fill, at most the first four are used
sub _loadround {
my $rnd=shift;
my $key=shift;
my @regs=@_;
my $bias=0;

	$bias=shift(@regs)	if (int($regs[0]));

	my $top=($#regs>3)?3:$#regs;
	$top=0			if ($top<0);	# first load is unconditional
	for my $n (0..$top) {
		&mov	($regs[$n],&DWP($bias+$rnd*8+4*$n,$key));
	}
}
551
552# void Camellia_Ekeygen(
553#		const int keyBitLength,
554#		const Byte *rawKey,
555#		KEY_TABLE_TYPE keyTable)
#
# Expands rawKey into keyTable. The emitted code leaves the grandRounds
# count (3 or 4) in %eax and keyTable+272 in %edx on exit;
# Camellia_set_key relies on both.
# Throughout, the Perl-level list @T names the registers holding the
# 128-bit value being processed; rotating that list stands in for a
# 32-bit rotate of the value without emitting any instruction.
556&function_begin("Camellia_Ekeygen");
557{ my $step=0;
558
559	&stack_push(4);				# place for s[0-3]
560
561	&mov	($Tbl,&wparam(0));		# load arguments
562	&mov	($idx,&wparam(1));
563	&mov	($key,&wparam(2));
564
565	&mov	(@T[0],&DWP(0,$idx));		# load 0-127 bits
566	&mov	(@T[1],&DWP(4,$idx));
567	&mov	(@T[2],&DWP(8,$idx));
568	&mov	(@T[3],&DWP(12,$idx));
569
570	&bswap	(@T[0]);
571	&bswap	(@T[1]);
572	&bswap	(@T[2]);
573	&bswap	(@T[3]);
574
575	&_saveround	(0,$key,@T);		# KL<<<0
576
577	&cmp	($Tbl,128);			# $Tbl still holds keyBitLength
578	&je	(&label("1st128"));
579
580	&mov	(@T[0],&DWP(16,$idx));		# load 128-191 bits
581	&mov	(@T[1],&DWP(20,$idx));
582	&cmp	($Tbl,192);
583	&je	(&label("1st192"));
584	&mov	(@T[2],&DWP(24,$idx));		# load 192-255 bits
585	&mov	(@T[3],&DWP(28,$idx));
586	&jmp	(&label("1st256"));
587&set_label("1st192",4);
	# 192-bit key: the low half of KR is the complement of its high half
588	&mov	(@T[2],@T[0]);
589	&mov	(@T[3],@T[1]);
590	&not	(@T[2]);
591	&not	(@T[3]);
592&set_label("1st256",4);
593	&bswap	(@T[0]);
594	&bswap	(@T[1]);
595	&bswap	(@T[2]);
596	&bswap	(@T[3]);
597
598	&_saveround	(4,$key,@T);		# temporary storage for KR!
599
600	&xor	(@T[0],&DWP(0*8+0,$key));	# KR^KL
601	&xor	(@T[1],&DWP(0*8+4,$key));
602	&xor	(@T[2],&DWP(1*8+0,$key));
603	&xor	(@T[3],&DWP(1*8+4,$key));
604
605&set_label("1st128",4);
	# from here on $Tbl points at Camellia_SBOX and $key at
	# Camellia_SIGMA, which serves as the "key schedule" for the
	# Feistel rounds below
606	&call	(&label("pic_point"));
607	&set_label("pic_point");
608	&blindpop($Tbl);
609	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
610	&lea	($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
611
612	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[0]
613	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
614	&mov	(&swtmp(1),@T[1]);
615	&mov	(&swtmp(2),@T[2]);
616	&mov	(&swtmp(3),@T[3]);
617	&Camellia_Feistel($step++);
618	&Camellia_Feistel($step++);
619	&mov	(@T[2],&swtmp(2));
620	&mov	(@T[3],&swtmp(3));
621
622	&mov	($idx,&wparam(2));		# keyTable, where KL was saved
623	&xor	(@T[0],&DWP(0*8+0,$idx));	# ^KL
624	&xor	(@T[1],&DWP(0*8+4,$idx));
625	&xor	(@T[2],&DWP(1*8+0,$idx));
626	&xor	(@T[3],&DWP(1*8+4,$idx));
627
628	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[4]
629	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
630	&mov	(&swtmp(1),@T[1]);
631	&mov	(&swtmp(2),@T[2]);
632	&mov	(&swtmp(3),@T[3]);
633	&Camellia_Feistel($step++);
634	&Camellia_Feistel($step++);
635	&mov	(@T[2],&swtmp(2));
636	&mov	(@T[3],&swtmp(3));
637
638	&mov	($idx,&wparam(0));		# keyBitLength again
639	&cmp	($idx,128);
640	&jne	(&label("2nd256"));
641
642	&mov	($key,&wparam(2));
643	&lea	($key,&DWP(128,$key));		# size optimization
644
	# _rotl128 arguments: the four registers, rotate count, destination
	# slot (8-byte units, relative to keyTable because of the -128 bias
	# built into _rotl128) and the registers to store
645	####### process KA
646	&_saveround	(2,$key,-128,@T);	# KA<<<0
647	&_rotl128	(@T,15,6,@T);		# KA<<<15
648	&_rotl128	(@T,15,8,@T);		# KA<<<(15+15=30)
649	&_rotl128	(@T,15,12,@T[0],@T[1]);	# KA<<<(30+15=45)
650	&_rotl128	(@T,15,14,@T);		# KA<<<(45+15=60)
651	push		(@T,shift(@T));		# rotl128(@T,32);
652	&_rotl128	(@T,2,20,@T);		# KA<<<(60+32+2=94)
653	&_rotl128	(@T,17,24,@T);		# KA<<<(94+17=111)
654
655	####### process KL
656	&_loadround	(0,$key,-128,@T);	# load KL
657	&_rotl128	(@T,15,4,@T);		# KL<<<15
658	&_rotl128	(@T,30,10,@T);		# KL<<<(15+30=45)
659	&_rotl128	(@T,15,13,@T[2],@T[3]);	# KL<<<(45+15=60)
660	&_rotl128	(@T,17,16,@T);		# KL<<<(60+17=77)
661	&_rotl128	(@T,17,18,@T);		# KL<<<(77+17=94)
662	&_rotl128	(@T,17,22,@T);		# KL<<<(94+17=111)
663
664	while (@T[0] ne "eax")			# restore order
665	{   unshift	(@T,pop(@T));   }
666
667	&mov	("eax",3);			# 3 grandRounds
668	&jmp	(&label("done"));
669
670&set_label("2nd256",16);
671	&mov	($idx,&wparam(2));
672	&_saveround	(6,$idx,@T);		# temporary storage for KA!
673
674	&xor	(@T[0],&DWP(4*8+0,$idx));	# KA^KR
675	&xor	(@T[1],&DWP(4*8+4,$idx));
676	&xor	(@T[2],&DWP(5*8+0,$idx));
677	&xor	(@T[3],&DWP(5*8+4,$idx));
678
679	&mov	($idx,&DWP($step*8,$key));	# prefetch SIGMA[8]
680	&mov	(&swtmp(0),@T[0]);		# save s[0-3]
681	&mov	(&swtmp(1),@T[1]);
682	&mov	(&swtmp(2),@T[2]);
683	&mov	(&swtmp(3),@T[3]);
684	&Camellia_Feistel($step++);
685	&Camellia_Feistel($step++);
686	&mov	(@T[2],&swtmp(2));
687	&mov	(@T[3],&swtmp(3));
688
689	&mov	($key,&wparam(2));
690	&lea	($key,&DWP(128,$key));		# size optimization
691
692	####### process KB
693	&_saveround	(2,$key,-128,@T);	# KB<<<0
694	&_rotl128	(@T,30,10,@T);		# KB<<<30
695	&_rotl128	(@T,30,20,@T);		# KB<<<(30+30=60)
696	push		(@T,shift(@T));		# rotl128(@T,32);
697	&_rotl128	(@T,19,32,@T);		# KB<<<(60+32+19=111)
698
699	####### process KR
700	&_loadround	(4,$key,-128,@T);	# load KR
701	&_rotl128	(@T,15,4,@T);		# KR<<<15
702	&_rotl128	(@T,15,8,@T);		# KR<<<(15+15=30)
703	&_rotl128	(@T,30,18,@T);		# KR<<<(30+30=60)
704	push		(@T,shift(@T));		# rotl128(@T,32);
705	&_rotl128	(@T,2,26,@T);		# KR<<<(60+32+2=94)
706
707	####### process KA
708	&_loadround	(6,$key,-128,@T);	# load KA
709	&_rotl128	(@T,15,6,@T);		# KA<<<15
710	&_rotl128	(@T,30,14,@T);		# KA<<<(15+30=45)
711	push		(@T,shift(@T));		# rotl128(@T,32);
712	&_rotl128	(@T,0,24,@T);		# KA<<<(45+32+0=77)
713	&_rotl128	(@T,17,28,@T);		# KA<<<(77+17=94)
714
715	####### process KL
716	&_loadround	(0,$key,-128,@T);	# load KL
717	push		(@T,shift(@T));		# rotl128(@T,32);
718	&_rotl128	(@T,13,12,@T);		# KL<<<(32+13=45)
719	&_rotl128	(@T,15,16,@T);		# KL<<<(45+15=60)
720	&_rotl128	(@T,17,22,@T);		# KL<<<(60+17=77)
721	push		(@T,shift(@T));		# rotl128(@T,32);
722	&_rotl128	(@T,2,30,@T);		# KL<<<(77+32+2=111)
723
724	while (@T[0] ne "eax")			# restore order
725	{   unshift	(@T,pop(@T));   }
726
727	&mov	("eax",4);			# 4 grandRounds
728&set_label("done");
	# $key is keyTable+128 on both paths, so this is keyTable+272
729	&lea	("edx",&DWP(272-128,$key));	# end of key schedule
730	&stack_pop(4);
731}
732&function_end("Camellia_Ekeygen");
733
734if ($OPENSSL) {
735# int Camellia_set_key (
736#		const unsigned char *userKey,
737#		int bits,
738#		CAMELLIA_KEY *key)
#
# Validates the arguments and calls Camellia_Ekeygen(bits,userKey,key).
# Value left in %eax: -1 when userKey or key is NULL, -2 when bits is
# not 128, 192 or 256, 0 otherwise.
739&function_begin_B("Camellia_set_key");
740	&push	("ebx");
741	&mov	("ecx",&wparam(0));	# pull arguments
742	&mov	("ebx",&wparam(1));
743	&mov	("edx",&wparam(2));
744
745	&mov	("eax",-1);
746	&test	("ecx","ecx");
747	&jz	(&label("done"));	# userKey==NULL?
748	&test	("edx","edx");
749	&jz	(&label("done"));	# key==NULL?
750
751	&mov	("eax",-2);
752	&cmp	("ebx",256);
753	&je	(&label("arg_ok"));	# bits==256?
754	&cmp	("ebx",192);
755	&je	(&label("arg_ok"));	# bits==192?
756	&cmp	("ebx",128);
757	&jne	(&label("done"));	# bits!=128?
758&set_label("arg_ok",4);
759
	# pushed last-to-first, i.e. Camellia_Ekeygen(bits,userKey,key)
760	&push	("edx");		# push arguments
761	&push	("ecx");
762	&push	("ebx");
763	&call	("Camellia_Ekeygen");
764	&stack_pop(3);
765
766	# eax holds grandRounds and edx points at where to put it
767	&mov	(&DWP(0,"edx"),"eax");
768	&xor	("eax","eax");		# return 0 on success
769&set_label("done",4);
770	&pop	("ebx");
771	&ret	();
772&function_end_B("Camellia_set_key");
773}
774
# 256-entry byte substitution table from which the four 32-bit lookup
# tables emitted under Camellia_SBOX are derived.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper maps an index 0..255 to one 32-bit table word. The digits in
# the name give the layout of the result, most significant byte first:
# a non-zero digit marks a byte that carries the substituted value, 0 a
# byte that is left zero.

# @SBOX[i] replicated into the three upper bytes.
sub S1110 {
    my $s=$SBOX[$_[0]];
    return ($s<<24)|($s<<16)|($s<<8);
}

# The index is rotated left by one bit (within 8 bits) before the lookup;
# the value fills bytes 3, 2 and 0.
sub S4404 {
    my $n=shift;
    my $s=$SBOX[(($n<<1)|($n>>7))&0xff];
    return ($s<<24)|($s<<16)|$s;
}

# @SBOX[i] rotated left by one bit, replicated into the three lower bytes.
sub S0222 {
    my $s=$SBOX[$_[0]];
    $s=(($s<<1)|($s>>7))&0xff;
    return ($s<<16)|($s<<8)|$s;
}

# @SBOX[i] rotated right by one bit; the value fills bytes 3, 1 and 0.
sub S3033 {
    my $s=$SBOX[$_[0]];
    $s=(($s>>1)|($s<<7))&0xff;
    return ($s<<24)|($s<<8)|$s;
}
797
# Constant data, each label aligned to 64 bytes. Camellia_SIGMA holds
# twelve constant words followed by four zero words of padding;
# Camellia_Ekeygen reads it as key material for its Feistel rounds and
# locates it relative to Camellia_SBOX.
798&set_label("Camellia_SIGMA",64);
799&data_word(
800    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
801    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
802    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
803    0,          0,          0,          0);
804&set_label("Camellia_SBOX",64);
805# tables are interleaved, remember?
# 256 (S1110,S4404) pairs, then 256 (S0222,S3033) pairs: 2048 bytes each,
# matching the $SBOX*_* offsets defined near the top of the file
806for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
807for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
808
809# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
810#			size_t length, const CAMELLIA_KEY *key,
811#			unsigned char *ivp,const int enc);
812{
813# stack frame layout
814#             -4(%esp)		# return address	 0(%esp)
815#              0(%esp)		# s0			 4(%esp)
816#              4(%esp)		# s1			 8(%esp)
817#              8(%esp)		# s2			12(%esp)
818#             12(%esp)		# s3			16(%esp)
819#             16(%esp)		# end of key schedule	20(%esp)
820#             20(%esp)		# %esp backup
821my $_inp=&DWP(24,"esp");	#copy of wparam(0)
822my $_out=&DWP(28,"esp");	#copy of wparam(1)
823my $_len=&DWP(32,"esp");	#copy of wparam(2)
824my $_key=&DWP(36,"esp");	#copy of wparam(3)
825my $_ivp=&DWP(40,"esp");	#copy of wparam(4)
826my $ivec=&DWP(44,"esp");	#ivec[16]
827my $_tmp=&DWP(44,"esp");	#volatile variable [yes, aliases with ivec]
828my ($s0,$s1,$s2,$s3) = @T;
829
830&function_begin("Camellia_cbc_encrypt");
831	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
832	&cmp	($s2,0);
833	&je	(&label("enc_out"));
834
835	&pushf	();
836	&cld	();
837
838	&mov	($s0,&wparam(0));	# load inp
839	&mov	($s1,&wparam(1));	# load out
840	#&mov	($s2,&wparam(2));	# load len
841	&mov	($s3,&wparam(3));	# load key
842	&mov	($Tbl,&wparam(4));	# load ivp
843
844	# allocate aligned stack frame...
845	&lea	($idx,&DWP(-64,"esp"));
846	&and	($idx,-64);
847
848	# place stack frame just "above mod 1024" the key schedule
849	# this ensures that cache associativity of 2 suffices
850	&lea	($key,&DWP(-64-63,$s3));
851	&sub	($key,$idx);
852	&neg	($key);
853	&and	($key,0x3C0);	# modulo 1024, but aligned to cache-line
854	&sub	($idx,$key);
855
856	&mov	($key,&wparam(5));	# load enc
857
858	&exch	("esp",$idx);
859	&add	("esp",4);		# reserve for return address!
860	&mov	($_esp,$idx);		# save %esp
861
862	&mov	($_inp,$s0);		# save copy of inp
863	&mov	($_out,$s1);		# save copy of out
864	&mov	($_len,$s2);		# save copy of len
865	&mov	($_key,$s3);		# save copy of key
866	&mov	($_ivp,$Tbl);		# save copy of ivp
867
868	&call   (&label("pic_point"));	# make it PIC!
869	&set_label("pic_point");
870	&blindpop($Tbl);
871	&lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
872
873	&mov	($idx,32);
874	&set_label("prefetch_sbox",4);
875		&mov	($s0,&DWP(0,$Tbl));
876		&mov	($s1,&DWP(32,$Tbl));
877		&mov	($s2,&DWP(64,$Tbl));
878		&mov	($s3,&DWP(96,$Tbl));
879		&lea	($Tbl,&DWP(128,$Tbl));
880		&dec	($idx);
881	&jnz	(&label("prefetch_sbox"));
882	&mov	($s0,$_key);
883	&sub	($Tbl,4096);
884	&mov	($idx,$_inp);
885	&mov	($s3,&DWP(272,$s0));		# load grandRounds
886
887	&cmp	($key,0);
888	&je	(&label("DECRYPT"));
889
890	&mov	($s2,$_len);
891	&mov	($key,$_ivp);
892	&shl	($s3,6);
893	&lea	($s3,&DWP(0,$s0,$s3));
894	&mov	($_end,$s3);
895
896	&test	($s2,0xFFFFFFF0);
897	&jz	(&label("enc_tail"));		# short input...
898
899	&mov	($s0,&DWP(0,$key));		# load iv
900	&mov	($s1,&DWP(4,$key));
901
902	&set_label("enc_loop",4);
903		&mov	($s2,&DWP(8,$key));
904		&mov	($s3,&DWP(12,$key));
905
906		&xor	($s0,&DWP(0,$idx));	# xor input data
907		&xor	($s1,&DWP(4,$idx));
908		&xor	($s2,&DWP(8,$idx));
909		&bswap	($s0);
910		&xor	($s3,&DWP(12,$idx));
911		&bswap	($s1);
912		&mov	($key,$_key);		# load key
913		&bswap	($s2);
914		&bswap	($s3);
915
916		&call	("_x86_Camellia_encrypt");
917
918		&mov	($idx,$_inp);		# load inp
919		&mov	($key,$_out);		# load out
920
921		&bswap	($s0);
922		&bswap	($s1);
923		&bswap	($s2);
924		&mov	(&DWP(0,$key),$s0);	# save output data
925		&bswap	($s3);
926		&mov	(&DWP(4,$key),$s1);
927		&mov	(&DWP(8,$key),$s2);
928		&mov	(&DWP(12,$key),$s3);
929
930		&mov	($s2,$_len);		# load len
931
932		&lea	($idx,&DWP(16,$idx));
933		&mov	($_inp,$idx);		# save inp
934
935		&lea	($s3,&DWP(16,$key));
936		&mov	($_out,$s3);		# save out
937
938		&sub	($s2,16);
939		&test	($s2,0xFFFFFFF0);
940		&mov	($_len,$s2);		# save len
941	&jnz	(&label("enc_loop"));
942	&test	($s2,15);
943	&jnz	(&label("enc_tail"));
944	&mov	($idx,$_ivp);		# load ivp
945	&mov	($s2,&DWP(8,$key));	# restore last dwords
946	&mov	($s3,&DWP(12,$key));
947	&mov	(&DWP(0,$idx),$s0);	# save ivec
948	&mov	(&DWP(4,$idx),$s1);
949	&mov	(&DWP(8,$idx),$s2);
950	&mov	(&DWP(12,$idx),$s3);
951
952	&mov	("esp",$_esp);
953	&popf	();
954    &set_label("enc_out");
955	&function_end_A();
956	&pushf	();			# kludge, never executed
957
958    &set_label("enc_tail",4);
959	&mov	($s0,$key eq "edi" ? $key : "");
960	&mov	($key,$_out);			# load out
961	&push	($s0);				# push ivp
962	&mov	($s1,16);
963	&sub	($s1,$s2);
964	&cmp	($key,$idx);			# compare with inp
965	&je	(&label("enc_in_place"));
966	&align	(4);
967	&data_word(0xA4F3F689);	# rep movsb	# copy input
968	&jmp	(&label("enc_skip_in_place"));
969    &set_label("enc_in_place");
970	&lea	($key,&DWP(0,$key,$s2));
971    &set_label("enc_skip_in_place");
972	&mov	($s2,$s1);
973	&xor	($s0,$s0);
974	&align	(4);
975	&data_word(0xAAF3F689);	# rep stosb	# zero tail
976	&pop	($key);				# pop ivp
977
978	&mov	($idx,$_out);			# output as input
979	&mov	($s0,&DWP(0,$key));
980	&mov	($s1,&DWP(4,$key));
981	&mov	($_len,16);			# len=16
982	&jmp	(&label("enc_loop"));		# one more spin...
983
984#----------------------------- DECRYPT -----------------------------#
985&set_label("DECRYPT",16);
986	&shl	($s3,6);
987	&lea	($s3,&DWP(0,$s0,$s3));
988	&mov	($_end,$s0);
989	&mov	($_key,$s3);
990
991	&cmp	($idx,$_out);
992	&je	(&label("dec_in_place"));	# in-place processing...
993
994	&mov	($key,$_ivp);			# load ivp
995	&mov	($_tmp,$key);
996
997	&set_label("dec_loop",4);
998		&mov	($s0,&DWP(0,$idx));	# read input
999		&mov	($s1,&DWP(4,$idx));
1000		&mov	($s2,&DWP(8,$idx));
1001		&bswap	($s0);
1002		&mov	($s3,&DWP(12,$idx));
1003		&bswap	($s1);
1004		&mov	($key,$_key);		# load key
1005		&bswap	($s2);
1006		&bswap	($s3);
1007
1008		&call	("_x86_Camellia_decrypt");
1009
1010		&mov	($key,$_tmp);		# load ivp
1011		&mov	($idx,$_len);		# load len
1012
1013		&bswap	($s0);
1014		&bswap	($s1);
1015		&bswap	($s2);
1016		&xor	($s0,&DWP(0,$key));	# xor iv
1017		&bswap	($s3);
1018		&xor	($s1,&DWP(4,$key));
1019		&xor	($s2,&DWP(8,$key));
1020		&xor	($s3,&DWP(12,$key));
1021
1022		&sub	($idx,16);
1023		&jc	(&label("dec_partial"));
1024		&mov	($_len,$idx);		# save len
1025		&mov	($idx,$_inp);		# load inp
1026		&mov	($key,$_out);		# load out
1027
1028		&mov	(&DWP(0,$key),$s0);	# write output
1029		&mov	(&DWP(4,$key),$s1);
1030		&mov	(&DWP(8,$key),$s2);
1031		&mov	(&DWP(12,$key),$s3);
1032
1033		&mov	($_tmp,$idx);		# save ivp
1034		&lea	($idx,&DWP(16,$idx));
1035		&mov	($_inp,$idx);		# save inp
1036
1037		&lea	($key,&DWP(16,$key));
1038		&mov	($_out,$key);		# save out
1039
1040	&jnz	(&label("dec_loop"));
1041	&mov	($key,$_tmp);		# load temp ivp
1042    &set_label("dec_end");
1043	&mov	($idx,$_ivp);		# load user ivp
1044	&mov	($s0,&DWP(0,$key));	# load iv
1045	&mov	($s1,&DWP(4,$key));
1046	&mov	($s2,&DWP(8,$key));
1047	&mov	($s3,&DWP(12,$key));
1048	&mov	(&DWP(0,$idx),$s0);	# copy back to user
1049	&mov	(&DWP(4,$idx),$s1);
1050	&mov	(&DWP(8,$idx),$s2);
1051	&mov	(&DWP(12,$idx),$s3);
1052	&jmp	(&label("dec_out"));
1053
1054    &set_label("dec_partial",4);
1055	&lea	($key,$ivec);
1056	&mov	(&DWP(0,$key),$s0);	# dump output to stack
1057	&mov	(&DWP(4,$key),$s1);
1058	&mov	(&DWP(8,$key),$s2);
1059	&mov	(&DWP(12,$key),$s3);
1060	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1061	&mov	($idx eq "esi" ? $idx : "",$key);
1062	&mov	($key eq "edi" ? $key : "",$_out);	# load out
1063	&data_word(0xA4F3F689);	# rep movsb		# copy output
1064	&mov	($key,$_inp);				# use inp as temp ivp
1065	&jmp	(&label("dec_end"));
1066
1067    &set_label("dec_in_place",4);
1068	&set_label("dec_in_place_loop");
1069		&lea	($key,$ivec);
1070		&mov	($s0,&DWP(0,$idx));	# read input
1071		&mov	($s1,&DWP(4,$idx));
1072		&mov	($s2,&DWP(8,$idx));
1073		&mov	($s3,&DWP(12,$idx));
1074
1075		&mov	(&DWP(0,$key),$s0);	# copy to temp
1076		&mov	(&DWP(4,$key),$s1);
1077		&mov	(&DWP(8,$key),$s2);
1078		&bswap	($s0);
1079		&mov	(&DWP(12,$key),$s3);
1080		&bswap	($s1);
1081		&mov	($key,$_key);		# load key
1082		&bswap	($s2);
1083		&bswap	($s3);
1084
1085		&call	("_x86_Camellia_decrypt");
1086
1087		&mov	($key,$_ivp);		# load ivp
1088		&mov	($idx,$_out);		# load out
1089
1090		&bswap	($s0);
1091		&bswap	($s1);
1092		&bswap	($s2);
1093		&xor	($s0,&DWP(0,$key));	# xor iv
1094		&bswap	($s3);
1095		&xor	($s1,&DWP(4,$key));
1096		&xor	($s2,&DWP(8,$key));
1097		&xor	($s3,&DWP(12,$key));
1098
1099		&mov	(&DWP(0,$idx),$s0);	# write output
1100		&mov	(&DWP(4,$idx),$s1);
1101		&mov	(&DWP(8,$idx),$s2);
1102		&mov	(&DWP(12,$idx),$s3);
1103
1104		&lea	($idx,&DWP(16,$idx));
1105		&mov	($_out,$idx);		# save out
1106
1107		&lea	($idx,$ivec);
1108		&mov	($s0,&DWP(0,$idx));	# read temp
1109		&mov	($s1,&DWP(4,$idx));
1110		&mov	($s2,&DWP(8,$idx));
1111		&mov	($s3,&DWP(12,$idx));
1112
1113		&mov	(&DWP(0,$key),$s0);	# copy iv
1114		&mov	(&DWP(4,$key),$s1);
1115		&mov	(&DWP(8,$key),$s2);
1116		&mov	(&DWP(12,$key),$s3);
1117
1118		&mov	($idx,$_inp);		# load inp
1119
1120		&lea	($idx,&DWP(16,$idx));
1121		&mov	($_inp,$idx);		# save inp
1122
1123		&mov	($s2,$_len);		# load len
1124		&sub	($s2,16);
1125		&jc	(&label("dec_in_place_partial"));
1126		&mov	($_len,$s2);		# save len
1127	&jnz	(&label("dec_in_place_loop"));
1128	&jmp	(&label("dec_out"));
1129
1130    &set_label("dec_in_place_partial",4);
1131	# one can argue if this is actually required...
1132	&mov	($key eq "edi" ? $key : "",$_out);
1133	&lea	($idx eq "esi" ? $idx : "",$ivec);
1134	&lea	($key,&DWP(0,$key,$s2));
1135	&lea	($idx,&DWP(16,$idx,$s2));
1136	&neg	($s2 eq "ecx" ? $s2 : "");
1137	&data_word(0xA4F3F689);	# rep movsb	# restore tail
1138
1139    &set_label("dec_out",4);
1140    &mov	("esp",$_esp);
1141    &popf	();
1142&function_end("Camellia_cbc_encrypt");
1143}
1144
# Script epilogue: runs after the last function body has been emitted.
#
# Hand the module's attribution string to the perlasm asciz helper.
# NOTE(review): the helper is defined outside this chunk; presumably it
# emits the text as a NUL-terminated string in the generated assembly --
# confirm against the perlasm framework.
1145&asciz("Camellia for x86 by <appro\@openssl.org>");
1146
# Tell the perlasm framework that code generation is complete.
# NOTE(review): presumably this is where the accumulated assembly is
# written out -- verify in the helper's definition.
1147&asm_finish();
1148
# Treat a failed close of STDOUT as fatal and include the OS error ($!) in
# the message, so a write error that only surfaces at close is not ignored.
1149close STDOUT or die "error closing STDOUT: $!";
1150