xref: /openssl/crypto/camellia/asm/cmll-x86_64.pl (revision 7ed6de99)
1#! /usr/bin/env perl
2# Copyright 2008-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12#
13# This module may be used under the terms of either the GNU General
14# Public License version 2 or later, the GNU Lesser General Public
15# License version 2.1 or later, the Mozilla Public License version
16# 1.1 or the BSD License. The exact terms of either license are
17# distributed along with this module. For further details see
18# http://www.openssl.org/~appro/camellia/.
19# ====================================================================
20
21# Performance in cycles per processed byte (less is better) in
22# 'openssl speed ...' benchmark:
23#
24#			AMD64	Core2	EM64T
25# -evp camellia-128-ecb	16.7	21.0	22.7
26# + over gcc 3.4.6	+25%	+5%	0%
27#
28# camellia-128-cbc	15.7	20.4	21.1
29#
30# 128-bit key setup	128	216	205	cycles/key
31# + over gcc 3.4.6	+54%	+39%	+15%
32#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, a pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some 64-bit operations in [32-bit] microcode.
38
# Command line is "[flavour] [output]". The trailing argument is taken as
# the output file when it ends in an extension; the leading argument is
# taken as the perlasm flavour when it contains no dot at all.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# Win64 output is selected either by the flavour or by an .asm output name.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$xlate = "${dir}x86_64-xlate.pl";
$xlate = "${dir}../../perlasm/x86_64-xlate.pl" unless (-f $xlate);
die "can't locate x86_64-xlate.pl" unless (-f $xlate);

# From here on everything printed to STDOUT is piped through the translator.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
54
# hi(REG): map %eax/%rax .. %edx/%rdx to the matching high-byte register
# (%ah .. %dh); any other operand string is returned unchanged.
# Declared without a prototype because it takes one argument (an empty
# "()" prototype only worked through the &hi(...) call form), and the
# capture is referenced as $1 rather than \1 in the replacement.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# lo(REG): map a 32/64-bit register name to its low-byte form:
#   %eax/%rax..%edx/%rdx -> %al..%dl, %esi/%rsi,%edi/%rdi -> %sil,%dil,
#   %rN/%rNd -> %rNb. Operands matching none of these are returned as is.
# Declared without a prototype because it takes one argument, and the
# captures are referenced as $1 rather than \1 in the replacements.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
59
# Register allocation shared by the code generators below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");	# 32-bit temporaries
@S=map { "%r${_}d" } (8..11);				# four 32-bit state words
($i0,$i1)=("%esi","%edi");				# table index registers
$Tbl="%rbp";	# size optimization
($inp,$out)=("%r12","%r13");
($key,$keyend)=("%r14","%r15");
# 32-bit view of the first argument register, used by the abi-omnipotent
# V1.x entry points: %ecx for Win64 output, %edi otherwise.
$arg0d=$win64?"%ecx":"%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from .LCamellia_SBOX; entries are
# then addressed with a scale of 8 because of the interleaving.
($SBOX1_1110,$SBOX4_4404)=(0,4);	# Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
78
# Camellia_Feistel(ROUND [, SEED])
#
# Appends the code for one Feistel round to $code. The parity of ROUND
# picks which half of @S is the round input ($s0,$s1) and which half is
# updated ($s2,$s3). The round expects the key words already XOR-able in
# $t0/$t1 and, before finishing, prefetches the next pair from
# SEED+(ROUND+1)*SCALE relative to $key, where SCALE is -8 for a negative
# SEED and 8 otherwise. SEED defaults to 0.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=($seed<0)?-8:8;
my $base=($i&1)*2;		# 0 for even rounds, 2 for odd rounds
my ($s0,$s1,$s2,$s3)=map { $S[($base+$_)%4] } (0..3);

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}
116
# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# NOTE: this is a plain assignment, so $code starts here; all later
# generators append to it.
#
# The heredoc emits:
#  - Camellia_EncryptBlock (V1.x API): converts the keyBitLength found in
#    the first argument register into a grandRounds count (3, plus the
#    carry of 128-keyBitLength; "keyBitLength==128?3:4" per the inline
#    comment) and jumps to .Lenc_rounds;
#  - Camellia_EncryptBlock_Rounds (V2): saves %rbx/%rbp/%r13/%r14/%r15,
#    sets $keyend=$key+grandRounds*64, loads the 16-byte input through
#    %rsi (the copy to $inp is commented out), byte-swaps it into @S,
#    calls _x86_64_Camellia_encrypt and stores the byte-swapped result
#    at $out;
#  - the head of _x86_64_Camellia_encrypt: XOR of the first four key
#    words into @S and the key prefetch at the top of .Leloop.
# .Lenc_prologue/.Lenc_epilogue are referenced by the Win64 unwind data
# emitted at the end of this file.
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.cfi_endproc
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lenc_epilogue:
	ret
.cfi_endproc
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	# Unrolled body of .Leloop: six Feistel rounds. With seed 16 each
	# round prefetches its successor's key words from 16+8*(i+1) bytes
	# past $key.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Rest of _x86_64_Camellia_encrypt: step $key forward by 64 bytes and
# either finish at .Ledone (once $key equals $keyend) or run the
# and/rol/or/xor mixing step annotated inline and loop again.
# The same heredoc then emits Camellia_DecryptBlock,
# Camellia_DecryptBlock_Rounds and the head of _x86_64_Camellia_decrypt.
# The decrypt entry points mirror the encrypt ones, except that $keyend
# is set to the start of the key table and $key to keyTable+grandRounds*64,
# so the key schedule is consumed backwards.
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
.cfi_startproc
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.cfi_endproc
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Ldec_epilogue:
	ret
.cfi_endproc
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
.cfi_startproc
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	# Unrolled body of .Ldloop: the same six Feistel rounds, but with a
	# negative seed, so the next key words are prefetched from
	# -8-8*(i+1) bytes relative to $key (walking downwards).
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Rest of _x86_64_Camellia_decrypt: step $key back by 64 bytes, finish at
# .Lddone once it equals $keyend, otherwise run the and/rol/or/xor mixing
# step annotated inline and loop again.
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___
380
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends stores of REG... to $code, addressed at BIAS+RND*8 from KEY.
#   - four registers: treated as 32-bit words and stored pairwise swapped
#     (REG[1],REG[0],REG[3],REG[2]) at +0,+4,+8,+12;
#   - one or two registers: stored at +0 and +8 respectively.
# BIAS is an optional leading integer (e.g. -128). It is recognised by
# its textual form, so an explicit bias of 0 is accepted as well; the
# previous int()-based truth test mistook a literal 0 for a register.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^[-+]?\d+$/)?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}
397
# _loadround(RND, KEY, [BIAS,] REG [, REG])
#
# Appends loads into one or two registers to $code, reading from
# BIAS+RND*8+0 and BIAS+RND*8+8 relative to KEY.
# BIAS is an optional leading integer (e.g. -128). It is recognised by
# its textual form, so an explicit bias of 0 is accepted as well; the
# previous int()-based truth test mistook a literal 0 for a register.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^[-+]?\d+$/)?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),$T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}
405
# __rotl128(W0, W1, ROT): shld-based 128-bit left rotate of the register
# pair W0:W1 by ROT bits, using %r11 as scratch; nothing is emitted when
# ROT is zero.
# shld is very slow on the Intel EM64T family, and even on AMD it limits
# the instruction decode rate [because it's VectorPath] and consequently
# performance, which is why _rotl128 exists as an alternative.
sub __rotl128 {
my ($w0,$w1,$n)=@_;

    return unless ($n);

    $code.="\tmov\t$w0,%r11\n".
	   "\tshld\t\$$n,$w1,$w0\n".
	   "\tshld\t\$$n,%r11,$w1\n";
}
420
# _rotl128(W0, W1, ROT): 128-bit left rotate of the register pair W0:W1 by
# ROT bits built from shl/shr/or, using %r9 and %r11 as scratch; nothing
# is emitted when ROT is zero.
# Implementing the rotate without shld gives 80% better performance on
# EM64T, +15% on AMD64 and only ~7% degradation on Core2. This is
# therefore preferred.
sub _rotl128 {
my ($w0,$w1,$n)=@_;

    return unless ($n);

    $code.=join("",map { "\t$_\n" } (
	"mov\t$w0,%r11",
	"shl\t\$$n,$w0",
	"mov\t$w1,%r9",
	"shr\t\$`64-$n`,%r9",
	"shr\t\$`64-$n`,%r11",
	"or\t%r9,$w0",
	"shl\t\$$n,$w1",
	"or\t%r11,$w1",
    ));
}
440
# Generator for Camellia_Ekeygen, the key-schedule routine. As emitted,
# the routine reads keyBitLength from %edi, the raw key through %rsi and
# the key table pointer from %rdx, and leaves 3 in %eax on the 128-bit
# path and 4 on the 192/256-bit path.
# $step numbers the Camellia_Feistel invocations: with $key pointing at
# .LCamellia_SIGMA and the default seed of 0, each call prefetches the
# next pair of constants at ($step+1)*8.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
# For keys longer than 128 bits load the remaining key material; a 192-bit
# key fills the last 64 bits with the complement of bits 128-191.
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	# Two Feistel rounds, XOR with KL, two more rounds.
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	# 128-bit schedule: KL is reloaded into %rax:%rbx and KA sits in
	# %r8:%r10; each pair is rotated step by step and stored. %r9 and
	# %r11, which _rotl128 uses as scratch, were merged into %r8/%r10
	# just above. $out was advanced by 128, hence the -128 bias.
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	# Two further Feistel rounds turn KA^KR into KB (left in @S).
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	# 192/256-bit schedule: KL, KR and KA come back from their temporary
	# slots in the key table, KB is packed into %r8:%r10 below.
	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
	&_rotl128	("%rax","%rbx",45);
	&_saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r14","%r15",30);		# 15+30=45
	&_saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
	&_rotl128	("%rcx","%rdx",30);		# 30+30=60
	&_saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
	&_rotl128	("%r8","%r10",30);		# 30+30=60
	&_saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%r14","%r15",32);		# 45+32=77
	&_saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
	&_rotl128	("%rcx","%rdx",34);		# 60+34=94
	&_saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
	&_rotl128	("%r14","%r15",17);		# 77+17=94
	&_saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
	&_rotl128	("%rax","%rbx",34);		# 77+34=111
	&_saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",51);		# 60+51=111
	&_saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
# Common exit: restore the five saved registers and return.
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%rbp
.cfi_restore	%rbp
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	lea	40(%rsp),%rsp
.cfi_adjust_cfa_offset	-40
.Lkey_epilogue:
	ret
.cfi_endproc
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}
635
# Base 8-bit S-box; the four 32-bit lookup tables emitted into the
# assembly are all derived from it by the S* helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes a table index 0..255 and returns one table word as a
# "0x%08x" string. The name spells the byte layout from most to least
# significant byte: 0 is a zero byte, any other digit is a copy of the
# transformed S-box value.

# S-box value in the three upper bytes.
sub S1110 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    return sprintf("0x%08x",$v<<24|$v<<16|$v<<8);
}

# Index rotated left by one bit before the lookup; value in bytes 3, 2, 0.
sub S4404 {
    my ($idx)=@_;
    my $v=$SBOX[($idx<<1|$idx>>7)&0xff];
    return sprintf("0x%08x",$v<<24|$v<<16|$v);
}

# Looked-up value rotated left by one bit; value in the three lower bytes.
sub S0222 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=($v<<1|$v>>7)&0xff;
    return sprintf("0x%08x",$v<<16|$v<<8|$v);
}

# Looked-up value rotated right by one bit; value in bytes 3, 1, 0.
sub S3033 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=($v>>1|$v<<7)&0xff;
    return sprintf("0x%08x",$v<<24|$v<<8|$v);
}
658
# Read-only data. .LCamellia_SIGMA holds the constants consumed by
# Camellia_Ekeygen, two .long words per round in the order the generated
# code loads them (offset 0 into $t1, offset 4 into $t0), followed by
# four zero words. .LCamellia_SBOX follows immediately and is filled in
# by the loops below.
$code.=<<___;
.section .rodata align=64
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0,          0,          0,          0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(LIST): append one ".long" line holding the comma-joined LIST.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
# First 256 rows pair S1110 with S4404 (8 bytes per row, 2048 bytes in
# all), the next 256 rows pair S0222 with S3033; this is what the
# $SBOX*_* offsets 0/4/2048/2052 and the scale-8 addressing rely on.
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
673
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
#
# Generator for the CBC routine. The emitted code returns immediately for
# length 0, otherwise it builds a 64-byte-aligned stack frame, touches the
# whole S-box table once (.Lcbc_prefetch_sbox) and then runs either the
# encrypt loop (.Lcbc_eloop) or, when enc is 0, the decrypt loop
# (.Lcbc_dloop). A trailing partial block is handled through the 16-byte
# buffer at $ivec (.Lcbc_enc_tail / .Lcbc_dec_tail), and the updated IV is
# written back through ivp. grandRounds is read from offset 272 of the key
# structure.
{
# Stack frame slots, relative to the realigned %rsp:
$_key="0(%rsp)";	# key pointer used to "rewind" $key after each block
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";	# 16-byte scratch block (saved IV / padded tail)
$_ivp="40(%rsp)";	# caller's ivp argument
$_rsp="48(%rsp)";	# stack pointer as it was after the register pushes

$code.=<<___;
.text
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
.cfi_startproc
	endbranch
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
.cfi_def_cfa_register	%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp
.cfi_cfa_expression	$_rsp,deref,+56

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
.cfi_def_cfa	%rcx,56
	mov	0(%rcx),%r15
.cfi_restore	%r15
	mov	8(%rcx),%r14
.cfi_restore	%r14
	mov	16(%rcx),%r13
.cfi_restore	%r13
	mov	24(%rcx),%r12
.cfi_restore	%r12
	mov	32(%rcx),%rbp
.cfi_restore	%rbp
	mov	40(%rcx),%rbx
.cfi_restore	%rbx
	lea	48(%rcx),%rsp
.cfi_def_cfa	%rsp,8
.Lcbc_abort:
	ret
.cfi_endproc
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}
943
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64-only structured exception handling support; nothing is emitted
# for other flavours.
#  - common_se_handler serves the three routines that push five registers
#    (%rbx,%rbp,%r13,%r14,%r15). HandlerData[0] and HandlerData[1] carry
#    the routine's prologue and epilogue labels; only when Rip lies
#    between them are the pushed registers recovered from the stack into
#    the CONTEXT record.
#  - cbc_se_handler does the same for Camellia_cbc_encrypt, recovering the
#    frame through the saved-stack-pointer slot at 48(%rsp) and
#    compensating for the extra 8 bytes while inside one of the two
#    pushfq/popfq windows.
#  - Both share the tail at .Lcommon_seh_exit, which copies the CONTEXT
#    into disp->ContextRecord, calls RtlVirtualUnwind and returns
#    ExceptionContinueSearch.
# The .pdata/.xdata sections at the end bind each routine to its handler.
if ($win64) {
# Handler arguments as they arrive in registers.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}
1155
# Final pass: every `expr` placeholder left in $code is replaced by the
# value of expr evaluated as Perl (offset arithmetic, &hi()/&lo() register
# mapping), then the result is printed to the translator pipe.
# A placeholder that fails to evaluate aborts generation; it used to be
# replaced by an empty string, silently producing broken assembly.
$code =~ s{\`([^\`]*)\`}{
	my $expr=$1;	# keep a copy, the evaluated code may run its own regexes
	my $value=eval $expr;
	die "can't evaluate `$expr`: $@" if ($@);
	$value;
}gem;
print $code;
# Closing the pipe also reports a failure of the translator itself.
close STDOUT or die "error closing STDOUT: $!";
1159