xref: /openssl/engines/asm/e_padlock-x86_64.pl (revision 1aa89a7a)
1#! /usr/bin/env perl
2# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# September 2011
18#
# Assembler helpers for the Padlock engine. See also e_padlock-x86.pl
# for details.
21
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64 uses a different calling convention (and .asm output suffix).
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in the crypto tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator, which emits the final
# assembler dialect (gas/nasm/masm) into $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
     or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register aliases used by the bulk-encryption code below.
$ctx="%rdx";	# struct padlock_cipher_data *
$out="%rdi";	# output buffer
$inp="%rsi";	# input buffer
$len="%rcx";	# byte count
$chunk="%rbx";	# per-iteration chunk size

# First four integer-argument registers for the target ABI.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
51
# Helper entry points: CPU capability probe, key-schedule byte swap,
# context verification, single-block AES, RNG xstore and XSHA transforms.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx	# "CentaurHauls"?
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx	# "  Shanghai  "?
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax	# centaur-specific leaf range
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx		# number of rounds
	inc	%edx			# schedule holds 4*(rounds+1)
	shl	\$2,%edx		# 32-bit words, not just "rounds"
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp		# drop saved flags
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8		# caller-saved flags
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf				# flush PadLock internal state
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf				# force key reload on next xcrypt
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len		# single block
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax		# oneshot: finalize with padding
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax		# multi-block: no finalization
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax		# oneshot: finalize with padding
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax		# multi-block: no finalization
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
282
# Emit the body of padlock_${mode}_encrypt for one xcrypt mode.
# $opcode is the final byte of the "rep xcrypt*" instruction encoding.
# The generated function bounces misaligned buffers through a stack
# area in PADLOCK_CHUNK-sized pieces, works around prefetch errata
# (%PADLOCK_PREFETCH) near page boundaries, and for ctr32 splits work
# so the hardware's 16-bit counter never wraps mid-operation.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk			# chunk=chunk?:PADLOCK_CHUNK
___
# ctr32 only: clamp first chunk so the 32-bit big-endian counter at
# -4($ctx) cannot wrap within a single xcrypt invocation.
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
# Prefetch-errata modes: if the last chunk would prefetch past a page
# boundary, peel it off and process it via the stack bounce buffer.
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
# Main loop for the misaligned path: copy input to the aligned stack
# area if needed, xcrypt in place, then copy the result back out.
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
					if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
					} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
					}
# Wipe the stack bounce buffer (it held plaintext/key-stream data)
# before releasing it.
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
# Aligned ctr32 path: advance in runs that keep the hardware's 16-bit
# counter segment from overflowing, bumping the upper half manually.
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}
566
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
# Resolve compile-time expressions quoted in backticks above
# (e.g. vendor-string immediates, `1<<5`).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a pipe into x86_64-xlate.pl; an unchecked close would
# silently discard write/child-exit errors and truncate the output.
close STDOUT or die "error closing STDOUT: $!";