#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# PPC74x0/G4e	35.5/52.1/(23.8)	11.9(*)/15.4
# PPC970/G5	37.9/55.0/(28.5)	22.2/28.5
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module makes no assumption about key
#	schedule (or data, for that matter) alignment and handles it
#	in-line. Second, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited for
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomical AltiVec
#	latency: 9 cycles per simple logical operation.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
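# Typical invocations look like the following (illustrative only; the set
# of recognized flavours is defined by ppc-xlate.pl, not by this file):
#
#	perl vpaes-ppc.pl linux64le vpaes-ppc.s
#	perl vpaes-ppc.pl linux32   vpaes-ppc.s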

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    || die "can't call $xlate: $!";

$code.=<<___;
.machine	"any"

.text

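##
## Note: the "?inv", "?rev" and "?asis" tags on the .long rows below are
## not assembler syntax.  The conversion loop at the bottom of this file
## rewrites every tagged row into .byte form and, on little-endian
## flavours, applies the fixup the tag names: byte reversal for ?rev,
## index^0xf for ?inv, no change for ?asis.
##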
.align	7	# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv

##
## "Hot" constants
##
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sbo:		# sbou, sbot
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev

##
##  Decryption stuff
##
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
##  Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev

Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis

Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
.align	5
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12	#vvvvv distance between . and _vpaes_consts
	addi	r12,r12,-0x308
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz  "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
.align	6
___

my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
{
my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.align	4
_vpaes_encrypt_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0xe0		# Lk_ipt
	li	r8,  0xf0
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	lvx	$invlo, r12, r11
	li	r11, 0x100
	lvx	$invhi, r12, r10
	li	r10, 0x110
	lvx	$iptlo, r12, r9
	li	r9,  0x120
	lvx	$ipthi, r12, r8
	li	r8,  0x130
	lvx	$sbou, r12, r11
	li	r11, 0x140
	lvx	$sbot, r12, r10
	li	r10, 0x150
	lvx	$sb1u, r12, r9
	lvx	$sb1t, r12, r8
	lvx	$sb2u, r12, r11
	lvx	$sb2t, r12, r10
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
##
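##  (The %xmm/%r names above are kept from the x86_64 original.  In this
##   PPC transliteration -- mapping inferred from the code below -- the
##   data block travels in v0, the tables preloaded by
##   _vpaes_encrypt_preheat sit in v10-v19, the round key is staged
##   through v5/v6, and v7/v8/v9 hold the 0x00/0x04/0x0f splats.)
##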
.align 5
_vpaes_encrypt_core:
	lwz	r8, 240($key)		# pull rounds
	li	r9, 16
	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm5		# round0 key
	li	r11, 0x10
	lvx	v6, r9, $key
	addi	r9, r9, 16
	?vperm	v5, v5, v6, $keyperm	# align round key
	addi	r10, r11, 0x40
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm3,	%xmm2
	vxor	v0, v0, v5		# vpxor	%xmm5,	%xmm1,	%xmm0
	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
	mtctr	r8
	b	Lenc_entry

.align 4
Lenc_loop:
	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	addi	r11, r11, 16
	vperm	v0, $sb1u, v7, v3	# vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and		\$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	addi	r10, r11, 0x40
	vperm	v3, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D

Lenc_entry:
	# top of round
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	vand	v0, v0, v9
	vxor	v3, v3, v5		# vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	vmr	v5, v6
	lvx	v6, r9, $key		# vmovdqu	(%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
	bdnz	Lenc_loop

	# middle of last round
	addi	r10, r11, 0x80
					# vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					# vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_encrypt
.align	5
.vpaes_encrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r6
	mfspr	r7, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	li	r0, -1
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	 ?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Lenc_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	mtctr	r9
Lenc_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Lenc_out_unaligned
	b	Lenc_done

.align	4
Lenc_out_aligned:
	stvx	v0, 0, $out
Lenc_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r6
	mtspr	256, r7			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_encrypt,.-.vpaes_encrypt

.align	4
_vpaes_decrypt_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0x160		# Ldipt
	li	r8,  0x170
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	lvx	$invlo, r12, r11
	li	r11, 0x180
	lvx	$invhi, r12, r10
	li	r10, 0x190
	lvx	$iptlo, r12, r9
	li	r9,  0x1a0
	lvx	$ipthi, r12, r8
	li	r8,  0x1b0
	lvx	$sbou, r12, r11
	li	r11, 0x1c0
	lvx	$sbot, r12, r10
	li	r10, 0x1d0
	lvx	$sb9u, r12, r9
	li	r9,  0x1e0
	lvx	$sb9t, r12, r8
	li	r8,  0x1f0
	lvx	$sbdu, r12, r11
	li	r11, 0x200
	lvx	$sbdt, r12, r10
	li	r10, 0x210
	lvx	$sbbu, r12, r9
	lvx	$sbbt, r12, r8
	lvx	$sbeu, r12, r11
	lvx	$sbet, r12, r10
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  Decryption core
##
##  Same API as encryption core.
##
.align	4
_vpaes_decrypt_core:
	lwz	r8, 240($key)		# pull rounds
	li	r9, 16
	lvx	v5, 0, $key		# vmovdqu	(%r9),	%xmm4		# round0 key
	li	r11, 0x30
	lvx	v6, r9, $key
	addi	r9, r9, 16
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0,	%xmm1,	%xmm0
	vxor	v0, v0, v5		# vpxor	%xmm4,	%xmm2,	%xmm2
	vxor	v0, v0, v1		# vpxor	%xmm2,	%xmm0,	%xmm0
	mtctr	r8
	b	Ldec_entry

.align 4
Ldec_loop:
#
#  Inverse mix columns
#
	lvx	v0, r12, r11		# v5 and v0 are flipped
					# vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
					# vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	subi	r11, r11, 16
	vperm	v1, $sb9t, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	andi.	r11, r11, 0x30
	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0
					# vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
					# vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	vperm	v4, $sbdu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	vperm 	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
					# vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
					# vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	vperm	v4, $sbbu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	vxor	v5, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
					# vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	vxor	v5, v5, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
					# vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	vperm	v4, $sbeu, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	vxor	v0, v5, v4		# vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
	vxor	v0, v0, v1		# vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch

Ldec_entry:
	# top of round
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	vand	v0, v0, v9
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	vmr	v5, v6
	lvx	v6, r9, $key		# vmovdqu	(%r9),	%xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
	bdnz	Ldec_loop

	# middle of last round
	addi	r10, r11, 0x80
					# vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
					# vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb	%xmm2,	%xmm0,	%xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_decrypt
.align	5
.vpaes_decrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r6
	mfspr	r7, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	li	r0, -1
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	 ?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Ldec_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	mtctr	r9
Ldec_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Ldec_out_unaligned
	b	Ldec_done

.align	4
Ldec_out_aligned:
	stvx	v0, 0, $out
Ldec_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r6
	mtspr	256, r7			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_decrypt,.-.vpaes_decrypt

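##
##  The C-level prototype this is meant to back is, for orientation only
##  (argument mapping inferred from the register usage below):
##
##	void vpaes_cbc_encrypt(const unsigned char *inp, unsigned char *out,
##	                       size_t length, const AES_KEY *key,
##	                       unsigned char *ivp, int enc);
##
##  i.e. r3=inp, r4=out, r5=length, r6=key, r7=ivp, r8=enc; lengths
##  shorter than 16 bytes return without touching the output.
##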
.globl	.vpaes_cbc_encrypt
.align	5
.vpaes_cbc_encrypt:
	${UCMP}i r5,16
	bltlr-

	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	mflr	r0
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r12, 256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	li	r9, -16
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	andi.	r9, $out, 15		# is $out aligned?
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	li	r6, -1
	mcrf	cr1, cr0		# put aside $out alignment flag
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	li	r9, 15
	?lvsl	$inpperm, 0, r31
	lvx	v25, r9, r31
	?vperm	v24, v24, v25, $inpperm

	cmpwi	r8, 0			# test direction
	neg	r8, $inp		# prepare for unaligned access
	 vxor	v7, v7, v7
	?lvsl	$keyperm, 0, $key
	 ?lvsr	$outperm, 0, $out
	?lvsr	$inpperm, 0, r8		# -$inp
	 vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	 ?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo

	beq	Lcbc_decrypt

	bl	_vpaes_encrypt_preheat
	li	r0, 16

	beq	cr1, Lcbc_enc_loop	# $out is aligned

	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	vmr	v24, v0			# put aside iv
	sub	r9, $out, r8
	vperm	$outhead, v0, v0, $outperm	# rotate right/left

Lcbc_enc_head:
	stvebx	$outhead, r8, r9
	cmpwi	r8, 15
	addi	r8, r8, 1
	bne	Lcbc_enc_head

	sub.	r30, r30, r0		# len -= 16
	addi	$out, $out, 16
	beq	Lcbc_unaligned_done

Lcbc_enc_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
	addi	$out, $out, 16
	bne	Lcbc_enc_loop

	b	Lcbc_done

.align	5
Lcbc_decrypt:
	bl	_vpaes_decrypt_preheat
	li	r0, 16

	beq	cr1, Lcbc_dec_loop	# $out is aligned

	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	vxor	v0, v0, v24		# ^= iv
	vmr	v24, v25
	sub	r9, $out, r8
	vperm	$outhead, v0, v0, $outperm	# rotate right/left

Lcbc_dec_head:
	stvebx	$outhead, r8, r9
	cmpwi	r8, 15
	addi	r8, r8, 1
	bne	Lcbc_dec_head

	sub.	r30, r30, r0		# len -= 16
	addi	$out, $out, 16
	beq	Lcbc_unaligned_done

Lcbc_dec_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	vmr	v24, v25
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
	addi	$out, $out, 16
	bne	Lcbc_dec_loop

Lcbc_done:
	beq	cr1, Lcbc_write_iv	# $out is aligned

Lcbc_unaligned_done:
	andi.	r8, $out, 15
	sub	$out, $out, r8
	li	r9, 0
Lcbc_tail:
	stvebx	$outhead, r9, $out
	addi	r9, r9, 1
	cmpw	r9, r8
	bne	Lcbc_tail

Lcbc_write_iv:
	neg	r8, r31			# write [potentially unaligned] iv
	li	r10, 4
	?lvsl	$outperm, 0, r8
	li	r11, 8
	li	r12, 12
	vperm	v24, v24, v24, $outperm	# rotate right/left
	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned
	stvewx	v24, r10, r31
	stvewx	v24, r11, r31
	stvewx	v24, r12, r31

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
Lcbc_abort:
	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,2,6,0
	.long	0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
___
}
{
my ($inp,$bits,$out)=map("r$_",(3..5));
my $dir="cr1";
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

$code.=<<___;
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.align	4
_vpaes_key_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0xe0		# L_ipt
	li	r8,  0xf0

	vspltisb	v8,4		# 0x04..04
	vxor	v9,v9,v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	li	r11, 0x120
	lvx	$invhi, r12, r10
	li	r10, 0x130
	lvx	$iptlo, r12, r9		# Lk_ipt
	li	r9, 0x220
	lvx	$ipthi, r12, r8
	li	r8, 0x230

	lvx	v14, r12, r11		# Lk_sb1
	li	r11, 0x240
	lvx	v15, r12, r10
	li	r10, 0x250

	lvx	v16, r12, r9		# Lk_dksd
	li	r9, 0x260
	lvx	v17, r12, r8
	li	r8, 0x270
	lvx	v18, r12, r11		# Lk_dksb
	li	r11, 0x280
	lvx	v19, r12, r10
	li	r10, 0x290
	lvx	v20, r12, r9		# Lk_dkse
	li	r9, 0x2a0
	lvx	v21, r12, r8
	li	r8, 0x2b0
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v23, r12, r10

	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lks63
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.align	4
_vpaes_schedule_core:
	mflr	r7

	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	addi	$inp, $inp, 8
	?vperm	v0, v0, v6, $inpperm

	# input transform
	vmr	v3, v0			# vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa	%xmm0,	%xmm7

	bne	$dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov	\$0x30,%r8d
	li	r9, 4
	li	r10, 8
	li	r11, 12

	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu	%xmm0,	(%rdx)
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	b	Lschedule_go

Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	li	r9, 4
	li	r10, 8
	li	r11, 12
	vperm	v4, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3

	neg	r0, $out		# prepare for unaligned access
	?lvsl	$outperm, 0, r0
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	$outhead, v4, v4, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	addi	$out, $out, 15		# 15 is not a typo
	xori	r8, r8, 0x30		# xor	\$0x30, %r8

Lschedule_go:
	cmplwi	$bits, 192		# cmp	\$192,	%esi
	bgt	Lschedule_256
	beq	Lschedule_192
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
Lschedule_128:
	li	r0, 10			# mov	\$10, %esi
	mtctr	r0

Loop_schedule_128:
	bl 	_vpaes_schedule_round
	bdz 	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# write output
	b 	Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
.align	4
Lschedule_192:
	li	r0, 4			# mov	\$4,	%esi
	lvx	v0, 0, $inp
	?vperm	v0, v6, v0, $inpperm
	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	?vsldoi	v6, v0, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
	mtctr	r0

Loop_schedule_192:
	bl	_vpaes_schedule_round
	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz 	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear
	b	Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align	4
Lschedule_256:
	li	r0, 7			# mov	\$7, %esi
	addi	$inp, $inp, 8
	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	?vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform
	mtctr	r0

Loop_schedule_256:
	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	# high round
	bl	_vpaes_schedule_round
	bdz 	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	vmr	v5, v7			# vmovdqa	%xmm7,	%xmm5
	vmr	v7, v6			# vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa	%xmm5,	%xmm7

	b	Loop_schedule_256
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	4
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea	.Lk_deskew(%rip),%r11
	li	r9,  0x2f0
	bne	$dir, Lschedule_mangle_last_dec

	# encrypting
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea		.Lk_opt(%rip),	%r11	# prepare to output transform
	li	r9,  0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute

	lvx	$iptlo, r11, r12	# reload $ipt
	lvx	$ipthi, r9, r12
	addi	$out, $out, 16		# add	\$16,	%rdx
	vxor	v0, v0, v26		# vpxor		.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	li	r10, 4
	vsel	v2, $outhead, v0, $outmask
	li	r11, 8
	stvx	v2, 0, $out
	li	r12, 12
	stvewx	v0, 0, $out		# some (or all) are redundant
	stvewx	v0, r10, $out
	stvewx	v0, r11, $out
	stvewx	v0, r12, $out
	b	Lschedule_mangle_done

.align	4
Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	lvx	$ipthi, r9,  r12
	addi	$out, $out, -16		# add	\$-16,	%rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0,	(%rdx)		# save last key
	addi	r9, $out, -15		# -15 is not a typo
	vperm	v0, v0, v0, $outperm	# rotate right/left
	li	r10, 4
	vsel	v2, $outhead, v0, $outmask
	li	r11, 8
	stvx	v2, 0, $out
	li	r12, 12
	stvewx	v0, 0, r9		# some (or all) are redundant
	stvewx	v0, r10, r9
	stvewx	v0, r11, r9
	stvewx	v0, r12, r9


Lschedule_mangle_done:
	mtlr	r7
	# cleanup
	vxor	v0, v0, v0		# vpxor		%xmm0,	%xmm0,	%xmm0
	vxor	v1, v1, v1		# vpxor		%xmm1,	%xmm1,	%xmm1
	vxor	v2, v2, v2		# vpxor		%xmm2,	%xmm2,	%xmm2
	vxor	v3, v3, v3		# vpxor		%xmm3,	%xmm3,	%xmm3
	vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
	vxor	v5, v5, v5		# vpxor		%xmm5,	%xmm5,	%xmm5
	vxor	v6, v6, v6		# vpxor		%xmm6,	%xmm6,	%xmm6
	vxor	v7, v7, v7		# vpxor		%xmm7,	%xmm7,	%xmm7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
.align	4
_vpaes_schedule_192_smear:
	?vspltw	v0, v7, 3
	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor		%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor		%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	vmr	v0, v6
	?vsldoi	v6, v6, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
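##  (Loosely, and up to the byte basis used throughout this file, this is
##   the textbook expansion step
##	w[i] = w[i-Nk] ^ SubWord(RotWord(w[i-1])) ^ Rcon,
##   with the following words produced by the plain xor smear
##	w[i] = w[i-1] ^ w[i-Nk];
##   the S-box itself is evaluated with vperm lookups instead of a table
##   in memory.)
##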
.align	4
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor		%xmm4,	%xmm4,	%xmm4
	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15,	%xmm8,	%xmm4,	%xmm1
	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15,	%xmm8,	%xmm8,	%xmm8
	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7

	# rotate
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF,	%xmm0,	%xmm0
	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1,	%xmm0,	%xmm0,	%xmm0

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	# smear xmm7
	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4,	%xmm7,	%xmm1
	vxor	v7, v7, v1		# vpxor		%xmm1,	%xmm7,	%xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8,	%xmm7,	%xmm4

	# subbytes
	vand	v1, v1, v0		# vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
	vsrb	v0, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0		# 1 = i
	 vxor	v7, v7, v4		# vpxor		%xmm4,	%xmm7,	%xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	vxor	v1, v1, v0		# vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	 vxor	v7, v7, v26		# vpxor		.Lk_s63(%rip),	%xmm7,	%xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
	vxor	v2, v2, v0		# vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	vxor	v1, v1, v4		# vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor		%xmm7,	%xmm1,	%xmm0
	vxor	v7, v1, v7		# vmovdqa	%xmm0,	%xmm7
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm2
##
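##  (In AltiVec terms: the low nibble of every byte indexes the "lo"
##   table and the high nibble -- extracted with vsrb by 4 -- indexes the
##   "hi" table, both via vperm, and the two lookups are xored together.)
##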
.align	4
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand		%xmm9,	%xmm0,	%xmm1
	vsrb	v2, v0, v8		# vpsrlb	\$4,	%xmm0,	%xmm0
					# vmovdqa	(%r11),	%xmm2 	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1,	%xmm2,	%xmm2
					# vmovdqa	16(%r11),	%xmm1 # hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb	%xmm0,	%xmm1,	%xmm0
	vxor	v0, v0, v2		# vpxor		%xmm2,	%xmm0,	%xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
.align	4
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
					# vmovdqa	.Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	# encrypting
	vxor	v4, v0, v26		# vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	addi	$out, $out, 16		# add	\$16,	%rdx
	vperm	v4, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm4
	vperm	v1, v4, v4, v25		# vpshufb	%xmm5,	%xmm4,	%xmm1
	vperm	v3, v1, v1, v25		# vpshufb	%xmm5,	%xmm1,	%xmm3
	vxor	v4, v4, v1		# vpxor		%xmm1,	%xmm4,	%xmm4
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	vxor	v3, v3, v4		# vpxor		%xmm4,	%xmm3,	%xmm3

	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
	addi	r8, r8, -16		# add	\$-16,	%r8
	andi.	r8, r8, 0x30		# and	\$0x30,	%r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
	blr

.align	4
Lschedule_mangle_dec:
	# inverse mix columns
					# lea	.Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb	\$4,	%xmm4,	%xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo

					# vmovdqa	0x00(%r11),	%xmm2
	vperm	v2, v16, v16, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
					# vmovdqa	0x10(%r11),	%xmm3
	vperm	v3, v17, v17, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3

					# vmovdqa	0x20(%r11),	%xmm2
	vperm	v2, v18, v18, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
					# vmovdqa	0x30(%r11),	%xmm3
	vperm	v3, v19, v19, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3

					# vmovdqa	0x40(%r11),	%xmm2
	vperm	v2, v20, v20, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
					# vmovdqa	0x50(%r11),	%xmm3
	vperm	v3, v21, v21, v1	# vpshufb	%xmm1,	%xmm3,	%xmm3
	vxor	v3, v3, v2		# vpxor		%xmm2,	%xmm3,	%xmm3

					# vmovdqa	0x60(%r11),	%xmm2
	vperm	v2, v22, v22, v0	# vpshufb	%xmm4,	%xmm2,	%xmm2
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5,	%xmm3,	%xmm3
					# vmovdqa	0x70(%r11),	%xmm4
	vperm	v4, v23, v23, v1	# vpshufb	%xmm1,	%xmm4,	%xmm4
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),	%xmm1
	vxor	v2, v2, v3		# vpxor		%xmm3,	%xmm2,	%xmm2
	vxor	v3, v4, v2		# vpxor		%xmm2,	%xmm4,	%xmm3

	addi	$out, $out, -16		# add	\$-16,	%rdx

	vperm	v3, v3, v3, v1		# vpshufb	%xmm1,	%xmm3,	%xmm3
	addi	r8, r8, -16		# add	\$-16,	%r8
	andi.	r8, r8, 0x30		# and	\$0x30,	%r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3,	(%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_set_encrypt_key
.align	5
.vpaes_set_encrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r0
	mfspr	r6, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	li	r7, -1
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$5,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	cmplw	$dir, $bits, $bits	# set encrypt direction
	li	r8, 0x30		# mov	\$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	mtlr	r0
	xor	r3, r3, r3
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key

.globl	.vpaes_set_decrypt_key
.align	4
.vpaes_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r0
	mfspr	r6, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	li	r7, -1
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$5,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	slwi	r9, r9, 4		# shl	\$4,%eax
	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0,  `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	mtlr	r0
	xor	r3, r3, r3
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
___
}

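# A minimal, commented-out sketch of what the loop below does to a single
# "?inv"-tagged constant row on a little-endian flavour (illustrative
# only; the real code operates on $code line by line):
#
#	my @bytes = map { ($_>>24)&0xff, ($_>>16)&0xff, ($_>>8)&0xff, $_&0xff }
#		    (0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
#	@bytes = map { $_ ^ 0xf } @bytes;	# a "?rev" row would be reverse()d
#	print ".byte\t", join(',', map { sprintf("0x%02x",$_) } @bytes), "\n";
#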
my $consts=1;
foreach  (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;
	    my @bytes=();

	    # convert to endian-agnostic format
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?lvsr/lvsl/o or
	    s/\?lvsl/lvsr/o or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/\?([a-z]+)/$1/o;
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";