xref: /openssl/crypto/aes/asm/aes-sparcv9.pl (revision 54b40531)
1#! /usr/bin/env perl
2# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the License.
14# ====================================================================
15#
16# Version 1.1
17#
18# The major reason for undertaking this effort was to mitigate the
19# hazard of cache-timing attacks. This is [currently and initially!]
20# addressed in two ways. 1. S-boxes are compressed from 5KB to
21# 2KB+256B in size each. 2. References to them are scheduled for L2
22# cache latency, meaning that the tables don't have to reside in L1
23# cache. Once again, this is an initial draft and one should expect
24# more countermeasures to be implemented...
25#
26# Version 1.1 prefetches T[ed]4 in order to mitigate attacks on the
27# last round.
28#
29# Even though performance was not the primary goal [on the contrary,
30# the extra shifts "induced" by the compressed S-box and the longer loop
31# epilogue "induced" by scheduling for L2 have a negative effect on
32# performance], the code turned out to run in ~23 cycles per byte
33# en-/decrypted with a 128-bit key. This is a pretty good result for
34# code with the mentioned qualities on an UltraSPARC core. Compared to
35# Sun C generated code, my encrypt procedure runs just a few percent
36# faster, while the decrypt one is a whole 50% faster [yes, Sun C failed
37# to generate an optimal decrypt procedure]. Compared to GNU C generated
38# code, both procedures are more than 60% faster:-)
39
# The last command-line argument, if present, names the output file; with
# no (or an empty) argument the generated assembly goes to the inherited
# STDOUT.  The three-argument form of open keeps characters such as ">",
# "|" or "&" and surrounding whitespace in the name from being read as part
# of the mode, and a failed open is fatal instead of silently leaving
# STDOUT pointing wherever it pointed before.
$output = pop;
if ($output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}
41
# Symbolic stack-frame size and stack-pointer bias.  The names are pasted
# verbatim into the generated assembly (presumably they are resolved by the
# included crypto/sparc_arch.h -- confirm there).  $locals is the number of
# extra bytes the internal routines add to the frame in their "save".
($frame,$bias,$locals)=("STACK_FRAME","STACK_BIAS",16);

# Sixteen accumulators that receive the table look-ups, four per output
# word of a round.
($acc0, $acc1, $acc2, $acc3 )=("%l0","%o0","%o1","%o2");
($acc4, $acc5, $acc6, $acc7 )=("%l1","%o3","%o4","%o5");
($acc8, $acc9, $acc10,$acc11)=("%l2","%o7","%g1","%g2");
($acc12,$acc13,$acc14,$acc15)=("%l3","%g3","%g4","%g5");

# Second set of state words; the round loop alternates between $s* and $t*.
($t0,$t1,$t2,$t3)=("%l4","%l5","%l6","%l7");

# State words, table base and key pointer as seen by the internal routines
# after their "save".
($s0,$s1,$s2,$s3)=("%i0","%i1","%i2","%i3");
($tbl,$key)=("%i4","%i5");

# The round counter shares %i7 with the return address, which is therefore
# off-loaded to the stack for the duration of the routine.
$rounds="%i7";
78
# Append one table entry per argument to the global $code buffer.
#
# Each 32-bit argument is written twice, as a pair of identical .long
# values, so every entry is eight bytes wide.  The round code fetches an
# entry with a 64-bit ldx and shifts it right by 8, 16 or 24 bits; because
# both halves are equal, the low 32 bits of the result are the word rotated
# by that many bits.
#
# Arguments: a list of 32-bit integers.  Processing stops at the first
# undefined element, or when the list is exhausted.
# Returns:   nothing meaningful; the sub is called for its effect on $code.
#
# No prototype is declared.  The former empty prototype "()" described a
# sub taking no arguments at all and only went unnoticed because every call
# site uses the prototype-bypassing &-form, which keeps working unchanged.
sub _data_word
{
    while(defined(my $word=shift)) {
	$code.=sprintf"\t.long\t0x%08x,0x%08x\n",$word,$word;
    }
}
83
# Assembler preamble: make sure __ASSEMBLER__ is defined before pulling in
# crypto/sparc_arch.h, declare %g2 and %g3 as scratch registers for
# __arch64__ builds (both serve as accumulators, see $acc11 and $acc13),
# then switch to the text section and open the 256-byte-aligned AES_Te
# table whose contents are generated by the statements that follow.
84$code.=<<___;
85#ifndef __ASSEMBLER__
86# define __ASSEMBLER__ 1
87#endif
88#include "crypto/sparc_arch.h"
89
90#ifdef  __arch64__
91.register	%g2,#scratch
92.register	%g3,#scratch
93#endif
94.section	".text",#alloc,#execinstr
95
96.align	256
97AES_Te:
98___
# Word part of AES_Te: 256 constants.  _data_word() writes each of them
# twice, so an entry occupies 8 bytes and the whole table 2048 bytes.  The
# encryption round code addresses it with byte offsets masked by 2040
# (255*8) and reads entries with 64-bit ldx loads.  The &-form call is
# required here because _data_word is declared with an empty prototype.
99&_data_word(
100	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
101	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
102	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
103	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
104	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
105	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
106	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
107	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
108	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
109	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
110	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
111	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
112	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
113	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
114	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
115	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
116	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
117	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
118	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
119	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
120	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
121	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
122	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
123	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
124	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
125	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
126	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
127	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
128	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
129	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
130	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
131	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
132	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
133	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
134	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
135	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
136	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
137	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
138	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
139	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
140	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
141	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
142	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
143	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
144	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
145	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
146	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
147	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
148	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
149	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
150	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
151	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
152	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
153	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
154	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
155	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
156	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
157	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
158	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
159	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
160	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
161	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
162	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
163	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of the encryption code, emitted as a single heredoc:
#
# * 256 single-byte entries placed directly behind the 2048-byte word
#   table (the "T[ed]4" of the header comment).  Only the last round uses
#   them, through ldub relative to $tbl+2048.
#
# * _sparcv9_AES_encrypt -- internal routine.  It expects the four 32-bit
#   state words in $s0..$s3 (%i0..%i3 after its own "save", i.e. the
#   caller's %o0..%o3), the table address in $tbl and the key pointer in
#   $key.  Because $rounds aliases %i7, the return address is stored in the
#   extra $locals bytes of the frame on entry and reloaded before "ret".
#   The state is first xor-ed with the key words at offsets 0..12.  The
#   round count is read from offset 240 of the key and halved: one pass
#   through .Lenc_loop computes a round from $s* into $t* and, unless the
#   counter reached zero, a second one from $t* back into $s*, advancing
#   $key by 32 bytes per pass.  Inside the loop the byte table is touched
#   by eight loads into %g0 at a 32-byte stride, purely as a prefetch.
#   On exit the annulled delay slot turns $rounds into a pointer to the
#   byte table; .Lenc_last finishes the pending xors, performs the final
#   round with byte look-ups and leaves the result in $s0..$s3.
#
# * AES_encrypt -- global entry point taking three pointers in %o0, %o1
#   and %o2 (input block, output block and key, judging by their use).
#   When input and output are both 4-byte aligned the block is moved with
#   word loads and stores; otherwise .Lunaligned_enc gathers and scatters
#   it byte by byte, most significant byte first.  The table address is
#   formed PC-relatively with "call .+8" plus "add %o7,AES_Te-1b".
#
# Every "fmovs %f0,%f0" line is removed from $code again by the
# substitution that runs just before the text is printed.
164$code.=<<___;
165	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
166	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
167	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
168	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
169	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
170	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
171	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
172	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
173	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
174	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
175	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
176	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
177	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
178	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
179	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
180	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
181	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
182	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
183	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
184	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
185	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
186	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
187	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
188	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
189	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
190	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
191	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
192	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
193	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
194	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
195	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
196	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
197.type	AES_Te,#object
198.size	AES_Te,(.-AES_Te)
199
200.align	64
201.skip	16
202_sparcv9_AES_encrypt:
203	save	%sp,-$frame-$locals,%sp
204	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
205	ld	[$key+240],$rounds
206	ld	[$key+0],$t0
207	ld	[$key+4],$t1			!
208	ld	[$key+8],$t2
209	srl	$rounds,1,$rounds
210	xor	$t0,$s0,$s0
211	ld	[$key+12],$t3
212	srl	$s0,21,$acc0
213	xor	$t1,$s1,$s1
214	ld	[$key+16],$t0
215	srl	$s1,13,$acc1			!
216	xor	$t2,$s2,$s2
217	ld	[$key+20],$t1
218	xor	$t3,$s3,$s3
219	ld	[$key+24],$t2
220	and	$acc0,2040,$acc0
221	ld	[$key+28],$t3
222	nop
223.Lenc_loop:
224	srl	$s2,5,$acc2			!
225	and	$acc1,2040,$acc1
226	ldx	[$tbl+$acc0],$acc0
227	sll	$s3,3,$acc3
228	and	$acc2,2040,$acc2
229	ldx	[$tbl+$acc1],$acc1
230	srl	$s1,21,$acc4
231	and	$acc3,2040,$acc3
232	ldx	[$tbl+$acc2],$acc2		!
233	srl	$s2,13,$acc5
234	and	$acc4,2040,$acc4
235	ldx	[$tbl+$acc3],$acc3
236	srl	$s3,5,$acc6
237	and	$acc5,2040,$acc5
238	ldx	[$tbl+$acc4],$acc4
239	fmovs	%f0,%f0
240	sll	$s0,3,$acc7			!
241	and	$acc6,2040,$acc6
242	ldx	[$tbl+$acc5],$acc5
243	srl	$s2,21,$acc8
244	and	$acc7,2040,$acc7
245	ldx	[$tbl+$acc6],$acc6
246	srl	$s3,13,$acc9
247	and	$acc8,2040,$acc8
248	ldx	[$tbl+$acc7],$acc7		!
249	srl	$s0,5,$acc10
250	and	$acc9,2040,$acc9
251	ldx	[$tbl+$acc8],$acc8
252	sll	$s1,3,$acc11
253	and	$acc10,2040,$acc10
254	ldx	[$tbl+$acc9],$acc9
255	fmovs	%f0,%f0
256	srl	$s3,21,$acc12			!
257	and	$acc11,2040,$acc11
258	ldx	[$tbl+$acc10],$acc10
259	srl	$s0,13,$acc13
260	and	$acc12,2040,$acc12
261	ldx	[$tbl+$acc11],$acc11
262	srl	$s1,5,$acc14
263	and	$acc13,2040,$acc13
264	ldx	[$tbl+$acc12],$acc12		!
265	sll	$s2,3,$acc15
266	and	$acc14,2040,$acc14
267	ldx	[$tbl+$acc13],$acc13
268	and	$acc15,2040,$acc15
269	add	$key,32,$key
270	ldx	[$tbl+$acc14],$acc14
271	fmovs	%f0,%f0
272	subcc	$rounds,1,$rounds		!
273	ldx	[$tbl+$acc15],$acc15
274	bz,a,pn	%icc,.Lenc_last
275	add	$tbl,2048,$rounds
276
277		srlx	$acc1,8,$acc1
278		xor	$acc0,$t0,$t0
279	ld	[$key+0],$s0
280	fmovs	%f0,%f0
281		srlx	$acc2,16,$acc2		!
282		xor	$acc1,$t0,$t0
283	ld	[$key+4],$s1
284		srlx	$acc3,24,$acc3
285		xor	$acc2,$t0,$t0
286	ld	[$key+8],$s2
287		srlx	$acc5,8,$acc5
288		xor	$acc3,$t0,$t0
289	ld	[$key+12],$s3			!
290		srlx	$acc6,16,$acc6
291		xor	$acc4,$t1,$t1
292	fmovs	%f0,%f0
293		srlx	$acc7,24,$acc7
294		xor	$acc5,$t1,$t1
295		srlx	$acc9,8,$acc9
296		xor	$acc6,$t1,$t1
297		srlx	$acc10,16,$acc10	!
298		xor	$acc7,$t1,$t1
299		srlx	$acc11,24,$acc11
300		xor	$acc8,$t2,$t2
301		srlx	$acc13,8,$acc13
302		xor	$acc9,$t2,$t2
303		srlx	$acc14,16,$acc14
304		xor	$acc10,$t2,$t2
305		srlx	$acc15,24,$acc15	!
306		xor	$acc11,$t2,$t2
307		xor	$acc12,$acc14,$acc14
308		xor	$acc13,$t3,$t3
309	srl	$t0,21,$acc0
310		xor	$acc14,$t3,$t3
311	srl	$t1,13,$acc1
312		xor	$acc15,$t3,$t3
313
314	and	$acc0,2040,$acc0		!
315	srl	$t2,5,$acc2
316	and	$acc1,2040,$acc1
317	ldx	[$tbl+$acc0],$acc0
318	sll	$t3,3,$acc3
319	and	$acc2,2040,$acc2
320	ldx	[$tbl+$acc1],$acc1
321	fmovs	%f0,%f0
322	srl	$t1,21,$acc4			!
323	and	$acc3,2040,$acc3
324	ldx	[$tbl+$acc2],$acc2
325	srl	$t2,13,$acc5
326	and	$acc4,2040,$acc4
327	ldx	[$tbl+$acc3],$acc3
328	srl	$t3,5,$acc6
329	and	$acc5,2040,$acc5
330	ldx	[$tbl+$acc4],$acc4		!
331	sll	$t0,3,$acc7
332	and	$acc6,2040,$acc6
333	ldx	[$tbl+$acc5],$acc5
334	srl	$t2,21,$acc8
335	and	$acc7,2040,$acc7
336	ldx	[$tbl+$acc6],$acc6
337	fmovs	%f0,%f0
338	srl	$t3,13,$acc9			!
339	and	$acc8,2040,$acc8
340	ldx	[$tbl+$acc7],$acc7
341	srl	$t0,5,$acc10
342	and	$acc9,2040,$acc9
343	ldx	[$tbl+$acc8],$acc8
344	sll	$t1,3,$acc11
345	and	$acc10,2040,$acc10
346	ldx	[$tbl+$acc9],$acc9		!
347	srl	$t3,21,$acc12
348	and	$acc11,2040,$acc11
349	ldx	[$tbl+$acc10],$acc10
350	srl	$t0,13,$acc13
351	and	$acc12,2040,$acc12
352	ldx	[$tbl+$acc11],$acc11
353	fmovs	%f0,%f0
354	srl	$t1,5,$acc14			!
355	and	$acc13,2040,$acc13
356	ldx	[$tbl+$acc12],$acc12
357	sll	$t2,3,$acc15
358	and	$acc14,2040,$acc14
359	ldx	[$tbl+$acc13],$acc13
360		srlx	$acc1,8,$acc1
361	and	$acc15,2040,$acc15
362	ldx	[$tbl+$acc14],$acc14		!
363
364		srlx	$acc2,16,$acc2
365		xor	$acc0,$s0,$s0
366	ldx	[$tbl+$acc15],$acc15
367		srlx	$acc3,24,$acc3
368		xor	$acc1,$s0,$s0
369	ld	[$key+16],$t0
370	fmovs	%f0,%f0
371		srlx	$acc5,8,$acc5		!
372		xor	$acc2,$s0,$s0
373	ld	[$key+20],$t1
374		srlx	$acc6,16,$acc6
375		xor	$acc3,$s0,$s0
376	ld	[$key+24],$t2
377		srlx	$acc7,24,$acc7
378		xor	$acc4,$s1,$s1
379	ld	[$key+28],$t3			!
380		srlx	$acc9,8,$acc9
381		xor	$acc5,$s1,$s1
382	ldx	[$tbl+2048+0],%g0		! prefetch te4
383		srlx	$acc10,16,$acc10
384		xor	$acc6,$s1,$s1
385	ldx	[$tbl+2048+32],%g0		! prefetch te4
386		srlx	$acc11,24,$acc11
387		xor	$acc7,$s1,$s1
388	ldx	[$tbl+2048+64],%g0		! prefetch te4
389		srlx	$acc13,8,$acc13
390		xor	$acc8,$s2,$s2
391	ldx	[$tbl+2048+96],%g0		! prefetch te4
392		srlx	$acc14,16,$acc14	!
393		xor	$acc9,$s2,$s2
394	ldx	[$tbl+2048+128],%g0		! prefetch te4
395		srlx	$acc15,24,$acc15
396		xor	$acc10,$s2,$s2
397	ldx	[$tbl+2048+160],%g0		! prefetch te4
398	srl	$s0,21,$acc0
399		xor	$acc11,$s2,$s2
400	ldx	[$tbl+2048+192],%g0		! prefetch te4
401		xor	$acc12,$acc14,$acc14
402		xor	$acc13,$s3,$s3
403	ldx	[$tbl+2048+224],%g0		! prefetch te4
404	srl	$s1,13,$acc1			!
405		xor	$acc14,$s3,$s3
406		xor	$acc15,$s3,$s3
407	ba	.Lenc_loop
408	and	$acc0,2040,$acc0
409
410.align	32
411.Lenc_last:
412		srlx	$acc1,8,$acc1		!
413		xor	$acc0,$t0,$t0
414	ld	[$key+0],$s0
415		srlx	$acc2,16,$acc2
416		xor	$acc1,$t0,$t0
417	ld	[$key+4],$s1
418		srlx	$acc3,24,$acc3
419		xor	$acc2,$t0,$t0
420	ld	[$key+8],$s2			!
421		srlx	$acc5,8,$acc5
422		xor	$acc3,$t0,$t0
423	ld	[$key+12],$s3
424		srlx	$acc6,16,$acc6
425		xor	$acc4,$t1,$t1
426		srlx	$acc7,24,$acc7
427		xor	$acc5,$t1,$t1
428		srlx	$acc9,8,$acc9		!
429		xor	$acc6,$t1,$t1
430		srlx	$acc10,16,$acc10
431		xor	$acc7,$t1,$t1
432		srlx	$acc11,24,$acc11
433		xor	$acc8,$t2,$t2
434		srlx	$acc13,8,$acc13
435		xor	$acc9,$t2,$t2
436		srlx	$acc14,16,$acc14	!
437		xor	$acc10,$t2,$t2
438		srlx	$acc15,24,$acc15
439		xor	$acc11,$t2,$t2
440		xor	$acc12,$acc14,$acc14
441		xor	$acc13,$t3,$t3
442	srl	$t0,24,$acc0
443		xor	$acc14,$t3,$t3
444	srl	$t1,16,$acc1			!
445		xor	$acc15,$t3,$t3
446
447	srl	$t2,8,$acc2
448	and	$acc1,255,$acc1
449	ldub	[$rounds+$acc0],$acc0
450	srl	$t1,24,$acc4
451	and	$acc2,255,$acc2
452	ldub	[$rounds+$acc1],$acc1
453	srl	$t2,16,$acc5			!
454	and	$t3,255,$acc3
455	ldub	[$rounds+$acc2],$acc2
456	ldub	[$rounds+$acc3],$acc3
457	srl	$t3,8,$acc6
458	and	$acc5,255,$acc5
459	ldub	[$rounds+$acc4],$acc4
460	fmovs	%f0,%f0
461	srl	$t2,24,$acc8			!
462	and	$acc6,255,$acc6
463	ldub	[$rounds+$acc5],$acc5
464	srl	$t3,16,$acc9
465	and	$t0,255,$acc7
466	ldub	[$rounds+$acc6],$acc6
467	ldub	[$rounds+$acc7],$acc7
468	fmovs	%f0,%f0
469	srl	$t0,8,$acc10			!
470	and	$acc9,255,$acc9
471	ldub	[$rounds+$acc8],$acc8
472	srl	$t3,24,$acc12
473	and	$acc10,255,$acc10
474	ldub	[$rounds+$acc9],$acc9
475	srl	$t0,16,$acc13
476	and	$t1,255,$acc11
477	ldub	[$rounds+$acc10],$acc10		!
478	srl	$t1,8,$acc14
479	and	$acc13,255,$acc13
480	ldub	[$rounds+$acc11],$acc11
481	ldub	[$rounds+$acc12],$acc12
482	and	$acc14,255,$acc14
483	ldub	[$rounds+$acc13],$acc13
484	and	$t2,255,$acc15
485	ldub	[$rounds+$acc14],$acc14		!
486
487		sll	$acc0,24,$acc0
488		xor	$acc3,$s0,$s0
489	ldub	[$rounds+$acc15],$acc15
490		sll	$acc1,16,$acc1
491		xor	$acc0,$s0,$s0
492	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
493	fmovs	%f0,%f0
494		sll	$acc2,8,$acc2		!
495		xor	$acc1,$s0,$s0
496		sll	$acc4,24,$acc4
497		xor	$acc2,$s0,$s0
498		sll	$acc5,16,$acc5
499		xor	$acc7,$s1,$s1
500		sll	$acc6,8,$acc6
501		xor	$acc4,$s1,$s1
502		sll	$acc8,24,$acc8		!
503		xor	$acc5,$s1,$s1
504		sll	$acc9,16,$acc9
505		xor	$acc11,$s2,$s2
506		sll	$acc10,8,$acc10
507		xor	$acc6,$s1,$s1
508		sll	$acc12,24,$acc12
509		xor	$acc8,$s2,$s2
510		sll	$acc13,16,$acc13	!
511		xor	$acc9,$s2,$s2
512		sll	$acc14,8,$acc14
513		xor	$acc10,$s2,$s2
514		xor	$acc12,$acc14,$acc14
515		xor	$acc13,$s3,$s3
516		xor	$acc14,$s3,$s3
517		xor	$acc15,$s3,$s3
518
519	ret
520	restore
521.type	_sparcv9_AES_encrypt,#function
522.size	_sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
523
524.align	32
525.globl	AES_encrypt
526AES_encrypt:
527	or	%o0,%o1,%g1
528	andcc	%g1,3,%g0
529	bnz,pn	%xcc,.Lunaligned_enc
530	save	%sp,-$frame,%sp
531
532	ld	[%i0+0],%o0
533	ld	[%i0+4],%o1
534	ld	[%i0+8],%o2
535	ld	[%i0+12],%o3
536
5371:	call	.+8
538	add	%o7,AES_Te-1b,%o4
539	call	_sparcv9_AES_encrypt
540	mov	%i2,%o5
541
542	st	%o0,[%i1+0]
543	st	%o1,[%i1+4]
544	st	%o2,[%i1+8]
545	st	%o3,[%i1+12]
546
547	ret
548	restore
549
550.align	32
551.Lunaligned_enc:
552	ldub	[%i0+0],%l0
553	ldub	[%i0+1],%l1
554	ldub	[%i0+2],%l2
555
556	sll	%l0,24,%l0
557	ldub	[%i0+3],%l3
558	sll	%l1,16,%l1
559	ldub	[%i0+4],%l4
560	sll	%l2,8,%l2
561	or	%l1,%l0,%l0
562	ldub	[%i0+5],%l5
563	sll	%l4,24,%l4
564	or	%l3,%l2,%l2
565	ldub	[%i0+6],%l6
566	sll	%l5,16,%l5
567	or	%l0,%l2,%o0
568	ldub	[%i0+7],%l7
569
570	sll	%l6,8,%l6
571	or	%l5,%l4,%l4
572	ldub	[%i0+8],%l0
573	or	%l7,%l6,%l6
574	ldub	[%i0+9],%l1
575	or	%l4,%l6,%o1
576	ldub	[%i0+10],%l2
577
578	sll	%l0,24,%l0
579	ldub	[%i0+11],%l3
580	sll	%l1,16,%l1
581	ldub	[%i0+12],%l4
582	sll	%l2,8,%l2
583	or	%l1,%l0,%l0
584	ldub	[%i0+13],%l5
585	sll	%l4,24,%l4
586	or	%l3,%l2,%l2
587	ldub	[%i0+14],%l6
588	sll	%l5,16,%l5
589	or	%l0,%l2,%o2
590	ldub	[%i0+15],%l7
591
592	sll	%l6,8,%l6
593	or	%l5,%l4,%l4
594	or	%l7,%l6,%l6
595	or	%l4,%l6,%o3
596
5971:	call	.+8
598	add	%o7,AES_Te-1b,%o4
599	call	_sparcv9_AES_encrypt
600	mov	%i2,%o5
601
602	srl	%o0,24,%l0
603	srl	%o0,16,%l1
604	stb	%l0,[%i1+0]
605	srl	%o0,8,%l2
606	stb	%l1,[%i1+1]
607	stb	%l2,[%i1+2]
608	srl	%o1,24,%l4
609	stb	%o0,[%i1+3]
610
611	srl	%o1,16,%l5
612	stb	%l4,[%i1+4]
613	srl	%o1,8,%l6
614	stb	%l5,[%i1+5]
615	stb	%l6,[%i1+6]
616	srl	%o2,24,%l0
617	stb	%o1,[%i1+7]
618
619	srl	%o2,16,%l1
620	stb	%l0,[%i1+8]
621	srl	%o2,8,%l2
622	stb	%l1,[%i1+9]
623	stb	%l2,[%i1+10]
624	srl	%o3,24,%l4
625	stb	%o2,[%i1+11]
626
627	srl	%o3,16,%l5
628	stb	%l4,[%i1+12]
629	srl	%o3,8,%l6
630	stb	%l5,[%i1+13]
631	stb	%l6,[%i1+14]
632	stb	%o3,[%i1+15]
633
634	ret
635	restore
636.type	AES_encrypt,#function
637.size	AES_encrypt,(.-AES_encrypt)
638
639___
640
# Open the 256-byte-aligned AES_Td table; its contents are generated by
# the statements that follow.
641$code.=<<___;
642.align	256
643AES_Td:
644___
# Word part of AES_Td: 256 constants, each written twice by _data_word(),
# giving 8-byte entries and a 2048-byte table.  The decryption round code
# addresses it exactly like AES_Te: byte offsets masked by 2040 and 64-bit
# ldx loads.  The &-form call is required here because _data_word is
# declared with an empty prototype.
645&_data_word(
646	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
647	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
648	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
649	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
650	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
651	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
652	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
653	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
654	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
655	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
656	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
657	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
658	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
659	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
660	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
661	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
662	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
663	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
664	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
665	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
666	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
667	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
668	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
669	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
670	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
671	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
672	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
673	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
674	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
675	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
676	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
677	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
678	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
679	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
680	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
681	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
682	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
683	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
684	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
685	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
686	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
687	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
688	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
689	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
690	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
691	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
692	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
693	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
694	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
695	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
696	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
697	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
698	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
699	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
700	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
701	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
702	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
703	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
704	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
705	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
706	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
707	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
708	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
709	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Remainder of the decryption code, emitted as a single heredoc:
#
# * 256 single-byte entries placed directly behind the 2048-byte word
#   table; only the last round uses them, through ldub relative to
#   $tbl+2048.
#
# * _sparcv9_AES_decrypt -- internal routine with the same register
#   contract and overall structure as _sparcv9_AES_encrypt: state in
#   $s0..$s3, table in $tbl, key pointer in $key, return address parked on
#   the stack because $rounds aliases %i7, round count read from offset 240
#   of the key and halved, two rounds per pass through .Ldec_loop, eight
#   prefetching loads of the byte table into %g0, and a byte-table last
#   round in .Ldec_last.  The difference lies in which state word feeds
#   each look-up: the first output word, for instance, is built from
#   $s0, $s3, $s2, $s1 where the encryption routine uses $s0, $s1, $s2, $s3.
#
# * AES_decrypt -- global entry point taking three pointers in %o0, %o1
#   and %o2 (input block, output block and key, judging by their use).
#   Aligned buffers are moved with word loads and stores; otherwise
#   .Lunaligned_dec gathers and scatters the block byte by byte, most
#   significant byte first.  The table address is formed PC-relatively
#   with "call .+8" plus "add %o7,AES_Td-1b".
#
# Every "fmovs %f0,%f0" line is removed from $code again by the
# substitution that runs just before the text is printed.
710$code.=<<___;
711	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
712	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
713	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
714	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
715	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
716	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
717	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
718	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
719	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
720	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
721	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
722	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
723	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
724	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
725	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
726	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
727	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
728	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
729	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
730	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
731	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
732	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
733	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
734	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
735	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
736	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
737	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
738	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
739	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
740	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
741	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
742	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
743.type	AES_Td,#object
744.size	AES_Td,(.-AES_Td)
745
746.align	64
747.skip	16
748_sparcv9_AES_decrypt:
749	save	%sp,-$frame-$locals,%sp
750	stx	%i7,[%sp+$bias+$frame+0]	! off-load return address
751	ld	[$key+240],$rounds
752	ld	[$key+0],$t0
753	ld	[$key+4],$t1			!
754	ld	[$key+8],$t2
755	ld	[$key+12],$t3
756	srl	$rounds,1,$rounds
757	xor	$t0,$s0,$s0
758	ld	[$key+16],$t0
759	xor	$t1,$s1,$s1
760	ld	[$key+20],$t1
761	srl	$s0,21,$acc0			!
762	xor	$t2,$s2,$s2
763	ld	[$key+24],$t2
764	xor	$t3,$s3,$s3
765	and	$acc0,2040,$acc0
766	ld	[$key+28],$t3
767	srl	$s3,13,$acc1
768	nop
769.Ldec_loop:
770	srl	$s2,5,$acc2			!
771	and	$acc1,2040,$acc1
772	ldx	[$tbl+$acc0],$acc0
773	sll	$s1,3,$acc3
774	and	$acc2,2040,$acc2
775	ldx	[$tbl+$acc1],$acc1
776	srl	$s1,21,$acc4
777	and	$acc3,2040,$acc3
778	ldx	[$tbl+$acc2],$acc2		!
779	srl	$s0,13,$acc5
780	and	$acc4,2040,$acc4
781	ldx	[$tbl+$acc3],$acc3
782	srl	$s3,5,$acc6
783	and	$acc5,2040,$acc5
784	ldx	[$tbl+$acc4],$acc4
785	fmovs	%f0,%f0
786	sll	$s2,3,$acc7			!
787	and	$acc6,2040,$acc6
788	ldx	[$tbl+$acc5],$acc5
789	srl	$s2,21,$acc8
790	and	$acc7,2040,$acc7
791	ldx	[$tbl+$acc6],$acc6
792	srl	$s1,13,$acc9
793	and	$acc8,2040,$acc8
794	ldx	[$tbl+$acc7],$acc7		!
795	srl	$s0,5,$acc10
796	and	$acc9,2040,$acc9
797	ldx	[$tbl+$acc8],$acc8
798	sll	$s3,3,$acc11
799	and	$acc10,2040,$acc10
800	ldx	[$tbl+$acc9],$acc9
801	fmovs	%f0,%f0
802	srl	$s3,21,$acc12			!
803	and	$acc11,2040,$acc11
804	ldx	[$tbl+$acc10],$acc10
805	srl	$s2,13,$acc13
806	and	$acc12,2040,$acc12
807	ldx	[$tbl+$acc11],$acc11
808	srl	$s1,5,$acc14
809	and	$acc13,2040,$acc13
810	ldx	[$tbl+$acc12],$acc12		!
811	sll	$s0,3,$acc15
812	and	$acc14,2040,$acc14
813	ldx	[$tbl+$acc13],$acc13
814	and	$acc15,2040,$acc15
815	add	$key,32,$key
816	ldx	[$tbl+$acc14],$acc14
817	fmovs	%f0,%f0
818	subcc	$rounds,1,$rounds		!
819	ldx	[$tbl+$acc15],$acc15
820	bz,a,pn	%icc,.Ldec_last
821	add	$tbl,2048,$rounds
822
823		srlx	$acc1,8,$acc1
824		xor	$acc0,$t0,$t0
825	ld	[$key+0],$s0
826	fmovs	%f0,%f0
827		srlx	$acc2,16,$acc2		!
828		xor	$acc1,$t0,$t0
829	ld	[$key+4],$s1
830		srlx	$acc3,24,$acc3
831		xor	$acc2,$t0,$t0
832	ld	[$key+8],$s2
833		srlx	$acc5,8,$acc5
834		xor	$acc3,$t0,$t0
835	ld	[$key+12],$s3			!
836		srlx	$acc6,16,$acc6
837		xor	$acc4,$t1,$t1
838	fmovs	%f0,%f0
839		srlx	$acc7,24,$acc7
840		xor	$acc5,$t1,$t1
841		srlx	$acc9,8,$acc9
842		xor	$acc6,$t1,$t1
843		srlx	$acc10,16,$acc10	!
844		xor	$acc7,$t1,$t1
845		srlx	$acc11,24,$acc11
846		xor	$acc8,$t2,$t2
847		srlx	$acc13,8,$acc13
848		xor	$acc9,$t2,$t2
849		srlx	$acc14,16,$acc14
850		xor	$acc10,$t2,$t2
851		srlx	$acc15,24,$acc15	!
852		xor	$acc11,$t2,$t2
853		xor	$acc12,$acc14,$acc14
854		xor	$acc13,$t3,$t3
855	srl	$t0,21,$acc0
856		xor	$acc14,$t3,$t3
857		xor	$acc15,$t3,$t3
858	srl	$t3,13,$acc1
859
860	and	$acc0,2040,$acc0		!
861	srl	$t2,5,$acc2
862	and	$acc1,2040,$acc1
863	ldx	[$tbl+$acc0],$acc0
864	sll	$t1,3,$acc3
865	and	$acc2,2040,$acc2
866	ldx	[$tbl+$acc1],$acc1
867	fmovs	%f0,%f0
868	srl	$t1,21,$acc4			!
869	and	$acc3,2040,$acc3
870	ldx	[$tbl+$acc2],$acc2
871	srl	$t0,13,$acc5
872	and	$acc4,2040,$acc4
873	ldx	[$tbl+$acc3],$acc3
874	srl	$t3,5,$acc6
875	and	$acc5,2040,$acc5
876	ldx	[$tbl+$acc4],$acc4		!
877	sll	$t2,3,$acc7
878	and	$acc6,2040,$acc6
879	ldx	[$tbl+$acc5],$acc5
880	srl	$t2,21,$acc8
881	and	$acc7,2040,$acc7
882	ldx	[$tbl+$acc6],$acc6
883	fmovs	%f0,%f0
884	srl	$t1,13,$acc9			!
885	and	$acc8,2040,$acc8
886	ldx	[$tbl+$acc7],$acc7
887	srl	$t0,5,$acc10
888	and	$acc9,2040,$acc9
889	ldx	[$tbl+$acc8],$acc8
890	sll	$t3,3,$acc11
891	and	$acc10,2040,$acc10
892	ldx	[$tbl+$acc9],$acc9		!
893	srl	$t3,21,$acc12
894	and	$acc11,2040,$acc11
895	ldx	[$tbl+$acc10],$acc10
896	srl	$t2,13,$acc13
897	and	$acc12,2040,$acc12
898	ldx	[$tbl+$acc11],$acc11
899	fmovs	%f0,%f0
900	srl	$t1,5,$acc14			!
901	and	$acc13,2040,$acc13
902	ldx	[$tbl+$acc12],$acc12
903	sll	$t0,3,$acc15
904	and	$acc14,2040,$acc14
905	ldx	[$tbl+$acc13],$acc13
906		srlx	$acc1,8,$acc1
907	and	$acc15,2040,$acc15
908	ldx	[$tbl+$acc14],$acc14		!
909
910		srlx	$acc2,16,$acc2
911		xor	$acc0,$s0,$s0
912	ldx	[$tbl+$acc15],$acc15
913		srlx	$acc3,24,$acc3
914		xor	$acc1,$s0,$s0
915	ld	[$key+16],$t0
916	fmovs	%f0,%f0
917		srlx	$acc5,8,$acc5		!
918		xor	$acc2,$s0,$s0
919	ld	[$key+20],$t1
920		srlx	$acc6,16,$acc6
921		xor	$acc3,$s0,$s0
922	ld	[$key+24],$t2
923		srlx	$acc7,24,$acc7
924		xor	$acc4,$s1,$s1
925	ld	[$key+28],$t3			!
926		srlx	$acc9,8,$acc9
927		xor	$acc5,$s1,$s1
928	ldx	[$tbl+2048+0],%g0		! prefetch td4
929		srlx	$acc10,16,$acc10
930		xor	$acc6,$s1,$s1
931	ldx	[$tbl+2048+32],%g0		! prefetch td4
932		srlx	$acc11,24,$acc11
933		xor	$acc7,$s1,$s1
934	ldx	[$tbl+2048+64],%g0		! prefetch td4
935		srlx	$acc13,8,$acc13
936		xor	$acc8,$s2,$s2
937	ldx	[$tbl+2048+96],%g0		! prefetch td4
938		srlx	$acc14,16,$acc14	!
939		xor	$acc9,$s2,$s2
940	ldx	[$tbl+2048+128],%g0		! prefetch td4
941		srlx	$acc15,24,$acc15
942		xor	$acc10,$s2,$s2
943	ldx	[$tbl+2048+160],%g0		! prefetch td4
944	srl	$s0,21,$acc0
945		xor	$acc11,$s2,$s2
946	ldx	[$tbl+2048+192],%g0		! prefetch td4
947		xor	$acc12,$acc14,$acc14
948		xor	$acc13,$s3,$s3
949	ldx	[$tbl+2048+224],%g0		! prefetch td4
950	and	$acc0,2040,$acc0		!
951		xor	$acc14,$s3,$s3
952		xor	$acc15,$s3,$s3
953	ba	.Ldec_loop
954	srl	$s3,13,$acc1
955
956.align	32
957.Ldec_last:
958		srlx	$acc1,8,$acc1		!
959		xor	$acc0,$t0,$t0
960	ld	[$key+0],$s0
961		srlx	$acc2,16,$acc2
962		xor	$acc1,$t0,$t0
963	ld	[$key+4],$s1
964		srlx	$acc3,24,$acc3
965		xor	$acc2,$t0,$t0
966	ld	[$key+8],$s2			!
967		srlx	$acc5,8,$acc5
968		xor	$acc3,$t0,$t0
969	ld	[$key+12],$s3
970		srlx	$acc6,16,$acc6
971		xor	$acc4,$t1,$t1
972		srlx	$acc7,24,$acc7
973		xor	$acc5,$t1,$t1
974		srlx	$acc9,8,$acc9		!
975		xor	$acc6,$t1,$t1
976		srlx	$acc10,16,$acc10
977		xor	$acc7,$t1,$t1
978		srlx	$acc11,24,$acc11
979		xor	$acc8,$t2,$t2
980		srlx	$acc13,8,$acc13
981		xor	$acc9,$t2,$t2
982		srlx	$acc14,16,$acc14	!
983		xor	$acc10,$t2,$t2
984		srlx	$acc15,24,$acc15
985		xor	$acc11,$t2,$t2
986		xor	$acc12,$acc14,$acc14
987		xor	$acc13,$t3,$t3
988	srl	$t0,24,$acc0
989		xor	$acc14,$t3,$t3
990		xor	$acc15,$t3,$t3		!
991	srl	$t3,16,$acc1
992
993	srl	$t2,8,$acc2
994	and	$acc1,255,$acc1
995	ldub	[$rounds+$acc0],$acc0
996	srl	$t1,24,$acc4
997	and	$acc2,255,$acc2
998	ldub	[$rounds+$acc1],$acc1
999	srl	$t0,16,$acc5			!
1000	and	$t1,255,$acc3
1001	ldub	[$rounds+$acc2],$acc2
1002	ldub	[$rounds+$acc3],$acc3
1003	srl	$t3,8,$acc6
1004	and	$acc5,255,$acc5
1005	ldub	[$rounds+$acc4],$acc4
1006	fmovs	%f0,%f0
1007	srl	$t2,24,$acc8			!
1008	and	$acc6,255,$acc6
1009	ldub	[$rounds+$acc5],$acc5
1010	srl	$t1,16,$acc9
1011	and	$t2,255,$acc7
1012	ldub	[$rounds+$acc6],$acc6
1013	ldub	[$rounds+$acc7],$acc7
1014	fmovs	%f0,%f0
1015	srl	$t0,8,$acc10			!
1016	and	$acc9,255,$acc9
1017	ldub	[$rounds+$acc8],$acc8
1018	srl	$t3,24,$acc12
1019	and	$acc10,255,$acc10
1020	ldub	[$rounds+$acc9],$acc9
1021	srl	$t2,16,$acc13
1022	and	$t3,255,$acc11
1023	ldub	[$rounds+$acc10],$acc10		!
1024	srl	$t1,8,$acc14
1025	and	$acc13,255,$acc13
1026	ldub	[$rounds+$acc11],$acc11
1027	ldub	[$rounds+$acc12],$acc12
1028	and	$acc14,255,$acc14
1029	ldub	[$rounds+$acc13],$acc13
1030	and	$t0,255,$acc15
1031	ldub	[$rounds+$acc14],$acc14		!
1032
1033		sll	$acc0,24,$acc0
1034		xor	$acc3,$s0,$s0
1035	ldub	[$rounds+$acc15],$acc15
1036		sll	$acc1,16,$acc1
1037		xor	$acc0,$s0,$s0
1038	ldx	[%sp+$bias+$frame+0],%i7	! restore return address
1039	fmovs	%f0,%f0
1040		sll	$acc2,8,$acc2		!
1041		xor	$acc1,$s0,$s0
1042		sll	$acc4,24,$acc4
1043		xor	$acc2,$s0,$s0
1044		sll	$acc5,16,$acc5
1045		xor	$acc7,$s1,$s1
1046		sll	$acc6,8,$acc6
1047		xor	$acc4,$s1,$s1
1048		sll	$acc8,24,$acc8		!
1049		xor	$acc5,$s1,$s1
1050		sll	$acc9,16,$acc9
1051		xor	$acc11,$s2,$s2
1052		sll	$acc10,8,$acc10
1053		xor	$acc6,$s1,$s1
1054		sll	$acc12,24,$acc12
1055		xor	$acc8,$s2,$s2
1056		sll	$acc13,16,$acc13	!
1057		xor	$acc9,$s2,$s2
1058		sll	$acc14,8,$acc14
1059		xor	$acc10,$s2,$s2
1060		xor	$acc12,$acc14,$acc14
1061		xor	$acc13,$s3,$s3
1062		xor	$acc14,$s3,$s3
1063		xor	$acc15,$s3,$s3
1064
1065	ret
1066	restore
1067.type	_sparcv9_AES_decrypt,#function
1068.size	_sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1069
1070.align	32
1071.globl	AES_decrypt
1072AES_decrypt:
1073	or	%o0,%o1,%g1
1074	andcc	%g1,3,%g0
1075	bnz,pn	%xcc,.Lunaligned_dec
1076	save	%sp,-$frame,%sp
1077
1078	ld	[%i0+0],%o0
1079	ld	[%i0+4],%o1
1080	ld	[%i0+8],%o2
1081	ld	[%i0+12],%o3
1082
10831:	call	.+8
1084	add	%o7,AES_Td-1b,%o4
1085	call	_sparcv9_AES_decrypt
1086	mov	%i2,%o5
1087
1088	st	%o0,[%i1+0]
1089	st	%o1,[%i1+4]
1090	st	%o2,[%i1+8]
1091	st	%o3,[%i1+12]
1092
1093	ret
1094	restore
1095
1096.align	32
1097.Lunaligned_dec:
1098	ldub	[%i0+0],%l0
1099	ldub	[%i0+1],%l1
1100	ldub	[%i0+2],%l2
1101
1102	sll	%l0,24,%l0
1103	ldub	[%i0+3],%l3
1104	sll	%l1,16,%l1
1105	ldub	[%i0+4],%l4
1106	sll	%l2,8,%l2
1107	or	%l1,%l0,%l0
1108	ldub	[%i0+5],%l5
1109	sll	%l4,24,%l4
1110	or	%l3,%l2,%l2
1111	ldub	[%i0+6],%l6
1112	sll	%l5,16,%l5
1113	or	%l0,%l2,%o0
1114	ldub	[%i0+7],%l7
1115
1116	sll	%l6,8,%l6
1117	or	%l5,%l4,%l4
1118	ldub	[%i0+8],%l0
1119	or	%l7,%l6,%l6
1120	ldub	[%i0+9],%l1
1121	or	%l4,%l6,%o1
1122	ldub	[%i0+10],%l2
1123
1124	sll	%l0,24,%l0
1125	ldub	[%i0+11],%l3
1126	sll	%l1,16,%l1
1127	ldub	[%i0+12],%l4
1128	sll	%l2,8,%l2
1129	or	%l1,%l0,%l0
1130	ldub	[%i0+13],%l5
1131	sll	%l4,24,%l4
1132	or	%l3,%l2,%l2
1133	ldub	[%i0+14],%l6
1134	sll	%l5,16,%l5
1135	or	%l0,%l2,%o2
1136	ldub	[%i0+15],%l7
1137
1138	sll	%l6,8,%l6
1139	or	%l5,%l4,%l4
1140	or	%l7,%l6,%l6
1141	or	%l4,%l6,%o3
1142
11431:	call	.+8
1144	add	%o7,AES_Td-1b,%o4
1145	call	_sparcv9_AES_decrypt
1146	mov	%i2,%o5
1147
1148	srl	%o0,24,%l0
1149	srl	%o0,16,%l1
1150	stb	%l0,[%i1+0]
1151	srl	%o0,8,%l2
1152	stb	%l1,[%i1+1]
1153	stb	%l2,[%i1+2]
1154	srl	%o1,24,%l4
1155	stb	%o0,[%i1+3]
1156
1157	srl	%o1,16,%l5
1158	stb	%l4,[%i1+4]
1159	srl	%o1,8,%l6
1160	stb	%l5,[%i1+5]
1161	stb	%l6,[%i1+6]
1162	srl	%o2,24,%l0
1163	stb	%o1,[%i1+7]
1164
1165	srl	%o2,16,%l1
1166	stb	%l0,[%i1+8]
1167	srl	%o2,8,%l2
1168	stb	%l1,[%i1+9]
1169	stb	%l2,[%i1+10]
1170	srl	%o3,24,%l4
1171	stb	%o2,[%i1+11]
1172
1173	srl	%o3,16,%l5
1174	stb	%l4,[%i1+12]
1175	srl	%o3,8,%l6
1176	stb	%l5,[%i1+13]
1177	stb	%l6,[%i1+14]
1178	stb	%o3,[%i1+15]
1179
1180	ret
1181	restore
1182.type	AES_decrypt,#function
1183.size	AES_decrypt,(.-AES_decrypt)
1184___
1185
# The "fmovs %f0,%f0" lines in the templates act as floating-point nops;
# they were originally inserted to meet instruction-alignment requirements
# and so maximize ILP.  UltraSPARC T1 (a.k.a. Niagara) has a shared FPU,
# where FP nops can have an undesired effect, so they are dropped from the
# output at the cost of a fraction of a percent in performance.
$code =~ s/fmovs.*$//gm;

# Write the generated assembly.  STDOUT is closed explicitly so that a
# failed flush is reported instead of being lost at interpreter exit.
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
1195