#! /usr/bin/env perl
# Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================

# SHA256 performance improvement over compiler-generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in the SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] into 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# more than 2x better than 32-bit code. X[16] resides on stack, but
# access to it is scheduled for L2 latency and staged through the 32
# least significant bits of %l0-%l7. The latter is done to achieve
# 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256,
# which is pretty good [the optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because the 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4x as many threads as physical cores, and that it
# leaves gcc [3.4] behind by an over 4x factor! Compared to SHA256,
# single-thread performance is only 10% better, but overall throughput
# at the maximum thread count for a given CPU exceeds the
# corresponding SHA256 figure by 30% [again, the optimal coefficient
# is 50%].
#
# (*)	Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
#	in-order, i.e. a load instruction has to complete before the
#	next instruction in the given thread is executed, even if the
#	latter does not depend on the load result! This means that on
#	T1 two 32-bit loads are always slower than one 64-bit load.
#	Once again this is unlike pre-T1 UltraSPARC, where, if
#	scheduled appropriately, 2x32-bit loads can be as fast as
#	1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x the single-process result on an 8-core processor,
# or ~11/16GBps per 2.85GHz socket.

# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

$output and open STDOUT,">$output";
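
# The SHA256/SHA512 variant is selected by the output file name: any
# name matching /512/ (e.g. sha512-sparcv9.S) generates the SHA512
# code path, anything else generates SHA256.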

if ($output =~ /512/) {
	$label="512";
	$SZ=8;
	$LD="ldx";		# load from memory
	$ST="stx";		# store to memory
	$SLL="sllx";		# shift left logical
	$SRL="srlx";		# shift right logical
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=( 7, 1, 8);	# right shift first
	@sigma1=( 6,19,61);	# right shift first
	$lastK=0x817;
	$rounds=80;
	$align=4;

	$locals=16*$SZ;		# X[16]

	$A="%o0";
	$B="%o1";
	$C="%o2";
	$D="%o3";
	$E="%o4";
	$F="%o5";
	$G="%g1";
	$H="%o7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
	$label="256";
	$SZ=4;
	$LD="ld";		# load from memory
	$ST="st";		# store to memory
	$SLL="sll";		# shift left logical
	$SRL="srl";		# shift right logical
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 3, 7,18);	# right shift first
	@sigma1=(10,17,19);	# right shift first
	$lastK=0x8f2;
	$rounds=64;
	$align=8;

	$locals=0;		# X[16] is register resident
	@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

	$A="%l0";
	$B="%l1";
	$C="%l2";
	$D="%l3";
	$E="%l4";
	$F="%l5";
	$G="%l6";
	$H="%l7";
	@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
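
# Temporaries and argument registers shared by both variants. Note the
# register split: SHA512 keeps the state A-H in %o0-%o5/%g1/%o7 with
# X[16] on the stack staged through %l0-%l7, while SHA256 keeps the
# state in %l0-%l7 and packs X[16] two words per register into
# %o0-%o5/%g1/%o7.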
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";

########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    if ($i&1) {
	$code.="\tadd	@X[$i/2],$h,$T1\n";
    } else {
	$code.="\tsrlx	@X[$i/2],32,$T1\n\tadd	$h,$T1,$T1\n";
    }
} if ($SZ==4);
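
# Note the statement modifier on the assignment above: $Xload is only
# set when $SZ==4, so each closure is instantiated for exactly one
# variant (the SHA512 counterpart follows). The SHA256 packing
# convention is visible in the last "if": X[2j] lives in the upper 32
# bits of @X[j] and X[2j+1] in the lower 32 bits, hence the srlx by 32
# for even $i.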

########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
	&$Xload(@_);
    } else {
	$code.="\tadd	$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}
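
# One round of the SHA-2 compression function, as scheduled above:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	h  = Sigma0(a) + Maj(a,b,c) + T1;	d += T1
#
# Ch(e,f,g) is computed as ((f^g)&e)^g and Maj(a,b,c) as
# (a&b)|((a|b)&c); the Sigma rotations are composed from $SRL/$SLL
# shift pairs, as SPARCv9 has no rotate instruction.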

########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+1)/2)%8],32,$xi\n";
    } else {
	$xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1		!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
	$xi=@X[(($i+14)/2)%8];
    } else {
	$xi=$tmp32;
	$code.="\tsrlx	@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1			! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    if ($i&1) {
	$xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1			! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
	$xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
	add	$xi,$T1,$T1			! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);
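
# The Xupdate step above implements the SHA-2 message schedule:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# expressed here with a sliding window, i.e. in terms of X[i+14],
# X[i+9] and X[i+1] relative to the current slot X[i].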

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
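
# T4 hardware path: the context is loaded into the floating-point
# registers and the native SHA256/SHA512 instruction (hand-encoded
# below as .word 0x81b02840/0x81b02860, so no SHA-capable assembler is
# required) processes one 16-word message block per iteration.
# Unaligned input is handled with alignaddr/faligndata.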
$code.=<<___ if ($SZ==8); 		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	 ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	 std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
$code.=<<___ if ($SZ==4); 		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	 ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	 st	%f7, [%o0 + 0x1c]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
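
# Software fallback: align the input pointer down to the nearest
# $align boundary, record the byte misalignment as a bit shift in
# $tmp31, and convert the block count in $len into an end pointer.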
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
$code.=<<___ if ($SZ==4); # SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary and let the programmer detect at run-time whether
# the current CPU is VIS-capable.
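#
# For example, once encoded by unvis() below, "faligndata %f18,%f20,%f16"
# becomes ".word 0xa1b48914" [0x81b00000|16<<25|18<<14|0x048<<5|20;
# value computed by hand here, for illustration only].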
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}
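
# Final pass over the generated code: evaluate the backticked Perl
# expressions embedded in the assembly templates, then rewrite the
# faligndata/for and alignaddr mnemonics into .word directives via the
# subroutines above.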

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";