#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, to zero it upon entry.
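#
# An interrupt service routine can meet this requirement by clearing
# AMR in its prologue, e.g. with the same two-instruction sequence this
# module itself uses on exit (a sketch, not a requirement on the exact
# registers used):
#
#	MVK	0,B0
#	MVC	B0,AMR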

$output = pop and open STDOUT,">$output";

($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");			# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
	.asmfunc stack_usage(64)
	MV	$NUM,A0				; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA				; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	__sha256_block,B2
|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-__sha256_block),$K256
	.endif
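;; B1 = 0x00050404 programs AMR so that A5 and B5, the X[] ring-buffer
;; pointers, use circular addressing via BK0, with BK0 = 5, i.e. a
;; 2^(5+1) = 64-byte block; the 16-word buffer on the 64-byte-aligned
;; stack therefore wraps automatically.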
   [A0]	MVC	B1,AMR				; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
	LDW	*${CTXA}[0],$A			; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn			; pre-fetch input
	LDW	*$K256++,$K			; pre-fetch K256[0]
	MVK	14,B0				; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

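;; Rounds 0-14: per FIPS 180-4, each round computes
;;	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
;;	T2 = Sigma0(a) + Maj(a,b,c)
;; and then rotates the working variables:
;;	h=g, g=f, f=e, e=d+T1, d=c, c=b, b=a, a=T1+T2.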
	SPLOOPD	8				; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
	MV	$B,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
	SPKERNEL

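;; Round 15 is peeled out of the loop above so that the first
;; message-expansion terms for rounds 16-63 (the instructions marked
;; "modulo-scheduled" below) can be primed alongside it.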
	ROTL	$A,30,$S0			; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X15
	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
||	ROTL	$B,0,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2

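;; Rounds 16-63 additionally expand the message schedule in the
;; 16-word ring buffer:
;;	X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])
;; which is the standard recurrence
;;	W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2])
;; expressed in ring-buffer offsets.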
	SPLOOPD	10				; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1			; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1			; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H				; h = g
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G				; g = f
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*++$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$B,$C				; c = b
	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
||	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
|| [A0]	ADDK	-260,$K256			; rewind K256
||	ADD	$Actx,$A,$A			; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP				; restore stack pointer
||[!A0]	LDW	*FP[0],FP			; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]  		; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR				; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___

print $code;
close STDOUT or die "error closing STDOUT: $!";