#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# Compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# A fully unrolled assembly version would be ~5x larger and likely
# ~15% faster. It would be free of references to the intermediate ring
# buffer, but would put more pressure on L1P [both because the code
# would be larger and because it would not use the SPLOOP buffer].
# There are no plans to implement a fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
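#
# For reference, an interrupt service routine could zero AMR on entry
# with the same two-instruction sequence this routine itself uses to
# clear AMR before returning:
#
#	MVK	0,B0
#	MVC	B0,AMR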

$output = pop and open STDOUT,">$output";

($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");			# X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM

$code=<<___;
	.text

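;; __TI_EABI__ is not predefined by pre-7.x assemblers, so default it to 0;
;; under EABI the function is exported without a leading underscore, hence
;; the alias for the underscored name used below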
	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	sha1_block_data_order,_sha1_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

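;; big-endian input needs no byte swapping, so the SWAP2/SWAP4 steps
;; degenerate to plain register moves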
	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif

	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	LDW	*${CTX}[0],$A		; load A-E...
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
   [A0]	LDW	*${CTX}[1],$B
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
   [A0]	LDW	*${CTX}[2],$C
|| [A0]	MVK	0x00404,B0
   [A0]	LDW	*${CTX}[3],$D
|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
   [A0]	LDW	*${CTX}[4],$E
|| [A0]	MVC	B0,AMR			; setup circular addressing
	LDNW	*${INP}++,$TX1		; pre-fetch input
	NOP	1

loop?:
	MVK	0x00007999,$K
||	ADDAW	SP,2,$XPA
||	SUB	A0,1,A0
||	MVK	13,B0
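;; B0=13 seeds ILC below: the BODY_00_13 software-pipelined loop covers
;; rounds 0 through 13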
	MVKH	0x5a820000,$K		; K_00_19
||	ADDAW	SP,2,$XPB
||	MV	$A,$Actx
||	MV	$B,$Bctx
;;==================================================
	SPLOOPD	5			; BODY_00_13
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MV	$E,$Ectx
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX3		; byte swap

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A

	ADD	$TX3,$T,$A		; A=T+Xi
||	STW	$TX3,*${XPB}++
	SPKERNEL
;;==================================================
	ROTL	$A,5,$Arot		; BODY_14
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
||	LDW	*${XPB}[4],$X2		; 2 iterations ahead

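;; the word that *${XPB}[15] would re-load is still live in $TX3, so a
;; register move stands in for the LDW shown in the trailing comment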
	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
;;==================================================
	ROTL	$A,5,$Arot		; BODY_15
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	MVK	3,B0
;;==================================================
	SPLOOPD	5			; BODY_16_19
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL

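;; MVK sign-extends its 16-bit constant (hence the 0xffff upper half);
;; MVKH then replaces the upper half, so the pair yields K_20_39 = 0x6ed9eba1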
	MVK	0xffffeba1,$K
||	MVK	19,B0
	MVKH	0x6ed90000,$K		; K_20_39
___
sub BODY_20_39 {
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_20_39
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	XOR	$B,$C,$F
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$D,$F,$F		; F_20_39(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++		; last one is redundant
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL
___
$code.=<<___ if (!shift);
	MVK	0xffffbcdc,$K
	MVKH	0x8f1b0000,$K		; K_40_59
___
}	&BODY_20_39();
$code.=<<___;
;;==================================================
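;; F_40_59 is the majority function Maj(B,C,D), computed as (B&C)^(B&D)^(C&D)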
	SPLOOPD	5			; BODY_40_59
||	MVC	B0,ILC
||	AND	$B,$C,$F
||	AND	$B,$D,$F0

	ROTL	$A,5,$Arot
||	XOR	$F0,$F,$F
||	AND	$C,$D,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_40_59(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	AND	$B,$C,$F
||	AND	$B,$D,$F0
	SPKERNEL

	MVK	0xffffc1d6,$K
||	MVK	18,B0
	MVKH	0xca620000,$K		; K_60_79
___
	&BODY_20_39(-1);		# BODY_60_78
$code.=<<___;
;;==================================================
   [A0]	B	loop?
||	ROTL	$A,5,$Arot		; BODY_79
||	XOR	$B,$C,$F
||	ROTL	$TX1,1,$TX2		; Xupdate output

   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
||	ADD	$K,$E,$T		; T=E+K
||	XOR	$D,$F,$F		; F_20_39(B,C,D)

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
||	ROTL	$B,30,$C		; C=ROL(B,30)

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx

	ADD	$TX2,$T,$A		; A=T+Xi

	ADD	$Actx,$A,$A		; A+=Actx
||	ADD	$Cctx,$C,$C		; C+=Cctx
;; end of loop?

	BNOP	RA			; return
||	MV	FP,SP			; restore stack pointer
||	LDW	*FP[0],FP		; restore frame pointer
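;; the context stores and the AMR clear below execute in the delay slots
;; of the return branch above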
	STW	$A,*${CTX}[0]		; emit A-E...
||	MVK	0,B0
	STW	$B,*${CTX}[1]
||	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
close STDOUT or die "error closing STDOUT: $!";
