;; xref: /openssl/crypto/bn/asm/bn-c64xplus.asm (revision 367ace68)
;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the Apache License 2.0 (the "License").  You may not use
;; this file except in compliance with the License.  You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the License. Warranty of any kind is disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
20	.text
21
22	.if	.ASSEMBLER_VERSION<7000000
23	.asg	0,__TI_EABI__
24	.endif
25	.if	__TI_EABI__
26	.asg	bn_mul_add_words,_bn_mul_add_words
27	.asg	bn_mul_words,_bn_mul_words
28	.asg	bn_sqr_words,_bn_sqr_words
29	.asg	bn_add_words,_bn_add_words
30	.asg	bn_sub_words,_bn_sub_words
31	.asg	bn_div_words,_bn_div_words
32	.asg	bn_sqr_comba8,_bn_sqr_comba8
33	.asg	bn_mul_comba8,_bn_mul_comba8
34	.asg	bn_sqr_comba4,_bn_sqr_comba4
35	.asg	bn_mul_comba4,_bn_mul_comba4
36	.endif
37
38	.asg	B3,RA
39	.asg	A4,ARG0
40	.asg	B4,ARG1
41	.asg	A6,ARG2
42	.asg	B6,ARG3
43	.asg	A8,ARG4
44	.asg	B8,ARG5
45	.asg	A4,RET
46	.asg	A15,FP
47	.asg	B14,DP
48	.asg	B15,SP
49
50	.global	_bn_mul_add_words
51_bn_mul_add_words:
52	.asmfunc
53	MV	ARG2,B0
54  [!B0]	BNOP	RA
55||[!B0]	MVK	0,RET
56   [B0]	MVC	B0,ILC
57   [B0]	ZERO	A19		; high part of accumulator
58|| [B0]	MV	ARG0,A2
59|| [B0]	MV	ARG3,A3
60	NOP	3
61
62	SPLOOP	2		; 2*n+10
63;;====================================================================
64	LDW	*ARG1++,B7	; ap[i]
65	NOP	3
66	LDW	*ARG0++,A7	; rp[i]
67	MPY32U	B7,A3,A17:A16
68	NOP	3		; [2,0] in epilogue
69	ADDU	A16,A7,A21:A20
70	ADDU	A19,A21:A20,A19:A18
71||	MV.S	A17,A23
72	SPKERNEL 2,1		; leave slot for "return value"
73||	STW	A18,*A2++	; rp[i]
74||	ADD	A19,A23,A19
75;;====================================================================
76	BNOP	RA,4
77	MV	A19,RET		; return value
78	.endasmfunc
79
80	.global	_bn_mul_words
81_bn_mul_words:
82	.asmfunc
83	MV	ARG2,B0
84  [!B0]	BNOP	RA
85||[!B0]	MVK	0,RET
86   [B0]	MVC	B0,ILC
87   [B0]	ZERO	A19		; high part of accumulator
88	NOP	3
89
90	SPLOOP	2		; 2*n+10
91;;====================================================================
92	LDW	*ARG1++,A7	; ap[i]
93	NOP	4
94	MPY32U	A7,ARG3,A17:A16
95	NOP	4		; [2,0] in epiloque
96	ADDU	A19,A16,A19:A18
97||	MV.S	A17,A21
98	SPKERNEL 2,1		; leave slot for "return value"
99||	STW	A18,*ARG0++	; rp[i]
100||	ADD.L	A19,A21,A19
101;;====================================================================
102	BNOP	RA,4
103	MV	A19,RET		; return value
104	.endasmfunc
105
106	.global	_bn_sqr_words
107_bn_sqr_words:
108	.asmfunc
109	MV	ARG2,B0
110  [!B0]	BNOP	RA
111||[!B0]	MVK	0,RET
112   [B0]	MVC	B0,ILC
113   [B0]	MV	ARG0,B2
114|| [B0]	ADD	4,ARG0,ARG0
115	NOP	3
116
117	SPLOOP	2		; 2*n+10
118;;====================================================================
119	LDW	*ARG1++,B7	; ap[i]
120	NOP	4
121	MPY32U	B7,B7,B1:B0
122	NOP	3		; [2,0] in epilogue
123	STW	B0,*B2++(8)	; rp[2*i]
124	MV	B1,A1
125	SPKERNEL 2,0		; fully overlap BNOP RA,5
126||	STW	A1,*ARG0++(8)	; rp[2*i+1]
127;;====================================================================
128	BNOP	RA,5
129	.endasmfunc
130
131	.global	_bn_add_words
132_bn_add_words:
133	.asmfunc
134	MV	ARG3,B0
135  [!B0]	BNOP	RA
136||[!B0]	MVK	0,RET
137   [B0]	MVC	B0,ILC
138   [B0]	ZERO	A1		; carry flag
139|| [B0]	MV	ARG0,A3
140	NOP	3
141
142	SPLOOP	2		; 2*n+6
143;;====================================================================
144	LDW	*ARG2++,A7	; bp[i]
145||	LDW	*ARG1++,B7	; ap[i]
146	NOP	4
147	ADDU	A7,B7,A9:A8
148	ADDU	A1,A9:A8,A1:A0
149	SPKERNEL 0,0		; fully overlap BNOP RA,5
150||	STW	A0,*A3++	; write result
151||	MV	A1,RET		; keep carry flag in RET
152;;====================================================================
153	BNOP	RA,5
154	.endasmfunc
155
156	.global	_bn_sub_words
157_bn_sub_words:
158	.asmfunc
159	MV	ARG3,B0
160  [!B0]	BNOP	RA
161||[!B0]	MVK	0,RET
162   [B0]	MVC	B0,ILC
163   [B0]	ZERO	A2		; borrow flag
164|| [B0]	MV	ARG0,A3
165	NOP	3
166
167	SPLOOP	2		; 2*n+6
168;;====================================================================
169	LDW	*ARG2++,A7	; bp[i]
170||	LDW	*ARG1++,B7	; ap[i]
171	NOP	4
172	SUBU	B7,A7,A1:A0
173  [A2]	SUB	A1:A0,1,A1:A0
174	SPKERNEL 0,1		; leave slot for "return borrow flag"
175||	STW	A0,*A3++	; write result
176||	AND	1,A1,A2		; pass on borrow flag
177;;====================================================================
178	BNOP	RA,4
179	AND	1,A1,RET	; return borrow flag
180	.endasmfunc
181
182	.global	_bn_div_words
183_bn_div_words:
184	.asmfunc
185	LMBD	1,A6,A0		; leading zero bits in dv
186	LMBD	1,A4,A1		; leading zero bits in hi
187||	MVK	32,B0
188	CMPLTU	A1,A0,A2
189||	ADD	A0,B0,B0
190  [ A2]	BNOP	RA
191||[ A2]	MVK	-1,A4		; return overflow
192||[!A2]	MV	A4,A3		; reassign hi
193  [!A2]	MV	B4,A4		; reassign lo, will be quotient
194||[!A2]	MVC	B0,ILC
195  [!A2]	SHL	A6,A0,A6	; normalize dv
196||	MVK	1,A1
197
198  [!A2]	CMPLTU	A3,A6,A1	; hi<dv?
199||[!A2]	SHL	A4,1,A5:A4	; lo<<1
200  [!A1]	SUB	A3,A6,A3	; hi-=dv
201||[!A1]	OR	1,A4,A4
202  [!A2]	SHRU	A3,31,A1	; upper bit
203||[!A2]	ADDAH	A5,A3,A3	; hi<<1|lo>>31
204
205	SPLOOP	3
206  [!A1]	CMPLTU	A3,A6,A1	; hi<dv?
207||[ A1]	ZERO	A1
208||	SHL	A4,1,A5:A4	; lo<<1
209  [!A1]	SUB	A3,A6,A3	; hi-=dv
210||[!A1]	OR	1,A4,A4		; quotient
211	SHRU	A3,31,A1	; upper bit
212||	ADDAH	A5,A3,A3	; hi<<1|lo>>31
213	SPKERNEL
214
215	BNOP	RA,5
216	.endasmfunc

;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
224	.global	_bn_sqr_comba8
225	.global	_bn_mul_comba8
226_bn_sqr_comba8:
227	MV	ARG1,ARG2
228_bn_mul_comba8:
229	.asmfunc
230	MVK	8,B0		; N, RILC
231||	MVK	8,A0		; M, outer loop counter
232||	MV	ARG1,A5		; copy ap
233||	MV	ARG0,B4		; copy rp
234||	ZERO	B19		; high part of accumulator
235	MVC	B0,RILC
236||	SUB	B0,2,B1		; N-2, initial ILC
237||	SUB	B0,1,B2		; const B2=N-1
238||	LDW	*A5++,B6	; ap[0]
239||	MV	A0,A3		; const A3=M
240sploopNxM?:			; for best performance arrange M<=N
241   [A0]	SPLOOPD	2		; 2*n+10
242||	MVC	B1,ILC
243||	ADDAW	B4,B0,B5
244||	ZERO	B7
245||	LDW	*A5++,A9	; pre-fetch ap[1]
246||	ZERO	A1
247||	SUB	A0,1,A0
248;;====================================================================
249;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
250;; This is because of Advisory 15 from TI publication SPRZ247I.
251	LDW	*ARG2++,A7	; bp[i]
252	NOP	3
253   [A1]	LDW	*B5++,B7	; rp[i]
254	MPY32U	A7,B6,B17:B16
255	NOP	3
256	ADDU	B16,B7,B21:B20
257	ADDU	B19,B21:B20,B19:B18
258||	MV.S	B17,B23
259	SPKERNEL
260||	STW	B18,*B4++	; rp[i]
261||	ADD.S	B19,B23,B19
262;;====================================================================
263outer?:				; m*2*(n+1)+10
264	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
265	SPMASKR
266||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
267	MVD	A9,B6		; move through .M unit(*)
268   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
269	SUBAW	B5,B2,B5	; rewind rp to rp[1]
270	MVK	1,A1
271   [A0]	BNOP.S1	outer?,4
272|| [A0]	SUB.L	A0,1,A0
273	STW	B19,*B4--[B2]	; rewind rp tp rp[1]
274||	ZERO.S	B19		; high part of accumulator
275;; end of outer?
276	BNOP	RA,5		; return
277	.endasmfunc
278;; (*)	It should be noted that B6 is used as input to MPY32U in
279;;	chronologically next cycle in *preceding* SPLOOP iteration.
280;;	Normally such arrangement would require DINT, but at this
281;;	point SPLOOP is draining and interrupts are disabled
282;;	implicitly.
283
284	.global	_bn_sqr_comba4
285	.global	_bn_mul_comba4
286_bn_sqr_comba4:
287	MV	ARG1,ARG2
288_bn_mul_comba4:
289	.asmfunc
290	.if	0
291	BNOP	sploopNxM?,3
292	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
293	;; because of low-counter effect, when prologue phase finishes
294	;; before SPKERNEL instruction is reached. As result it's 25%
295	;; slower than expected...
296	MVK	4,B0		; N, RILC
297||	MVK	4,A0		; M, outer loop counter
298||	MV	ARG1,A5		; copy ap
299||	MV	ARG0,B4		; copy rp
300||	ZERO	B19		; high part of accumulator
301	MVC	B0,RILC
302||	SUB	B0,2,B1		; first ILC
303||	SUB	B0,1,B2		; const B2=N-1
304||	LDW	*A5++,B6	; ap[0]
305||	MV	A0,A3		; const A3=M
306	.else
307	;; This alternative is an exercise in fully unrolled Comba
308	;; algorithm implementation that operates at n*(n+1)+12, or
309	;; as little as 32 cycles...
310	LDW	*ARG1[0],B16	; a[0]
311||	LDW	*ARG2[0],A16	; b[0]
312	LDW	*ARG1[1],B17	; a[1]
313||	LDW	*ARG2[1],A17	; b[1]
314	LDW	*ARG1[2],B18	; a[2]
315||	LDW	*ARG2[2],A18	; b[2]
316	LDW	*ARG1[3],B19	; a[3]
317||	LDW	*ARG2[3],A19	; b[3]
318	NOP
319	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
320	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
321	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
322	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
323	STW	A0,*ARG0[0]
324||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
325	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
326||	ADDU	A22,A1,A1:A0
327	MV	A23,B0
328||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
329||	ADDU	A24,A1:A0,A1:A0
330	ADDU	A25,B0,B1:B0
331||	STW	A0,*ARG0[1]
332||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
333||	ADDU	A26,A1,A9:A8
334	ADDU	A27,B1,B9:B8
335||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
336||	ADDU	A28,A9:A8,A9:A8
337	ADDU	A29,B9:B8,B9:B8
338||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
339||	ADDU	A30,A9:A8,A9:A8
340	ADDU	A31,B9:B8,B9:B8
341||	ADDU	B0,A9:A8,A9:A8
342	STW	A8,*ARG0[2]
343||	ADDU	A20,A9,A1:A0
344	ADDU	A21,B9,B1:B0
345||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
346||	ADDU	A22,A1:A0,A1:A0
347	ADDU	A23,B1:B0,B1:B0
348||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
349||	ADDU	A24,A1:A0,A1:A0
350	ADDU	A25,B1:B0,B1:B0
351||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
352||	ADDU	A26,A1:A0,A1:A0
353	ADDU	A27,B1:B0,B1:B0
354||	ADDU	B8,A1:A0,A1:A0
355	STW	A0,*ARG0[3]
356||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
357||	ADDU	A20,A1,A9:A8
358	ADDU	A21,B1,B9:B8
359||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
360||	ADDU	A22,A9:A8,A9:A8
361	ADDU	A23,B9:B8,B9:B8
362||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
363||	ADDU	A24,A9:A8,A9:A8
364	ADDU	A25,B9:B8,B9:B8
365||	ADDU	B0,A9:A8,A9:A8
366	STW	A8,*ARG0[4]
367||	ADDU	A26,A9,A1:A0
368	ADDU	A27,B9,B1:B0
369||	ADDU	A28,A1:A0,A1:A0
370	ADDU	A29,B1:B0,B1:B0
371||	BNOP	RA
372||	ADDU	B8,A1:A0,A1:A0
373	STW	A0,*ARG0[5]
374||	ADDU	A30,A1,A9:A8
375	ADD	A31,B1,B8
376	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
377	ADD	B8,A9,A9
378||	STW	A8,*ARG0[6]
379	STW	A9,*ARG0[7]
380	.endif
381	.endasmfunc