xref: /openssl/crypto/poly1305/asm/poly1305-ia64.S (revision debd9210)
1// ====================================================================
2// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
3// project.
4// ====================================================================
5//
6// Poly1305 for Itanium.
7//
8// January 2019
9//
10// Performance was reported to be ~2.1 cycles per byte on Itanium 2, with
11// the exception of processors in the 95xx family, which have higher
12// floating-point instruction latencies and deliver ~2.6 cpb. Comparison to
13// compiler-generated code is not exactly fair, because of the different
14// radixes. But just for reference, it was observed to be >3x faster.
15// Originally it was argued that a floating-point base 2^32 implementation
16// would be optimal. On closer inspection, the estimate for the integer
17// base 2^64 implementation below turned out to be approximately the same
18// on Itanium 2. But floating-point code would be larger and have higher
19// overhead, which would negatively affect small-block performance...
20
// Build-dependent instruction aliases used throughout this file.
//   ADDP  forms the pointers derived from the incoming arguments:
//         addp4 on HP-UX (_HPUX_SOURCE) when _LP64 is not defined,
//         plain add in every other configuration.
//   RUM   expands to the rum instruction on HP-UX, to nop elsewhere.
//   SUM   expands to the sum instruction on HP-UX, to nop elsewhere.
// poly1305_blocks brackets its work with RUM 1<<1 (go little-endian)
// and SUM 1<<1 (back to big-endian).
21#if defined(_HPUX_SOURCE)
22# if !defined(_LP64)
23#  define ADDP  addp4
24# else
25#  define ADDP  add
26# endif
27# define RUM    rum
28# define SUM    sum
29#else
30# define ADDP   add
31# define RUM    nop
32# define SUM    nop
33#endif
34
// All code below carries hand-written bundle templates and ;; stops.
35.text
36.explicit
37
// poly1305_init
//
// In:   r32 = context pointer (ctx)
//       r33 = key pointer, may be NULL
// Out:  r8  = 0 on every path
//
// Always clears the three 64-bit hash words at ctx+0, ctx+8 and ctx+16.
// With a NULL key nothing else is done. Otherwise 16 key bytes are read
// one byte at a time; bytes 3, 7, 11 and 15 are masked with 15 and
// bytes 4, 8 and 12 with -4. The bytes are then packed little-endian
// into two 64-bit words that are stored at ctx+24 (r0) and ctx+32 (r1),
// followed by s1 = r1 + (r1 >> 2) at ctx+40.
38.global	poly1305_init#
39.proc	poly1305_init#
40.align	64
41poly1305_init:
42	.prologue
43	.save		ar.pfs,r2
44{ .mmi;	alloc		r2=ar.pfs,2,0,0,0
45	cmp.eq		p6,p7=0,r33		}	// key == NULL?
46{ .mmi;	ADDP		r9=8,r32
47	ADDP		r10=16,r32
48	ADDP		r32=0,r32		};;
49	.body
50{ .mmi;	st8		[r32]=r0,24			// ctx->h0 = 0
51	st8		[r9]=r0				// ctx->h1 = 0
52(p7)	ADDP		r8=0,r33		}	// r8 = key, only when key != NULL
53{ .mib;	st8		[r10]=r0			// ctx->h2 = 0
54(p6)	mov		r8=0					// NULL key: return value 0
55(p6)	br.ret.spnt	b0			};;
56
// r8, r9, r10, r11 start at key+0..key+3 and advance by 4 per load, so
// each round of four ld1 fetches four consecutive key bytes:
//   r16..r19 = key[0..3],   r20..r23 = key[4..7],
//   r24..r27 = key[8..11],  r28..r31 = key[12..15].
// The and instructions interleaved with the loads apply the byte masks.
57{ .mmi;	ADDP		r9=1,r33
58	ADDP		r10=2,r33
59	ADDP		r11=3,r33		};;
60{ .mmi;	ld1		r16=[r8],4			// load key, little-endian
61	ld1		r17=[r9],4		}
62{ .mmi;	ld1		r18=[r10],4
63	ld1		r19=[r11],4		};;
64{ .mmi;	ld1		r20=[r8],4
65	ld1		r21=[r9],4		}
66{ .mmi;	ld1		r22=[r10],4
67	ld1		r23=[r11],4
68	and		r19=15,r19		};;	// key[3] &= 15
69{ .mmi;	ld1		r24=[r8],4
70	ld1		r25=[r9],4
71	and		r20=-4,r20		}	// key[4] &= -4
72{ .mmi;	ld1		r26=[r10],4
73	ld1		r27=[r11],4
74	and		r23=15,r23		};;	// key[7] &= 15
75{ .mmi;	ld1		r28=[r8],4
76	ld1		r29=[r9],4
77	and		r24=-4,r24		}	// key[8] &= -4
78{ .mmi;	ld1		r30=[r10],4
79	ld1		r31=[r11],4
80	and		r27=15,r27		};;	// key[11] &= 15
81
// Pack the bytes with dep: 8-bit deposits at bit 8 build byte pairs,
// then 16-bit deposits at bit 16, 32 and 48 build the 64-bit words.
// r16 accumulates key[0..7], r24 accumulates key[8..15].
82{ .mii;	and		r28=-4,r28				// key[12] &= -4
83	dep		r16=r17,r16,8,8
84	dep		r18=r19,r18,8,8		};;
85{ .mii;	and		r31=15,r31				// key[15] &= 15
86	dep		r16=r18,r16,16,16
87	dep		r20=r21,r20,8,8		};;
88{ .mii;	dep		r16=r20,r16,32,16
89	dep		r22=r23,r22,8,8		};;
90{ .mii;	dep		r16=r22,r16,48,16
91	dep		r24=r25,r24,8,8		};;
92{ .mii;	dep		r26=r27,r26,8,8
93	dep		r28=r29,r28,8,8		};;
94{ .mii;	dep		r24=r26,r24,16,16
95	dep		r30=r31,r30,8,8		};;
// r32 was advanced to ctx+24 by the first st8 of the function; the
// stores below continue from there with a post-increment of 8.
96{ .mii;	st8		[r32]=r16,8			// ctx->r0
97	dep		r24=r28,r24,32,16;;
98	dep		r24=r30,r24,48,16	};;
99{ .mii;	st8		[r32]=r24,8			// ctx->r1
100	shr.u		r25=r24,2;;				// r1 >> 2
101	add		r25=r25,r24		};;	// s1 = r1 + (r1 >> 2)
102{ .mib; st8		[r32]=r25			// ctx->s1
103	mov		r8=0
104	br.ret.sptk	b0			};;
105.endp	poly1305_init#
106
// Symbolic register names used by poly1305_blocks:
//   h0, h1, h2     hash words, loaded from ctx+0, ctx+8, ctx+16
//   i0, i1         the 16-byte input block added in the current pass
//   HF0, HF1, HF2  copies of h0-h2 in floating-point registers (xmpy inputs)
//   RF0, RF1, SF1  key words r0, r1, s1, loaded from ctx+24, +32, +40
107h0=r17;  h1=r18;  h2=r19;
108i0=r20;  i1=r21;
109HF0=f8;  HF1=f9;  HF2=f10;
110RF0=f11; RF1=f12; SF1=f13;
111
// poly1305_blocks
//
// In:   r32 = context pointer
//       r33 = input pointer, any byte alignment
//       r34 = length in bytes; the number of passes is derived from
//             r34 >> 4
//       r35 = value added to h2 in every pass
// Uses: r2 (ar.pfs from alloc), r3 (saved ar.lc), r36 (saved pr).
//       ar.lc and pr are put back before returning.
//
// Each pass adds one 16-byte block (i0, i1) and r35 to h0:h1:h2, forms
// the products h0*r0, h0*r1, h1*s1, h1*r0, h2*s1 and h2*r0 with xmpy,
// sums them into a new h0:h1:h2 and folds the bits of h2 above bit 1
// back into h0 as (h2 & -4) + (h2 >> 2), i.e. 5 * (h2 >> 2).
// The final h0-h2 are stored back to ctx+0, ctx+8 and ctx+16.
112.global	poly1305_blocks#
113.proc	poly1305_blocks#
114.align	64
115poly1305_blocks:
116	.prologue
117	.save		ar.pfs,r2
118{ .mii;	alloc		r2=ar.pfs,4,1,0,0
119	.save		ar.lc,r3
120	mov		r3=ar.lc
121	.save		pr,r36
122	mov		r36=pr			}
123
124	.body
125{ .mmi;	ADDP		r8=0,r32
126	ADDP		r9=8,r32
127	and		r29=7,r33		};;	// r29 = inp & 7 (misalignment)
128{ .mmi;	ld8		h0=[r8],16
129	ld8		h1=[r9],16
130	and		r33=-8,r33		};;	// inp rounded down to 8 bytes
131{ .mmi;	ld8		h2=[r8],16
132	ldf8		RF0=[r9],16
133	shr.u		r34=r34,4		};;	// r34 = len >> 4
134{ .mmi;	ldf8		RF1=[r8],-32				// r8 is back at ctx+0
135	ldf8		SF1=[r9],-32				// r9 is back at ctx+8
136	cmp.ltu		p16,p17=1,r34		};;	// p16: r34 > 1
// Loop setup: with r34 > 1, ar.lc = r34-2 and ar.ec = 2; otherwise
// ar.lc = 0 and ar.ec = 1. Together with the initial p16 this makes
// br.ctop run one pass per block while the rotating predicate p16,
// which guards the loads of the following block, is clear on the last
// pass.
// NOTE(review): r34 == 0 takes the p17 path and still runs one pass;
// presumably callers always supply at least 16 bytes -- confirm.
137{ .mmi;
138(p16)	add		r34=-2,r34
139(p17)	mov		r34=0
140	ADDP		r10=0,r33		}
141{ .mii;	ADDP		r11=8,r33
142(p16)	mov		ar.ec=2
143(p17)	mov		ar.ec=1			};;
144{ .mib;	RUM		1<<1				// go little-endian
145	mov		ar.lc=r34
146	brp.loop.imp	.Loop,.Lcend-16		}
147
// Decode the misalignment: p8 = aligned, p7 = not aligned, and exactly
// one of p9..p15 is set for a misalignment of 1..7 bytes.
148{ .mmi;	cmp.eq		p8,p7=0,r29
149	cmp.eq		p9,p0=1,r29
150	cmp.eq		p10,p0=2,r29		}
151{ .mmi;	cmp.eq		p11,p0=3,r29
152	cmp.eq		p12,p0=4,r29
153	cmp.eq		p13,p0=5,r29		}
154{ .mmi;	cmp.eq		p14,p0=6,r29
155	cmp.eq		p15,p0=7,r29
156	add		r16=16,r10		};;	// address of the third 8-byte word
157
158{ .mmb;
159(p8)	ld8		i0=[r10],16			// aligned input
160(p8)	ld8		i1=[r11],16
161(p8)	br.cond.sptk	.Loop			};;
162
163	// align first block
	// Three aligned words r14, r15, r16 cover the misaligned block.
	// The shrp selected by p9..p15 shifts each adjacent pair right by
	// 8 * misalignment bits to produce i0 and i1. r10 and r11 are left
	// pointing at the next two words to load inside the loop.
164	.pred.rel	"mutex",p8,p9,p10,p11,p12,p13,p14,p15
165{ .mmi;	(p7)	ld8		r14=[r10],24
166	(p7)	ld8		r15=[r11],24		}
167
168{ .mii;	(p7)	ld8		r16=[r16]
169		nop.i		0;;
170	(p15)	shrp		i0=r15,r14,56		}
171{ .mii;	(p15)	shrp		i1=r16,r15,56
172	(p14)	shrp		i0=r15,r14,48		}
173{ .mii;	(p14)	shrp		i1=r16,r15,48
174	(p13)	shrp		i0=r15,r14,40		}
175{ .mii;	(p13)	shrp		i1=r16,r15,40
176	(p12)	shrp		i0=r15,r14,32		}
177{ .mii;	(p12)	shrp		i1=r16,r15,32
178	(p11)	shrp		i0=r15,r14,24		}
179{ .mii;	(p11)	shrp		i1=r16,r15,24
180	(p10)	shrp		i0=r15,r14,16		}
181{ .mii;	(p10)	shrp		i1=r16,r15,16
182	(p9)	shrp		i0=r15,r14,8		}
183{ .mii;	(p9)	shrp		i1=r16,r15,8
184		mov		r14=r16			};;	// last word becomes the first of the next block
185
186.Loop:
187		.pred.rel	"mutex",p8,p9,p10,p11,p12,p13,p14,p15
	// h += block, h2 += r35, with carries rippled from h0 up to h2.
	// p6 and p7 are reused here as carry flags.
188{ .mmi;		add		h0=h0,i0
189		add		h1=h1,i1
190		add		h2=h2,r35		};;
191{ .mmi;		setf.sig	HF0=h0
192		cmp.ltu		p6,p0=h0,i0			// p6 = carry out of h0
193		cmp.ltu		p7,p0=h1,i1		};;	// p7 = carry out of h1
194{ .mmi;	(p6)	add		h1=1,h1;;
195		setf.sig	HF1=h1
196	(p6)	cmp.eq.or	p7,p0=0,h1		};;	// h1 wrapped to 0: carry as well
197{ .mmi;	(p7)	add		h2=1,h2;;
198		setf.sig	HF2=h2			};;
199
	// Products, low (lu) and high (hu) 64 bits:
	//   f32:f33 = h0*r0   f36:f37 = h0*r1
	//   f34:f35 = h1*s1   f38:f39 = h1*r0
	//   f40     = h2*s1   f41     = h2*r0   (low halves only)
	// When p16 is set, the next input words are loaded into r15/r16
	// here. The (p9)..(p15) shrp and (p8) mov instructions scattered
	// through the rest of the loop build the next i0/i1 from r14, r15
	// and r16, exactly as in the first-block code above.
200{ .mfi;	(p16)	ld8		r15=[r10],16
201		xmpy.lu		f32=HF0,RF0		}
202{ .mfi;	(p16)	ld8		r16=[r11],16
203		xmpy.hu		f33=HF0,RF0		}
204{ .mfi;		xmpy.lu		f36=HF0,RF1		}
205{ .mfi;		xmpy.hu		f37=HF0,RF1		};;
206{ .mfi;		xmpy.lu		f34=HF1,SF1
207	(p15)	shrp		i0=r15,r14,56		}
208{ .mfi;		xmpy.hu		f35=HF1,SF1		}
209{ .mfi;		xmpy.lu		f38=HF1,RF0
210	(p15)	shrp		i1=r16,r15,56		}
211{ .mfi;		xmpy.hu		f39=HF1,RF0		}
212{ .mfi;		xmpy.lu		f40=HF2,SF1
213	(p14)	shrp		i0=r15,r14,48		}
214{ .mfi;		xmpy.lu		f41=HF2,RF0		};;
215
	// Move the products to general registers r22..r31 (f32 -> r22,
	// f33 -> r23, ..., f41 -> r31).
216{ .mmi;		getf.sig	r22=f32
217		getf.sig	r23=f33
218	(p14)	shrp		i1=r16,r15,48		}
219{ .mmi;		getf.sig	r24=f34
220		getf.sig	r25=f35
221	(p13)	shrp		i0=r15,r14,40		}
222{ .mmi;		getf.sig	r26=f36
223		getf.sig	r27=f37
224	(p13)	shrp		i1=r16,r15,40		}
225{ .mmi;		getf.sig	r28=f38
226		getf.sig	r29=f39
227	(p12)	shrp		i0=r15,r14,32		}
228{ .mmi;		getf.sig	r30=f40
229		getf.sig	r31=f41			};;
230
	// Sum the partial products into the new h0:h1:h2.
231{ .mmi;		add		h0=r22,r24			// lo(h0*r0) + lo(h1*s1)
232		add		r23=r23,r25			// hi(h0*r0) + hi(h1*s1)
233	(p12)	shrp		i1=r16,r15,32		}
234{ .mmi;		add		h1=r26,r28			// lo(h0*r1) + lo(h1*r0)
235		add		r27=r27,r29			// hi(h0*r1) + hi(h1*r0)
236	(p11)	shrp		i0=r15,r14,24		};;
237{ .mmi;		cmp.ltu		p6,p0=h0,r24			// carry out of the h0 sum
238		cmp.ltu		p7,p0=h1,r28			// carry out of the h1 sum
239		add		r23=r23,r30		};;	// + lo(h2*s1)
240{ .mmi;	(p6)	add		r23=1,r23
241	(p7)	add		r27=1,r27
242	(p11)	shrp		i1=r16,r15,24		};;
243{ .mmi;		add		h1=h1,r23;;
244		cmp.ltu		p6,p7=h1,r23			// p6 = carry, p7 = no carry
245	(p10)	shrp		i0=r15,r14,16		};;
246{ .mmi;	(p6)	add		h2=r31,r27,1			// h2 = lo(h2*r0) + r27 + 1
247	(p7)	add		h2=r31,r27			// h2 = lo(h2*r0) + r27
248	(p10)	shrp		i1=r16,r15,16		};;
249
	// Partial reduction: keep the low two bits of h2 and add
	// (h2 & -4) + (h2 >> 2) to h0, propagating the carry through h1
	// into h2.
250{ .mmi;	(p8)	mov		i0=r15
251		and		r22=-4,h2
252		shr.u		r23=h2,2		};;
253{ .mmi;		add		r22=r22,r23
254		and		h2=3,h2
255	(p9)	shrp		i0=r15,r14,8		};;
256
257{ .mmi;		add		h0=h0,r22;;
258		cmp.ltu		p6,p0=h0,r22			// p6 = carry out of h0
259	(p9)	shrp		i1=r16,r15,8		};;
260{ .mmi;	(p8)	mov		i1=r16
261	(p6)	cmp.eq.unc	p7,p0=-1,h1			// p7 = carry and h1 all ones (cleared when p6 is clear)
262	(p6)	add		h1=1,h1			};;
263{ .mmb;	(p7)	add		h2=1,h2
264		mov		r14=r16					// last word becomes the first of the next block
265		br.ctop.sptk	.Loop			};;
266.Lcend:
267
268{ .mii;	SUM		1<<1				// back to big-endian
269	mov		ar.lc=r3		};;	// restore the loop counter saved at entry
270
	// r8 and r9 were returned to ctx+0 and ctx+8 after the initial loads.
271{ .mmi;	st8		[r8]=h0,16
272	st8		[r9]=h1
273	mov		pr=r36,0x1ffff		};;	// restore the predicates saved at entry
	// NOTE(review): rum 1<<5 clears user-mask bit 5, presumably um.mfh,
	// since f32-f41 were written above -- confirm against the ISA manual.
274{ .mmb;	st8		[r8]=h2
275	rum		1<<5
276	br.ret.sptk	b0			};;
277.endp	poly1305_blocks#
278
// poly1305_emit
//
// In:   r32 = context pointer (h0, h1, h2 at ctx+0, ctx+8, ctx+16)
//       r33 = output pointer, receives 16 bytes
//       r34 = nonce pointer, read as four 32-bit words with ld4
//
// Computes h + 5 across h0:h1:h2. When the top word of that sum is at
// least 4 (that is, h + 5 >= 2^130) the low 128 bits of h + 5 are used,
// otherwise the low 128 bits of h. The 128-bit nonce is then added
// (carry from the low to the high word only) and the result is written
// byte by byte, least significant byte first.
279.global	poly1305_emit#
280.proc	poly1305_emit#
281.align	64
282poly1305_emit:
283	.prologue
284	.save		ar.pfs,r2
285{ .mmi;	alloc		r2=ar.pfs,3,0,0,0
286	ADDP		r8=0,r32
287	ADDP		r9=8,r32		};;
288
289	.body
290{ .mmi;	ld8		r16=[r8],16			// load hash
291	ld8		r17=[r9]
292	ADDP		r10=0,r34		};;
293{ .mmi;	ld8		r18=[r8]
294	ld4		r24=[r10],8			// load nonce
295	ADDP		r11=4,r34		};;
296
297{ .mmi;	ld4		r25=[r11],8
298	ld4		r26=[r10]
299	add		r20=5,r16		};;	// r20 = h0 + 5
300
301{ .mmi;	ld4		r27=[r11]
302	cmp.ltu		p6,p7=r20,r16				// p6 = carry out of h0 + 5
303	shl		r25=r25,32		};;
// r21 = h1 + carry. The carry continues into h2 only when h1 was all
// ones. An or.andcm compare can only set its first target and clear its
// second, so the relation is inverted and the targets are swapped:
// when h1 != -1 this sets p7 and clears p6. (The former
// cmp.eq.or.andcm p6,p7 could never clear p6, which let the carry out
// of h0 + 5 reach h2 regardless of h1.)
304{ .mmi;
305(p6)	add		r21=1,r17
306(p7)	add		r21=0,r17
307(p6)	cmp.ne.or.andcm	p7,p6=-1,r17		};;
308{ .mmi;
309(p6)	add		r22=1,r18				// r22 = h2 + carry
310(p7)	add		r22=0,r18
311	shl		r27=r27,32		};;
312{ .mmi;	or		r24=r24,r25				// low 64 bits of the nonce
313	or		r26=r26,r27				// high 64 bits of the nonce
314	cmp.leu		p6,p7=4,r22		};;	// p6: top word of h + 5 is >= 4
// p6 selects h + 5 (r20, r21), p7 selects h (r16, r17); the nonce is
// added in the same step.
315{ .mmi;
316(p6)	add		r16=r20,r24
317(p7)	add		r16=r16,r24
318(p6)	add		r17=r21,r26		};;
319{ .mii;
320(p7)	add		r17=r17,r26
321	cmp.ltu		p6,p7=r16,r24;;				// carry out of the low word
322(p6)	add		r17=1,r17		};;
323
// r8..r11 point at output bytes 0, 4, 8 and 12. r16/r20 hold the low
// word and its upper half, r17/r21 the high word and its upper half.
// Each round stores one byte through every pointer, then shifts the
// four values right by 8.
324{ .mmi;	ADDP		r8=0,r33
325	ADDP		r9=4,r33
326	shr.u		r20=r16,32		}
327{ .mmi;	ADDP		r10=8,r33
328	ADDP		r11=12,r33
329	shr.u		r21=r17,32		};;
330
331{ .mmi;	st1		[r8]=r16,1			// write mac, little-endian
332	st1		[r9]=r20,1
333	shr.u		r16=r16,8		}
334{ .mii;	st1		[r10]=r17,1
335	shr.u		r20=r20,8
336	shr.u		r17=r17,8		}
337{ .mmi;	st1		[r11]=r21,1
338	shr.u		r21=r21,8		};;
339
340{ .mmi;	st1		[r8]=r16,1
341	st1		[r9]=r20,1
342	shr.u		r16=r16,8		}
343{ .mii;	st1		[r10]=r17,1
344	shr.u		r20=r20,8
345	shr.u		r17=r17,8		}
346{ .mmi;	st1		[r11]=r21,1
347	shr.u		r21=r21,8		};;
348
349{ .mmi;	st1		[r8]=r16,1
350	st1		[r9]=r20,1
351	shr.u		r16=r16,8		}
352{ .mii;	st1		[r10]=r17,1
353	shr.u		r20=r20,8
354	shr.u		r17=r17,8		}
355{ .mmi;	st1		[r11]=r21,1
356	shr.u		r21=r21,8		};;
357
358{ .mmi;	st1		[r8]=r16
359	st1		[r9]=r20		}
360{ .mmb;	st1		[r10]=r17
361	st1		[r11]=r21
362	br.ret.sptk	b0			};;
363.endp	poly1305_emit#
364
// NUL-terminated identification string emitted into the object file.
// The @ needs no backslash in an assembler string; the escaped form was
// a leftover from Perl-generated sources and only provokes an
// unknown-escape warning before assembling to the same bytes.
365stringz	"Poly1305 for IA64, CRYPTOGAMS by @dot-asm"
366