#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for SPARCv9, vanilla, as well
# as VIS3 and FMA extensions.
#
# May, August 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#			IALU(*)		FMA
#
# UltraSPARC III	12.3(**)
# SPARC T3		7.92
# SPARC T4		1.70(***)	6.55
# SPARC64 X		5.60		3.64
#
# (*)	Comparison to compiler-generated code is problematic, because
#	the latter's performance varies too much depending on too many
#	variables. For example, one can measure a 5x to 15x improvement
#	on T4 for gcc-4.6. In the T4 case it's a somewhat unfair
#	comparison, because the compiler doesn't use VIS3, but given
#	the same initial conditions the coefficient varies from 3x to 9x.
# (**)	Pre-III performance should be even worse; floating-point
#	performance for UltraSPARC I-IV, on the other hand, is reported
#	to be 4.25 for hand-coded assembly, but those processors are
#	too old to care about.
# (***)	Multi-process benchmark saturates at ~12.5x the single-process
#	result on an 8-core processor, or ~21GBps per 2.85GHz socket.

# $output is the last argument if it looks like a file (it has an extension)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

open STDOUT,">$output" if $output;

my ($ctx,$inp,$len,$padbit,$shl,$shr)	= map("%i$_",(0..5));
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)	= map("%l$_",(0..7));
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)	= map("%o$_",(0..5,7));
my ($d0,$d1,$d2,$d3)			= map("%g$_",(1..4));

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
# define	STPTR	stx
# define	SIZE_T	8
#else
# define	STPTR	st
# define	SIZE_T	4
#endif
#define	LOCALS	(STACK_BIAS+STACK_FRAME)

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	poly1305_init
.align	32
poly1305_init:
	save	%sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld	[%g1],%g1

	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp	%g1,SPARCV9_FMADD
	be	.Lpoly1305_init_fma
	nop

	stx	%g0,[$ctx+0]
	stx	%g0,[$ctx+8]		! zero hash value
	brz,pn	$inp,.Lno_key
	stx	%g0,[$ctx+16]

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	sll	$shr,3,$shr		! *8
	neg	$shr,$shl

	sethi	%hi(0x0ffffffc),$t0
	set	8,$h1
	or	$t0,%lo(0x0ffffffc),$t0
	set	16,$h2
	sllx	$t0,32,$t1
	or	$t0,$t1,$t1		! 0x0ffffffc0ffffffc
	or	$t1,3,$t0		! 0x0ffffffc0fffffff

	ldxa	[$inp+%g0]0x88,$h0	! load little-endian key
	brz,pt	$shr,.Lkey_aligned
	ldxa	[$inp+$h1]0x88,$h1

	ldxa	[$inp+$h2]0x88,$h2
	srlx	$h0,$shr,$h0
	sllx	$h1,$shl,$t2
	srlx	$h1,$shr,$h1
	or	$t2,$h0,$h0
	sllx	$h2,$shl,$h2
	or	$h2,$h1,$h1

.Lkey_aligned:
	and	$t0,$h0,$h0
	and	$t1,$h1,$h1
	stx	$h0,[$ctx+32+0]		! store key
	stx	$h1,[$ctx+32+8]

	andcc	%g1,SPARCV9_VIS3,%g0
	be	.Lno_key
	nop

1:	call	.+8
	add	%o7,poly1305_blocks_vis3-1b,%o7

	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR	%o7,[%i2]
	STPTR	%o5,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init,#function
.size	poly1305_init,.-poly1305_init

.globl	poly1305_blocks
.align	32
poly1305_blocks:
	save	%sp,-STACK_FRAME,%sp
	srln	$len,4,$len

	brz,pn	$len,.Lno_data
	nop

	ld	[$ctx+32+0],$r1		! load key
	ld	[$ctx+32+4],$r0
	ld	[$ctx+32+8],$r3
	ld	[$ctx+32+12],$r2

	ld	[$ctx+0],$h1		! load hash value
	ld	[$ctx+4],$h0
	ld	[$ctx+8],$h3
	ld	[$ctx+12],$h2
	ld	[$ctx+16],$h4

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	set	8,$d1
	sll	$shr,3,$shr		! *8
	set	16,$d2
	neg	$shr,$shl

	srl	$r1,2,$s1
	srl	$r2,2,$s2
	add	$r1,$s1,$s1
	srl	$r3,2,$s3
	add	$r2,$s2,$s2
	add	$r3,$s3,$s3
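	! Note: r1..r3 are multiples of 4 (key clamping in poly1305_init),
	! so s1..s3 = r1..r3 + r1..r3>>2 = 5/4*r1..r3; using s_i for the
	! high-order partial products folds the reduction
	! 2^130 = 5 mod 2^130-5 into the multiplication.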

.Loop:
	ldxa	[$inp+%g0]0x88,$d0	! load little-endian input
	brz,pt	$shr,.Linp_aligned
	ldxa	[$inp+$d1]0x88,$d1

	ldxa	[$inp+$d2]0x88,$d2
	srlx	$d0,$shr,$d0
	sllx	$d1,$shl,$t1
	srlx	$d1,$shr,$d1
	or	$t1,$d0,$d0
	sllx	$d2,$shl,$d2
	or	$d2,$d1,$d1

.Linp_aligned:
	srlx	$d0,32,$t0
	addcc	$d0,$h0,$h0		! accumulate input
	srlx	$d1,32,$t1
	addccc	$t0,$h1,$h1
	addccc	$d1,$h2,$h2
	addccc	$t1,$h3,$h3
	addc	$padbit,$h4,$h4

	umul	$r0,$h0,$d0
	umul	$r1,$h0,$d1
	umul	$r2,$h0,$d2
	umul	$r3,$h0,$d3
	 sub	$len,1,$len
	 add	$inp,16,$inp

	umul	$s3,$h1,$t0
	umul	$r0,$h1,$t1
	umul	$r1,$h1,$t2
	add	$t0,$d0,$d0
	add	$t1,$d1,$d1
	umul	$r2,$h1,$t0
	add	$t2,$d2,$d2
	add	$t0,$d3,$d3

	umul	$s2,$h2,$t1
	umul	$s3,$h2,$t2
	umul	$r0,$h2,$t0
	add	$t1,$d0,$d0
	add	$t2,$d1,$d1
	umul	$r1,$h2,$t1
	add	$t0,$d2,$d2
	add	$t1,$d3,$d3

	umul	$s1,$h3,$t2
	umul	$s2,$h3,$t0
	umul	$s3,$h3,$t1
	add	$t2,$d0,$d0
	add	$t0,$d1,$d1
	umul	$r0,$h3,$t2
	add	$t1,$d2,$d2
	add	$t2,$d3,$d3

	umul	$s1,$h4,$t0
	umul	$s2,$h4,$t1
	umul	$s3,$h4,$t2
	umul	$r0,$h4,$h4
	add	$t0,$d1,$d1
	add	$t1,$d2,$d2
	srlx	$d0,32,$h1
	add	$t2,$d3,$d3
	srlx	$d1,32,$h2

	addcc	$d1,$h1,$h1
	srlx	$d2,32,$h3
	 set	8,$d1
	addccc	$d2,$h2,$h2
	srlx	$d3,32,$t0
	 set	16,$d2
	addccc	$d3,$h3,$h3
	addc	$t0,$h4,$h4

	srl	$h4,2,$t0		! final reduction step
	andn	$h4,3,$t1
	and	$h4,3,$h4
	add	$t1,$t0,$t0

	addcc	$t0,$d0,$h0
	addccc	%g0,$h1,$h1
	addccc	%g0,$h2,$h2
	addccc	%g0,$h3,$h3
	brnz,pt	$len,.Loop
	addc	%g0,$h4,$h4

	st	$h1,[$ctx+0]		! store hash value
	st	$h0,[$ctx+4]
	st	$h3,[$ctx+8]
	st	$h2,[$ctx+12]
	st	$h4,[$ctx+16]

.Lno_data:
	ret
	restore
.type	poly1305_blocks,#function
.size	poly1305_blocks,.-poly1305_blocks
___
########################################################################
# VIS3 has umulxhi and addxc...
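# The hash is kept as two 64-bit limbs ($H0,$H1) plus the top bits in
# $H2. As in the 32-bit code path, $S1 = $R1 + $R1>>2 = 5/4*$R1 ($R1 is
# a multiple of 4 thanks to key clamping), which folds the reduction
# 2^130 = 5 mod 2^130-5 into the multiplication.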
{
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));

$code.=<<___;
.align	32
poly1305_blocks_vis3:
	save	%sp,-STACK_FRAME,%sp
	srln	$len,4,$len

	brz,pn	$len,.Lno_data
	nop

	ldx	[$ctx+32+0],$R0		! load key
	ldx	[$ctx+32+8],$R1

	ldx	[$ctx+0],$H0		! load hash value
	ldx	[$ctx+8],$H1
	ld	[$ctx+16],$H2

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	set	8,$r1
	sll	$shr,3,$shr		! *8
	set	16,$r2
	neg	$shr,$shl

	srlx	$R1,2,$S1
	b	.Loop_vis3
	add	$R1,$S1,$S1

.Loop_vis3:
	ldxa	[$inp+%g0]0x88,$D0	! load little-endian input
	brz,pt	$shr,.Linp_aligned_vis3
	ldxa	[$inp+$r1]0x88,$D1

	ldxa	[$inp+$r2]0x88,$D2
	srlx	$D0,$shr,$D0
	sllx	$D1,$shl,$T1
	srlx	$D1,$shr,$D1
	or	$T1,$D0,$D0
	sllx	$D2,$shl,$D2
	or	$D2,$D1,$D1

.Linp_aligned_vis3:
	addcc	$D0,$H0,$H0		! accumulate input
	 sub	$len,1,$len
	addxccc	$D1,$H1,$H1
	 add	$inp,16,$inp

	mulx	$R0,$H0,$D0		! r0*h0
	addxc	$padbit,$H2,$H2
	umulxhi	$R0,$H0,$D1
	mulx	$S1,$H1,$T0		! s1*h1
	umulxhi	$S1,$H1,$T1
	addcc	$T0,$D0,$D0
	mulx	$R1,$H0,$T0		! r1*h0
	addxc	$T1,$D1,$D1
	umulxhi	$R1,$H0,$D2
	addcc	$T0,$D1,$D1
	mulx	$R0,$H1,$T0		! r0*h1
	addxc	%g0,$D2,$D2
	umulxhi	$R0,$H1,$T1
	addcc	$T0,$D1,$D1
	mulx	$S1,$H2,$T0		! s1*h2
	addxc	$T1,$D2,$D2
	mulx	$R0,$H2,$T1		! r0*h2
	addcc	$T0,$D1,$D1
	addxc	$T1,$D2,$D2

	srlx	$D2,2,$T0		! final reduction step
	andn	$D2,3,$T1
	and	$D2,3,$H2
	add	$T1,$T0,$T0

	addcc	$T0,$D0,$H0
	addxccc	%g0,$D1,$H1
	brnz,pt	$len,.Loop_vis3
	addxc	%g0,$H2,$H2

	stx	$H0,[$ctx+0]		! store hash value
	stx	$H1,[$ctx+8]
	st	$H2,[$ctx+16]

	ret
	restore
.type	poly1305_blocks_vis3,#function
.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3
___
}
my ($mac,$nonce) = ($inp,$len);

$code.=<<___;
.globl	poly1305_emit
.align	32
poly1305_emit:
	save	%sp,-STACK_FRAME,%sp

	ld	[$ctx+0],$h1		! load hash value
	ld	[$ctx+4],$h0
	ld	[$ctx+8],$h3
	ld	[$ctx+12],$h2
	ld	[$ctx+16],$h4

	addcc	$h0,5,$r0		! compare to modulus
	addccc	$h1,0,$r1
	addccc	$h2,0,$r2
	addccc	$h3,0,$r3
	addc	$h4,0,$h4
	andcc	$h4,4,%g0		! did it carry/borrow?

	movnz	%icc,$r0,$h0
	ld	[$nonce+0],$r0		! load nonce
	movnz	%icc,$r1,$h1
	ld	[$nonce+4],$r1
	movnz	%icc,$r2,$h2
	ld	[$nonce+8],$r2
	movnz	%icc,$r3,$h3
	ld	[$nonce+12],$r3

	addcc	$r0,$h0,$h0		! accumulate nonce
	addccc	$r1,$h1,$h1
	addccc	$r2,$h2,$h2
	addc	$r3,$h3,$h3

	srl	$h0,8,$r0
	stb	$h0,[$mac+0]		! store little-endian result
	srl	$h0,16,$r1
	stb	$r0,[$mac+1]
	srl	$h0,24,$r2
	stb	$r1,[$mac+2]
	stb	$r2,[$mac+3]

	srl	$h1,8,$r0
	stb	$h1,[$mac+4]
	srl	$h1,16,$r1
	stb	$r0,[$mac+5]
	srl	$h1,24,$r2
	stb	$r1,[$mac+6]
	stb	$r2,[$mac+7]

	srl	$h2,8,$r0
	stb	$h2,[$mac+8]
	srl	$h2,16,$r1
	stb	$r0,[$mac+9]
	srl	$h2,24,$r2
	stb	$r1,[$mac+10]
	stb	$r2,[$mac+11]

	srl	$h3,8,$r0
	stb	$h3,[$mac+12]
	srl	$h3,16,$r1
	stb	$r0,[$mac+13]
	srl	$h3,24,$r2
	stb	$r1,[$mac+14]
	stb	$r2,[$mac+15]

	ret
	restore
.type	poly1305_emit,#function
.size	poly1305_emit,.-poly1305_emit
___

{
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
my $i2=$step;

my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);

$code.=<<___;
.align	32
poly1305_init_fma:
	save	%sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],$two0			! load constants
	ldd	[%o7+8*1],$two32
	ldd	[%o7+8*2],$two64
	ldd	[%o7+8*3],$two96
	ldd	[%o7+8*5],$five_two130

	std	$two0,[$ctx+8*0]		! initial hash value, biased 0
	std	$two32,[$ctx+8*1]
	std	$two64,[$ctx+8*2]
	std	$two96,[$ctx+8*3]

	brz,pn	$inp,.Lno_key_fma
	nop

	stx	%fsr,[%sp+LOCALS]		! save original %fsr
	ldx	[%o7+8*6],%fsr			! load new %fsr

	std	$two0,[$ctx+8*4] 		! key "template"
	std	$two32,[$ctx+8*5]
	std	$two64,[$ctx+8*6]
	std	$two96,[$ctx+8*7]
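	! The "template" trick: each slot holds the constant 2^(52+32*i),
	! whose low-order 32 bits (big-endian offset +4) are zero. Storing
	! a 32-bit limb into that word and later subtracting the same
	! constant yields the limb as a double scaled by 2^(32*i), without
	! any explicit integer-to-float conversion. The "biased 0" hash
	! value stored above is simply all-zero limbs in this form.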

	and	$inp,7,$shr
	andn	$inp,7,$inp			! align pointer
	mov	8,$i1
	sll	$shr,3,$shr
	mov	16,$i2
	neg	$shr,$shl

	ldxa	[$inp+%g0]0x88,$in0		! load little-endian key
	ldxa	[$inp+$i1]0x88,$in2

	brz	$shr,.Lkey_aligned_fma
	sethi	%hi(0xf0000000),$i1		!   0xf0000000

	ldxa	[$inp+$i2]0x88,$in4

	srlx	$in0,$shr,$in0			! align data
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in2
	or	$in1,$in0,$in0
	sllx	$in4,$shl,$in3
	or	$in3,$in2,$in2

.Lkey_aligned_fma:
	or	$i1,3,$i2			!   0xf0000003
	srlx	$in0,32,$in1
	andn	$in0,$i1,$in0			! &=0x0fffffff
	andn	$in1,$i2,$in1			! &=0x0ffffffc
	srlx	$in2,32,$in3
	andn	$in2,$i2,$in2
	andn	$in3,$i2,$in3

	st	$in0,[$ctx+`8*4+4`]		! fill "template"
	st	$in1,[$ctx+`8*5+4`]
	st	$in2,[$ctx+`8*6+4`]
	st	$in3,[$ctx+`8*7+4`]

	ldd	[$ctx+8*4],$h0lo 		! load [biased] key
	ldd	[$ctx+8*5],$h1lo
	ldd	[$ctx+8*6],$h2lo
	ldd	[$ctx+8*7],$h3lo

	fsubd	$h0lo,$two0, $h0lo		! r0
	 ldd	[%o7+8*7],$two0 		! more constants
	fsubd	$h1lo,$two32,$h1lo		! r1
	 ldd	[%o7+8*8],$two32
	fsubd	$h2lo,$two64,$h2lo		! r2
	 ldd	[%o7+8*9],$two64
	fsubd	$h3lo,$two96,$h3lo		! r3
	 ldd	[%o7+8*10],$two96

	fmuld	$five_two130,$h1lo,$s1lo	! s1
	fmuld	$five_two130,$h2lo,$s2lo	! s2
	fmuld	$five_two130,$h3lo,$s3lo	! s3

	faddd	$h0lo,$two0, $h0hi
	faddd	$h1lo,$two32,$h1hi
	faddd	$h2lo,$two64,$h2hi
	faddd	$h3lo,$two96,$h3hi

	fsubd	$h0hi,$two0, $h0hi
	 ldd	[%o7+8*11],$two0		! more constants
	fsubd	$h1hi,$two32,$h1hi
	 ldd	[%o7+8*12],$two32
	fsubd	$h2hi,$two64,$h2hi
	 ldd	[%o7+8*13],$two64
	fsubd	$h3hi,$two96,$h3hi

	fsubd	$h0lo,$h0hi,$h0lo
	 std	$h0hi,[$ctx+8*5] 		! r0hi
	fsubd	$h1lo,$h1hi,$h1lo
	 std	$h1hi,[$ctx+8*7] 		! r1hi
	fsubd	$h2lo,$h2hi,$h2lo
	 std	$h2hi,[$ctx+8*9] 		! r2hi
	fsubd	$h3lo,$h3hi,$h3lo
	 std	$h3hi,[$ctx+8*11]		! r3hi

	faddd	$s1lo,$two0, $s1hi
	faddd	$s2lo,$two32,$s2hi
	faddd	$s3lo,$two64,$s3hi

	fsubd	$s1hi,$two0, $s1hi
	fsubd	$s2hi,$two32,$s2hi
	fsubd	$s3hi,$two64,$s3hi

	fsubd	$s1lo,$s1hi,$s1lo
	fsubd	$s2lo,$s2hi,$s2lo
	fsubd	$s3lo,$s3hi,$s3lo

	ldx	[%sp+LOCALS],%fsr		! restore %fsr

	std	$h0lo,[$ctx+8*4] 		! r0lo
	std	$h1lo,[$ctx+8*6] 		! r1lo
	std	$h2lo,[$ctx+8*8] 		! r2lo
	std	$h3lo,[$ctx+8*10]		! r3lo

	std	$s1hi,[$ctx+8*13]
	std	$s2hi,[$ctx+8*15]
	std	$s3hi,[$ctx+8*17]

	std	$s1lo,[$ctx+8*12]
	std	$s2lo,[$ctx+8*14]
	std	$s3lo,[$ctx+8*16]

	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR	%o0,[%i2]
	STPTR	%o1,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0			! return 1

.Lno_key_fma:
	ret
	restore	%g0,%g0,%o0			! return 0
.type	poly1305_init_fma,#function
.size	poly1305_init_fma,.-poly1305_init_fma

.align	32
poly1305_blocks_fma:
	save	%sp,-STACK_FRAME-48,%sp
	srln	$len,4,$len

	brz,pn	$len,.Labort
	sub	$len,1,$len

1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],$two0			! load constants
	ldd	[%o7+8*1],$two32
	ldd	[%o7+8*2],$two64
	ldd	[%o7+8*3],$two96
	ldd	[%o7+8*4],$two130
	ldd	[%o7+8*5],$five_two130

	ldd	[$ctx+8*0],$h0lo 		! load [biased] hash value
	ldd	[$ctx+8*1],$h1lo
	ldd	[$ctx+8*2],$h2lo
	ldd	[$ctx+8*3],$h3lo

	std	$two0,[%sp+LOCALS+8*0]		! input "template"
	sethi	%hi((1023+52+96)<<20),$in3
	std	$two32,[%sp+LOCALS+8*1]
	or	$padbit,$in3,$in3
	std	$two64,[%sp+LOCALS+8*2]
	st	$in3,[%sp+LOCALS+8*3]

	and	$inp,7,$shr
	andn	$inp,7,$inp			! align pointer
	mov	8,$i1
	sll	$shr,3,$shr
	mov	16,$step
	neg	$shr,$shl

	ldxa	[$inp+%g0]0x88,$in0		! load little-endian input
	brz	$shr,.Linp_aligned_fma
	ldxa	[$inp+$i1]0x88,$in2

	ldxa	[$inp+$step]0x88,$in4
	add	$inp,8,$inp

	srlx	$in0,$shr,$in0			! align data
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in2
	or	$in1,$in0,$in0
	sllx	$in4,$shl,$in3
	srlx	$in4,$shr,$in4			! pre-shift
	or	$in3,$in2,$in2

.Linp_aligned_fma:
	srlx	$in0,32,$in1
	movrz	$len,0,$step
	srlx	$in2,32,$in3
	add	$step,$inp,$inp			! conditional advance

	st	$in0,[%sp+LOCALS+8*0+4]		! fill "template"
	st	$in1,[%sp+LOCALS+8*1+4]
	st	$in2,[%sp+LOCALS+8*2+4]
	st	$in3,[%sp+LOCALS+8*3+4]

	ldd	[$ctx+8*4],$r0lo 		! load key
	ldd	[$ctx+8*5],$r0hi
	ldd	[$ctx+8*6],$r1lo
	ldd	[$ctx+8*7],$r1hi
	ldd	[$ctx+8*8],$r2lo
	ldd	[$ctx+8*9],$r2hi
	ldd	[$ctx+8*10],$r3lo
	ldd	[$ctx+8*11],$r3hi
	ldd	[$ctx+8*12],$s1lo
	ldd	[$ctx+8*13],$s1hi
	ldd	[$ctx+8*14],$s2lo
	ldd	[$ctx+8*15],$s2hi
	ldd	[$ctx+8*16],$s3lo
	ldd	[$ctx+8*17],$s3hi

	stx	%fsr,[%sp+LOCALS+8*4]		! save original %fsr
	ldx	[%o7+8*6],%fsr			! load new %fsr

	subcc	$len,1,$len
	movrz	$len,0,$step

	ldd	[%sp+LOCALS+8*0],$x0		! load biased input
	ldd	[%sp+LOCALS+8*1],$x1
	ldd	[%sp+LOCALS+8*2],$x2
	ldd	[%sp+LOCALS+8*3],$x3

	fsubd	$h0lo,$two0, $h0lo		! de-bias hash value
	fsubd	$h1lo,$two32,$h1lo
	 ldxa	[$inp+%g0]0x88,$in0		! modulo-scheduled input load
	fsubd	$h2lo,$two64,$h2lo
	fsubd	$h3lo,$two96,$h3lo
	 ldxa	[$inp+$i1]0x88,$in2

	fsubd	$x0,$two0, $x0  		! de-bias input
	fsubd	$x1,$two32,$x1
	fsubd	$x2,$two64,$x2
	fsubd	$x3,$two96,$x3

	brz	$shr,.Linp_aligned_fma2
	add	$step,$inp,$inp			! conditional advance

	sllx	$in0,$shl,$in1			! align data
	srlx	$in0,$shr,$in3
	or	$in1,$in4,$in0
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in4			! pre-shift
	or	$in3,$in1,$in2
.Linp_aligned_fma2:
	srlx	$in0,32,$in1
	srlx	$in2,32,$in3

	faddd	$h0lo,$x0,$x0			! accumulate input
	 stw	$in0,[%sp+LOCALS+8*0+4]
	faddd	$h1lo,$x1,$x1
	 stw	$in1,[%sp+LOCALS+8*1+4]
	faddd	$h2lo,$x2,$x2
	 stw	$in2,[%sp+LOCALS+8*2+4]
	faddd	$h3lo,$x3,$x3
	 stw	$in3,[%sp+LOCALS+8*3+4]

	b	.Lentry_fma
	nop

.align	16
.Loop_fma:
	ldxa	[$inp+%g0]0x88,$in0		! modulo-scheduled input load
	ldxa	[$inp+$i1]0x88,$in2
	movrz	$len,0,$step

	faddd	$y0,$h0lo,$h0lo 		! accumulate input
	faddd	$y1,$h0hi,$h0hi
	faddd	$y2,$h2lo,$h2lo
	faddd	$y3,$h2hi,$h2hi

	brz,pn	$shr,.Linp_aligned_fma3
	add	$step,$inp,$inp			! conditional advance

	sllx	$in0,$shl,$in1			! align data
	srlx	$in0,$shr,$in3
	or	$in1,$in4,$in0
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in4			! pre-shift
	or	$in3,$in1,$in2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	$two64,$h1lo,$c1lo
	 srlx	$in0,32,$in1
	faddd	$two64,$h1hi,$c1hi
	 srlx	$in2,32,$in3
	faddd	$two130,$h3lo,$c3lo
	 st	$in0,[%sp+LOCALS+8*0+4]		! fill "template"
	faddd	$two130,$h3hi,$c3hi
	 st	$in1,[%sp+LOCALS+8*1+4]
	faddd	$two32,$h0lo,$c0lo
	 st	$in2,[%sp+LOCALS+8*2+4]
	faddd	$two32,$h0hi,$c0hi
	 st	$in3,[%sp+LOCALS+8*3+4]
	faddd	$two96,$h2lo,$c2lo
	faddd	$two96,$h2hi,$c2hi

	fsubd	$c1lo,$two64,$c1lo
	fsubd	$c1hi,$two64,$c1hi
	fsubd	$c3lo,$two130,$c3lo
	fsubd	$c3hi,$two130,$c3hi
	fsubd	$c0lo,$two32,$c0lo
	fsubd	$c0hi,$two32,$c0hi
	fsubd	$c2lo,$two96,$c2lo
	fsubd	$c2hi,$two96,$c2hi

	fsubd	$h1lo,$c1lo,$h1lo
	fsubd	$h1hi,$c1hi,$h1hi
	fsubd	$h3lo,$c3lo,$h3lo
	fsubd	$h3hi,$c3hi,$h3hi
	fsubd	$h2lo,$c2lo,$h2lo
	fsubd	$h2hi,$c2hi,$h2hi
	fsubd	$h0lo,$c0lo,$h0lo
	fsubd	$h0hi,$c0hi,$h0hi

	faddd	$h1lo,$c0lo,$h1lo
	faddd	$h1hi,$c0hi,$h1hi
	faddd	$h3lo,$c2lo,$h3lo
	faddd	$h3hi,$c2hi,$h3hi
	faddd	$h2lo,$c1lo,$h2lo
	faddd	$h2hi,$c1hi,$h2hi
	fmaddd	$five_two130,$c3lo,$h0lo,$h0lo
	fmaddd	$five_two130,$c3hi,$h0hi,$h0hi

	faddd	$h1lo,$h1hi,$x1
	 ldd	[$ctx+8*12],$s1lo		! reload constants
	faddd	$h3lo,$h3hi,$x3
	 ldd	[$ctx+8*13],$s1hi
	faddd	$h2lo,$h2hi,$x2
	 ldd	[$ctx+8*10],$r3lo
	faddd	$h0lo,$h0hi,$x0
	 ldd	[$ctx+8*11],$r3hi

.Lentry_fma:
	fmuld	$x1,$s3lo,$h0lo
	fmuld	$x1,$s3hi,$h0hi
	fmuld	$x1,$r1lo,$h2lo
	fmuld	$x1,$r1hi,$h2hi
	fmuld	$x1,$r0lo,$h1lo
	fmuld	$x1,$r0hi,$h1hi
	fmuld	$x1,$r2lo,$h3lo
	fmuld	$x1,$r2hi,$h3hi

	fmaddd	$x3,$s1lo,$h0lo,$h0lo
	fmaddd	$x3,$s1hi,$h0hi,$h0hi
	fmaddd	$x3,$s3lo,$h2lo,$h2lo
	fmaddd	$x3,$s3hi,$h2hi,$h2hi
	fmaddd	$x3,$s2lo,$h1lo,$h1lo
	fmaddd	$x3,$s2hi,$h1hi,$h1hi
	fmaddd	$x3,$r0lo,$h3lo,$h3lo
	fmaddd	$x3,$r0hi,$h3hi,$h3hi

	fmaddd	$x2,$s2lo,$h0lo,$h0lo
	fmaddd	$x2,$s2hi,$h0hi,$h0hi
	fmaddd	$x2,$r0lo,$h2lo,$h2lo
	fmaddd	$x2,$r0hi,$h2hi,$h2hi
	fmaddd	$x2,$s3lo,$h1lo,$h1lo
	 ldd	[%sp+LOCALS+8*0],$y0		! load [biased] input
	fmaddd	$x2,$s3hi,$h1hi,$h1hi
	 ldd	[%sp+LOCALS+8*1],$y1
	fmaddd	$x2,$r1lo,$h3lo,$h3lo
	 ldd	[%sp+LOCALS+8*2],$y2
	fmaddd	$x2,$r1hi,$h3hi,$h3hi
	 ldd	[%sp+LOCALS+8*3],$y3

	fmaddd	$x0,$r0lo,$h0lo,$h0lo
	 fsubd	$y0,$two0, $y0  		! de-bias input
	fmaddd	$x0,$r0hi,$h0hi,$h0hi
	 fsubd	$y1,$two32,$y1
	fmaddd	$x0,$r2lo,$h2lo,$h2lo
	 fsubd	$y2,$two64,$y2
	fmaddd	$x0,$r2hi,$h2hi,$h2hi
	 fsubd	$y3,$two96,$y3
	fmaddd	$x0,$r1lo,$h1lo,$h1lo
	fmaddd	$x0,$r1hi,$h1hi,$h1hi
	fmaddd	$x0,$r3lo,$h3lo,$h3lo
	fmaddd	$x0,$r3hi,$h3hi,$h3hi

	bcc	SIZE_T_CC,.Loop_fma
	subcc	$len,1,$len

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	$h0lo,$two32,$c0lo
	faddd	$h0hi,$two32,$c0hi
	faddd	$h2lo,$two96,$c2lo
	faddd	$h2hi,$two96,$c2hi
	faddd	$h1lo,$two64,$c1lo
	faddd	$h1hi,$two64,$c1hi
	faddd	$h3lo,$two130,$c3lo
	faddd	$h3hi,$two130,$c3hi

	fsubd	$c0lo,$two32,$c0lo
	fsubd	$c0hi,$two32,$c0hi
	fsubd	$c2lo,$two96,$c2lo
	fsubd	$c2hi,$two96,$c2hi
	fsubd	$c1lo,$two64,$c1lo
	fsubd	$c1hi,$two64,$c1hi
	fsubd	$c3lo,$two130,$c3lo
	fsubd	$c3hi,$two130,$c3hi

	fsubd	$h1lo,$c1lo,$h1lo
	fsubd	$h1hi,$c1hi,$h1hi
	fsubd	$h3lo,$c3lo,$h3lo
	fsubd	$h3hi,$c3hi,$h3hi
	fsubd	$h2lo,$c2lo,$h2lo
	fsubd	$h2hi,$c2hi,$h2hi
	fsubd	$h0lo,$c0lo,$h0lo
	fsubd	$h0hi,$c0hi,$h0hi

	faddd	$h1lo,$c0lo,$h1lo
	faddd	$h1hi,$c0hi,$h1hi
	faddd	$h3lo,$c2lo,$h3lo
	faddd	$h3hi,$c2hi,$h3hi
	faddd	$h2lo,$c1lo,$h2lo
	faddd	$h2hi,$c1hi,$h2hi
	fmaddd	$five_two130,$c3lo,$h0lo,$h0lo
	fmaddd	$five_two130,$c3hi,$h0hi,$h0hi

	faddd	$h1lo,$h1hi,$x1
	faddd	$h3lo,$h3hi,$x3
	faddd	$h2lo,$h2hi,$x2
	faddd	$h0lo,$h0hi,$x0

	faddd	$x1,$two32,$x1  		! bias
	faddd	$x3,$two96,$x3
	faddd	$x2,$two64,$x2
	faddd	$x0,$two0, $x0

	ldx	[%sp+LOCALS+8*4],%fsr		! restore saved %fsr

	std	$x1,[$ctx+8*1]			! store [biased] hash value
	std	$x3,[$ctx+8*3]
	std	$x2,[$ctx+8*2]
	std	$x0,[$ctx+8*0]

.Labort:
	ret
	restore
.type	poly1305_blocks_fma,#function
.size	poly1305_blocks_fma,.-poly1305_blocks_fma
___
{
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
   ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));

$code.=<<___;
.align	32
poly1305_emit_fma:
	save	%sp,-STACK_FRAME,%sp

	ld	[$ctx+8*0+0],$d0		! load hash
	ld	[$ctx+8*0+4],$h0
	ld	[$ctx+8*1+0],$d1
	ld	[$ctx+8*1+4],$h1
	ld	[$ctx+8*2+0],$d2
	ld	[$ctx+8*2+4],$h2
	ld	[$ctx+8*3+0],$d3
	ld	[$ctx+8*3+4],$h3

	sethi	%hi(0xfff00000),$mask
	andn	$d0,$mask,$d0			! mask exponent
	andn	$d1,$mask,$d1
	andn	$d2,$mask,$d2
	andn	$d3,$mask,$d3			! can be partially reduced...
	mov	3,$mask

	srl	$d3,2,$padbit			! ... so reduce
	and	$d3,$mask,$h4
	andn	$d3,$mask,$d3
	add	$padbit,$d3,$d3

	addcc	$d3,$h0,$h0
	addccc	$d0,$h1,$h1
	addccc	$d1,$h2,$h2
	addccc	$d2,$h3,$h3
	addc	%g0,$h4,$h4

	addcc	$h0,5,$d0			! compare to modulus
	addccc	$h1,0,$d1
	addccc	$h2,0,$d2
	addccc	$h3,0,$d3
	addc	$h4,0,$mask

	srl	$mask,2,$mask			! did it carry/borrow?
	neg	$mask,$mask
	sra	$mask,31,$mask			! mask

	andn	$h0,$mask,$h0
	and	$d0,$mask,$d0
	andn	$h1,$mask,$h1
	and	$d1,$mask,$d1
	or	$d0,$h0,$h0
	ld	[$nonce+0],$d0			! load nonce
	andn	$h2,$mask,$h2
	and	$d2,$mask,$d2
	or	$d1,$h1,$h1
	ld	[$nonce+4],$d1
	andn	$h3,$mask,$h3
	and	$d3,$mask,$d3
	or	$d2,$h2,$h2
	ld	[$nonce+8],$d2
	or	$d3,$h3,$h3
	ld	[$nonce+12],$d3

	addcc	$d0,$h0,$h0			! accumulate nonce
	addccc	$d1,$h1,$h1
	addccc	$d2,$h2,$h2
	addc	$d3,$h3,$h3

	stb	$h0,[$mac+0]			! write little-endian result
	srl	$h0,8,$h0
	stb	$h1,[$mac+4]
	srl	$h1,8,$h1
	stb	$h2,[$mac+8]
	srl	$h2,8,$h2
	stb	$h3,[$mac+12]
	srl	$h3,8,$h3

	stb	$h0,[$mac+1]
	srl	$h0,8,$h0
	stb	$h1,[$mac+5]
	srl	$h1,8,$h1
	stb	$h2,[$mac+9]
	srl	$h2,8,$h2
	stb	$h3,[$mac+13]
	srl	$h3,8,$h3

	stb	$h0,[$mac+2]
	srl	$h0,8,$h0
	stb	$h1,[$mac+6]
	srl	$h1,8,$h1
	stb	$h2,[$mac+10]
	srl	$h2,8,$h2
	stb	$h3,[$mac+14]
	srl	$h3,8,$h3

	stb	$h0,[$mac+3]
	stb	$h1,[$mac+7]
	stb	$h2,[$mac+11]
	stb	$h3,[$mac+15]

	ret
	restore
.type	poly1305_emit_fma,#function
.size	poly1305_emit_fma,.-poly1305_emit_fma
___
}

$code.=<<___;
.align	64
.Lconsts_fma:
.word	0x43300000,0x00000000		! 2^(52+0)
.word	0x45300000,0x00000000		! 2^(52+32)
.word	0x47300000,0x00000000		! 2^(52+64)
.word	0x49300000,0x00000000		! 2^(52+96)
.word	0x4b500000,0x00000000		! 2^(52+130)

.word	0x37f40000,0x00000000		! 5/2^130
.word	0,1<<30				! fsr: truncate, no exceptions

.word	0x44300000,0x00000000		! 2^(52+16+0)
.word	0x46300000,0x00000000		! 2^(52+16+32)
.word	0x48300000,0x00000000		! 2^(52+16+64)
.word	0x4a300000,0x00000000		! 2^(52+16+96)
.word	0x3e300000,0x00000000		! 2^(52+16+0-96)
.word	0x40300000,0x00000000		! 2^(52+16+32-96)
.word	0x42300000,0x00000000		! 2^(52+16+64-96)
.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}

# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# retain the option of producing a "universal" binary and let the program
# detect at run-time whether the current CPU is VIS-capable.
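# As a concrete illustration of the encoding below,
# unvis3("umulxhi","%o0","%o1","%o2") should yield
# ".word 0x95b202c9 !umulxhi %o0,%o1,%o2" (rd=%o2=10, rs1=%o0=8, rs2=%o1=9).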
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfma {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %fmaopf = (	"fmadds"	=> 0x1,
		"fmaddd"	=> 0x2,
		"fmsubs"	=> 0x5,
		"fmsubd"	=> 0x6		);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if ($opf=$fmaopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
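# Likewise, unfma("fmaddd","%f0","%f2","%f4","%f6") should yield
# ".word 0x8db80842 !fmaddd %f0,%f2,%f4,%f6"; %f32..%f62 operands are
# re-encoded into the 5-bit field by folding bit 5 into bit 0 as above.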

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge	or
	s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
		&unfma($1,$2,$3,$4,$5)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
