#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+ a 128-byte shared
# table]. Performance results are for the streamed GHASH subroutine
# on an UltraSPARC pre-Tx CPU and are expressed in cycles per
# processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1			this assembler
#
# 32-bit build	566				50	(+1000%)
# 64-bit build	56				50	(+12%)
#
# I don't quite understand why the difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for the Z vector (see C code) even in the 32-bit build... Oh well, it
# only means more impressive improvement coefficients for this
# assembler module;-) Loops are aggressively modulo-scheduled with
# respect to references to input data and Z.hi updates to achieve the
# 12-cycle timing. To anchor it to something else, sha1-sparcv9.pl
# spends 11.6 cycles to process one byte on an UltraSPARC pre-Tx CPU
# and ~24 on T1.
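#
# For reference, below is a minimal C sketch of one "4-bit" (Shoup)
# multiplication as this module implements it, roughly modelled on the
# generic code in crypto/modes/gcm128.c; u8/u16/u64 are the obvious
# fixed-width types, Htable[i] holds i·H, and rem_4bit stands for the
# shared table emitted below (taken here as 16 16-bit constants, each
# used shifted left by 48; the assembly stores them pre-shifted as
# 64-bit values):
#
#	typedef struct { u64 hi, lo; } u128;
#
#	static void gmult_4bit(u8 Xi[16], const u128 Htable[16])
#	{
#	    u128   Z;
#	    int    cnt = 15;
#	    size_t nlo = Xi[15] & 0xf, nhi = Xi[15] >> 4;
#	    u64    rem;
#
#	    Z = Htable[nlo];
#	    while (1) {
#	        rem  = Z.lo & 0xf;	/* 4 bits shifted out below */
#	        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
#	        Z.hi = (Z.hi >> 4) ^ ((u64)rem_4bit[rem] << 48);
#	        Z.hi ^= Htable[nhi].hi;	/* fold in high nibble */
#	        Z.lo ^= Htable[nhi].lo;
#	        if (--cnt < 0) break;
#	        nlo = Xi[cnt] & 0xf; nhi = Xi[cnt] >> 4;
#	        rem  = Z.lo & 0xf;
#	        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
#	        Z.hi = (Z.hi >> 4) ^ ((u64)rem_4bit[rem] << 48);
#	        Z.hi ^= Htable[nlo].hi;	/* fold in low nibble */
#	        Z.lo ^= Htable[nlo].lo;
#	    }
#	    /* store Z back into Xi[], big-endian */
#	}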
#
# October 2012
#
# Add a VIS3 lookup-table-free implementation using the polynomial-
# multiplication xmulx[hi] and extended-addition addxc[cc]
# instructions. 4.52x/7.63x improvement on T3/T4, or in absolute terms
# 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
# saturates at ~15.5x the single-process result on an 8-core processor,
# or ~20.5 GBps per 2.85GHz socket.
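#
# For reference, xmulx/xmulxhi return the low/high 64 bits of the
# 64x64-bit carry-less (polynomial) product of their operands, and
# addxc/addxccc add with the %xcc carry (addxccc also updating it).
# A bit-level C model of the multiplications (an illustrative
# definition, not how the hardware computes them):
#
#	u64 xmulx_lo(u64 a, u64 b)	/* low half of a·b over GF(2) */
#	{
#	    u64 r = 0;
#	    int i;
#	    for (i = 0; i < 64; i++)
#	        if ((b >> i) & 1) r ^= a << i;
#	    return r;
#	}
#	u64 xmulx_hi(u64 a, u64 b)	/* high half of a·b over GF(2) */
#	{
#	    u64 r = 0;
#	    int i;
#	    for (i = 1; i < 64; i++)
#	        if ((b >> i) & 1) r ^= a >> (64 - i);
#	    return r;
#	}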

$output=pop and open STDOUT,">$output";

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

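	! PIC trick: "call" falls through to the next insn while leaving
	! its own address (label 1) in %o7, so the delay-slot add below
	! yields the absolute address of rem_4bit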
1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# Straightforward 128x128-bit multiplication using the Karatsuba
# algorithm followed by a pair of 64-bit reductions [with a shortcut in
# the first one, which breaks the dependency between the reductions and
# removes one multiplication from the critical path]. While it might be
# suboptimal with regard to the sheer number of multiplications, other
# methods [such as aggregate reduction] would require more 64-bit
# registers, which we don't have in a 32-bit application context.

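# A minimal C sketch of the Karatsuba step implemented below, in terms
# of the xmulx_lo/xmulx_hi model from the header comment (three 64x64
# carry-less multiplications instead of four; p3:p2:p1:p0 denotes the
# 256-bit product of X and the twisted H):
#
#	u64 lo_l = xmulx_lo(Xlo,Hlo), lo_h = xmulx_hi(Xlo,Hlo);
#	u64 hi_l = xmulx_lo(Xhi,Hhi), hi_h = xmulx_hi(Xhi,Hhi);
#	u64 mx = Xlo^Xhi, mh = Hlo^Hhi;		/* pre-processing */
#	u64 m_l = xmulx_lo(mx,mh) ^ lo_l ^ hi_l;	/* post-processing */
#	u64 m_h = xmulx_hi(mx,mh) ^ lo_h ^ hi_h;
#	u64 p0 = lo_l, p1 = lo_h ^ m_l;
#	u64 p2 = hi_l ^ m_h, p3 = hi_h;
#
# The 256-bit product is then folded back to 128 bits with two
# multiplications by the reduction constant $xE1 = 0xE1<<57 (the
# xmulx/xmulxhi pairs below); the first fold takes its low bits
# [($C0·0xE1)&0x7f] from the precomputed $V byte table instead, which
# is the shortcut mentioned above.
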
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
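#
# In C terms the "twist" below shifts H left by one bit and, when the
# shifted-out bit is set, folds in the constant pair {0xE1<<57, 1} (a
# sketch, assuming u64 Hhi:Hlo hold H as loaded, s64 being signed):
#
#	u64 carry = (u64)((s64)Hhi >> 63);	/* broadcast top bit */
#	Hhi = (Hhi << 1) | (Hlo >> 63);		/* H <<= 1 */
#	Hlo <<= 1;
#	Hhi ^= carry & (u64)0xE1 << 57;
#	Hlo ^= carry & 1;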
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

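	! $inp may be unaligned: round it down to 8 bytes and derive the
	! left/right bit-shift counts used to realign the loaded words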
	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	 xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	 xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# The purpose of this subroutine is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary, letting the programmer detect at run-time whether
# the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
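
# For example, the line "xmulx	%o1,%o3,%o0" (rd=8, rs1=9, rs2=11,
# opf=0x115) comes out as:
#
#	.word	0x91b262ab !xmulx	%o1,%o3,%o0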

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";