xref: /openssl/crypto/modes/asm/ghash-c64xplus.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# December 2011
18#
19# The module implements GCM GHASH function and underlying single
20# multiplication operation in GF(2^128). Even though subroutines
21# have _4bit suffix, they are not using any tables, but rely on
22# hardware Galois Field Multiply support. Streamed GHASH processes
23# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
24# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
25# comparing apples vs. oranges, but compiler surely could have done
26# better, because theoretical [though not necessarily achievable]
27# estimate for "4-bit" table-driven implementation is ~12 cycles.
28
29$output = pop and open STDOUT,">$output";
30
31($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments
32
33($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
34			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
35($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
36			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
37($FF000000,$E10000)=("B30","B31");
38($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
39 $xia="A9";
40($rem,$res)=("B4","B5");		# $rem zaps $Htable
41
42$code.=<<___;
43	.text
44
45	.if	.ASSEMBLER_VERSION<7000000
46	.asg	0,__TI_EABI__
47	.endif
48	.if	__TI_EABI__
49	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
50	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
51	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
52	.endif
53
54	.asg	B3,RA
55
56	.if	0
57	.global	_gcm_gmult_1bit
58_gcm_gmult_1bit:
59	ADDAD	$Htable,2,$Htable
60	.endif
61	.global	_gcm_gmult_4bit
62_gcm_gmult_4bit:
63	.asmfunc
64	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
65	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
66||	MV	$Xip,${xip}		; reassign Xi
67||	MVK	15,B1			; SPLOOPD constant
68
69	MVK	0xE1,$E10000
70||	LDBU	*++${xip}[15],$x1	; Xi[15]
71	MVK	0xFF,$FF000000
72||	LDBU	*--${xip},$x0		; Xi[14]
73	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
74	SHL	$FF000000,24,$FF000000	; upper byte mask
75||	BNOP	ghash_loop?
76||	MVK	1,B0			; take a single spin
77
78	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
79	AND	$H2,$FF000000,$H2u	; H2's upper byte
80	AND	$H3,$FF000000,$H3u	; H3's upper byte
81||	SHRU	$H2u,8,$H2u
82	SHRU	$H3u,8,$H3u
83||	ZERO	$Z1:$Z0
84	SHRU2	$xia,8,$H01u
85||	ZERO	$Z3:$Z2
86	.endasmfunc
87
88	.global	_gcm_ghash_4bit
89_gcm_ghash_4bit:
90	.asmfunc
91	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
92||	SHRU	$len,4,B0		; reassign len
93	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
94||	MV	$Xip,${xip}		; reassign Xi
95||	MVK	15,B1			; SPLOOPD constant
96
97	MVK	0xE1,$E10000
98|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
99	MVK	0xFF,$FF000000
100|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
101	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
102||	LDDW	*${xip}[1],$Z1:$Z0
103	SHL	$FF000000,24,$FF000000	; upper byte mask
104||	LDDW	*${xip}[0],$Z3:$Z2
105
106	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
107	AND	$H2,$FF000000,$H2u	; H2's upper byte
108	AND	$H3,$FF000000,$H3u	; H3's upper byte
109||	SHRU	$H2u,8,$H2u
110	SHRU	$H3u,8,$H3u
111	SHRU2	$xia,8,$H01u
112
113|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
114|| [B0]	XOR	$H1x,$Z1,$Z1
115	.if	.LITTLE_ENDIAN
116   [B0]	XOR	$H2x,$Z2,$Z2
117|| [B0]	XOR	$H3x,$Z3,$Z3
118|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
119	STDW	$Z1:$Z0,*${xip}[1]
120|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
121|| [B0]	ZERO	$Z1:$Z0
122	.else
123   [B0]	XOR	$H2x,$Z2,$Z2
124|| [B0]	XOR	$H3x,$Z3,$Z3
125|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
126	STDW	$Z1:$Z0,*${xip}[1]
127|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
128|| [B0]	ZERO	$Z1:$Z0
129	.endif
130	STDW	$Z3:$Z2,*${xip}[0]
131|| [B0]	ZERO	$Z3:$Z2
132|| [B0]	MV	$xia,$x1
133   [B0]	ADDK	14,${xip}
134
135ghash_loop?:
136	SPLOOPD	6			; 6*16+7
137||	MVC	B1,ILC
138|| [B0]	SUB	B0,1,B0
139||	ZERO	A0
140||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
141||	SHL	$x1,1,$xia
142___
143
144########____________________________
145#  0    D2.     M1          M2      |
146#  1            M1                  |
147#  2            M1          M2      |
148#  3        D1. M1          M2      |
149#  4        S1. L1                  |
150#  5    S2  S1x L1          D2  L2  |____________________________
151#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
152#  7/1          L1  S1  D1x S2  M2  |        M1                  |
153#  8/2              S1  L1x S2      |        M1          M2      |
154#  9/3              S1  L1x         |    D1. M1          M2      |
155# 10/4                  D1x         |    S1. L1                  |
156# 11/5                              |S2  S1x L1          D2  L2  |____________
157# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
158#    7/1                                     L1  S1  D1x S2  M2  |        ....
159#    8/2                                         S1  L1x S2      |        ....
160#####...                                         ................|............
161$code.=<<___;
162	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
163||	XORMPY	$H01u,$xib,$H01y
164|| [A0]	LDBU	*--${xip},$x0
165	XORMPY	$H1,$xia,$H1x		; 1
166	XORMPY	$H2,$xia,$H2x		; 2
167||	XORMPY	$H2u,$xib,$H2y
168	XORMPY	$H3,$xia,$H3x		; 3
169||	XORMPY	$H3u,$xib,$H3y
170||[!A0]	MVK.D	15,A0				; *--${xip} counter
171	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
172|| [A0]	SUB.S	A0,1,A0
173	XOR.L	$H1x,$Z1,$Z1		; 5
174||	AND.D	$H01y,$FF000000,$H0z
175||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
176||	SHL	$x0,1,$xib
177||	SHL	$x0,1,$xia
178
179	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
180||	SHL	$Z0,1,$rem		;	; rem=Z<<1
181||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
182||	AND.L	$H1y,$FF000000,$H1z
183	XOR.L	$H3x,$Z3,$Z3		; 7/1
184||	SHRMB.S	$Z2,$Z1,$Z1
185||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
186||	AND.S	$H2y,$FF000000,$H2z
187||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
188	XOR.L	$H1z,$Z1,$Z1		; 8/2
189||	SHRMB.S	$Z3,$Z2,$Z2
190||	AND.S	$H3y,$FF000000,$H3z
191	XOR.L	$H2z,$Z2,$Z2		; 9/3
192||	SHRU	$Z3,8,$Z3
193	XOR.D	$H3z,$Z3,$Z3		; 10/4
194	NOP				; 11/5
195
196	SPKERNEL 0,2
197||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res
198
199	; input pre-fetch is possible where D1 slot is available...
200   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
201   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
202	NOP				; 10/-
203	.if	.LITTLE_ENDIAN
204	SWAP2	$Z0,$Z1			; 11/-
205||	SWAP4	$Z1,$Z0
206	SWAP4	$Z1,$Z1			; 12/-
207||	SWAP2	$Z0,$Z0
208	SWAP2	$Z2,$Z3
209||	SWAP4	$Z3,$Z2
210||[!B0]	BNOP	RA
211	SWAP4	$Z3,$Z3
212||	SWAP2	$Z2,$Z2
213|| [B0]	BNOP	ghash_loop?
214   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
215|| [B0]	XOR	$H1x,$Z1,$Z1
216   [B0]	XOR	$H2x,$Z2,$Z2
217|| [B0]	XOR	$H3x,$Z3,$Z3
218|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
219	STDW	$Z1:$Z0,*${xip}[1]
220|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
221|| [B0]	ZERO	$Z1:$Z0
222	.else
223  [!B0]	BNOP	RA			; 11/-
224   [B0]	BNOP	ghash_loop?		; 12/-
225   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
226|| [B0]	XOR	$H1x,$Z1,$Z1
227   [B0]	XOR	$H2x,$Z2,$Z2
228|| [B0]	XOR	$H3x,$Z3,$Z3
229|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
230	STDW	$Z1:$Z0,*${xip}[1]
231|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
232|| [B0]	ZERO	$Z1:$Z0
233	.endif
234	STDW	$Z3:$Z2,*${xip}[0]
235|| [B0]	ZERO	$Z3:$Z2
236|| [B0]	MV	$xia,$x1
237   [B0]	ADDK	14,${xip}
238	.endasmfunc
239
240	.sect	.const
241	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
242	.align	4
243___
244
245print $code;
246close STDOUT or die "error closing STDOUT: $!";
247