#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my @regs = map("x$_",(0..31));
my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
    map("a$_",(0..7)),
    map("s$_",(2..11)),
    map("t$_",(3..6))
);

my %reglookup;
@reglookup{@regs} = @regs;
@reglookup{@regaliases} = @regs;

# Takes a register name, possibly an alias, and converts it to a register index
# from 0 to 31
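# e.g. read_reg("t0") and read_reg("x5") both yield 5, since t0 is the ABI
# name for x5.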
sub read_reg {
    my $reg = lc shift;
    if (!exists($reglookup{$reg})) {
        die("Unknown register ".$reg);
    }
    my $regstr = $reglookup{$reg};
    if (!($regstr =~ /^x([0-9]+)$/)) {
        die("Could not process register ".$reg);
    }
    return $1;
}

sub rv64_rev8 {
    # Encoding for rev8 rd, rs instruction on RV64
    #                XXXXXXXXXXXX_ rs  _XXX_ rd  _XXXXXXX
    my $template = 0b011010111000_00000_101_00000_0010011;
    my $rd = read_reg shift;
    my $rs = read_reg shift;

    return ".word ".($template | ($rs << 15) | ($rd << 7));
}
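# e.g. rv64_rev8("t0","t1") should encode "rev8 t0, t1" as 0x6b835293
# (emitted as a decimal .word).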

sub rv64_clmul {
    # Encoding for clmul rd, rs1, rs2 instruction on RV64
    #                XXXXXXX_ rs2 _ rs1 _XXX_ rd  _XXXXXXX
    my $template = 0b0000101_00000_00000_001_00000_0110011;
    my $rd = read_reg shift;
    my $rs1 = read_reg shift;
    my $rs2 = read_reg shift;

    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
}
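# e.g. rv64_clmul("t0","t1","t2") should encode "clmul t0, t1, t2" as
# 0x0a7312b3; rv64_clmulh below differs only in the funct3 field (011 vs 001).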

sub rv64_clmulh {
    # Encoding for clmulh rd, rs1, rs2 instruction on RV64
    #                XXXXXXX_ rs2 _ rs1 _XXX_ rd  _XXXXXXX
    my $template = 0b0000101_00000_00000_011_00000_0110011;
    my $rd = read_reg shift;
    my $rs1 = read_reg shift;
    my $rs2 = read_reg shift;

    return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
}

################################################################################
# gcm_init_clmul_rv64i_zbb_zbc(u128 Htable[16], const u64 Xi[2])
# Initialization function for clmul-based implementation of GMULT
# This function is used in tandem with gcm_gmult_clmul_rv64i_zbb_zbc
################################################################################
{
my ($Haddr,$Xi,$TEMP) = ("a0","a1","a2");

$code .= <<___;
.text
.balign 16
.globl gcm_init_clmul_rv64i_zbb_zbc
.type gcm_init_clmul_rv64i_zbb_zbc,\@function
# Initialize the clmul-based Galois field multiplication routine.
# gcm_init_clmul_rv64i_zbb_zbc(ctx->Htable, ctx->H.u)
gcm_init_clmul_rv64i_zbb_zbc:
    # argument 0 = ctx->Htable (store H here)
    # argument 1 = H.u[] (2x 64-bit words) [H_high64, H_low64]

    # Simply store [H_high64, H_low64] for later
    ld      $TEMP,0($Xi)
    sd      $TEMP,0($Haddr)
    ld      $TEMP,8($Xi)
    sd      $TEMP,8($Haddr)

    ret

___

}

################################################################################
# gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
# Compute GMULT (X*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
# extensions, and the Modified Barrett Reduction technique
################################################################################
{
my ($Xi,$Haddr,$A1,$A0,$B1,$B0,$C1,$C0,$D1,$D0,$E1,$E0,$TEMP,$TEMP2,$qp_low) =
 ("a0","a1","a2","a3","a4","a5","a6","a7","t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.text
.balign 16
.globl gcm_gmult_clmul_rv64i_zbb_zbc
.type gcm_gmult_clmul_rv64i_zbb_zbc,\@function
# static void gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
# Computes product of X*H mod f
gcm_gmult_clmul_rv64i_zbb_zbc:

    # Load X and H (H is saved previously in gcm_init_clmul_rv64i_zbb_zbc)
    ld              $A1,0($Xi)
    ld              $A0,8($Xi)

    ld              $B1,0($Haddr)
    ld              $B0,8($Haddr)

    li              $qp_low,0xe100000000000000
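    # qp_low is the upper half of the bit-reflected GHASH polynomial
    # x^128 + x^7 + x^2 + x + 1; the same constant is used in both
    # carry-less multiplication steps of the Barrett reduction below.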

    # Perform Karatsuba multiplication to generate a 255-bit intermediate
    # A = [A1:A0]
    # B = [B1:B0]
    # Let:
    # [C1:C0] = A1*B1
    # [D1:D0] = A0*B0
    # [E1:E0] = (A0+A1)*(B0+B1)
    # Then:
    # A*B = [C1:C0+C1+D1+E1:D1+C0+D0+E0:D0]
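    # Xi is processed in byte-reversed form: each half is byte-swapped with
    # rev8 before the carry-less multiplies and swapped back before the final
    # store, while H is used exactly as stored by gcm_init_clmul_rv64i_zbb_zbc.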

    @{[rv64_rev8    $A1,$A1]}
    @{[rv64_clmul   $C0,$A1,$B1]}
    @{[rv64_clmulh  $C1,$A1,$B1]}

    @{[rv64_rev8    $A0,$A0]}
    @{[rv64_clmul   $D0,$A0,$B0]}
    @{[rv64_clmulh  $D1,$A0,$B0]}

    xor             $TEMP,$A0,$A1
    xor             $TEMP2,$B0,$B1

    @{[rv64_clmul   $E0,$TEMP,$TEMP2]}
    @{[rv64_clmulh  $E1,$TEMP,$TEMP2]}

    # 0th term is just C1

    # Construct term 1 in E1 (E1 only appears in dword 1)
    xor             $E1,$E1,$D1
    xor             $E1,$E1,$C1
    xor             $E1,$E1,$C0

    # Term 1 is E1

    # Construct term 2 in E0 (E0 only appears in dword 2)
    xor             $E0,$E0,$D0
    xor             $E0,$E0,$C0
    xor             $E0,$E0,$D1

    # Term 2 is E0

    # final term is just D0

    # X*H is now stored in [C1,E1,E0,D0]

    # Left-justify
    slli            $C1,$C1,1
    # Or in the high bit of E1
    srli            $TEMP,$E1,63
    or              $C1,$C1,$TEMP

    slli            $E1,$E1,1
    # Or in the high bit of E0
    srli            $TEMP2,$E0,63
    or              $E1,$E1,$TEMP2

    slli            $E0,$E0,1
    # Or in the high bit of D0
    srli            $TEMP,$D0,63
    or              $E0,$E0,$TEMP

    slli            $D0,$D0,1

    # Barrett Reduction
    # c = [E0, D0]
    # We want the top 128 bits of the result of c*f
    # We'll get this by computing the low-half (most significant 128 bits in
    # the reflected domain) of clmul(c,fs)<<1 first, then
    # xor in c to complete the calculation

    # AA = [AA1:AA0] = [E0,D0] = c
    # BB = [BB1:BB0] = [qp_low,0]
    # [CC1:CC0] = AA1*BB1
    # [DD1:DD0] = AA0*BB0
    # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
    # Then:
    # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
    # We only need CC0,DD1,DD0,EE0 to compute the low 128 bits of c * qp_low
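    # Since BB0 = 0, the DD products vanish and (BB0+BB1) = BB1, so only two
    # carry-less multiplies are required.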
___

my ($CC0,$EE0,$AA1,$AA0,$BB1) = ($A0,$B1,$E0,$D0,$qp_low);

$code .= <<___;

    @{[rv64_clmul   $CC0,$AA1,$BB1]}
    #clmul          DD0,AA0,BB0     # BB0 is 0, so DD0 = 0
    #clmulh         DD1,AA0,BB0     # BB0 is 0, so DD1 = 0
    xor             $TEMP,$AA0,$AA1
    #xor            TEMP2,BB0,BB1   # TEMP2 = BB1 = qp_low
    @{[rv64_clmul   $EE0,$TEMP,$BB1]}

    # Result is [N/A:N/A:DD1+CC0+DD0+EE0:DD0]
    # Simplifying: [CC0+EE0:0]
    xor             $TEMP2,$CC0,$EE0
    # Shift left by 1 to correct for bit reflection
    slli            $TEMP2,$TEMP2,1

    # xor into c = [E0,D0]
    # Note that only E0 is affected
    xor             $E0,$E0,$TEMP2

    # Now, q = [E0,D0]

    # The final step is to compute clmul(q,[qp_low:0])<<1
    # The leftmost 128 bits are the reduced result.
    # Once again, we use Karatsuba multiplication, but many of the terms
    # simplify or cancel out.
    # AA = [AA1:AA0] = [E0,D0] = q
    # BB = [BB1:BB0] = [qp_low,0]
    # [CC1:CC0] = AA1*BB1
    # [DD1:DD0] = AA0*BB0
    # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
    # Then:
    # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
    # We need CC1,CC0,DD0,DD1,EE1,EE0 to compute the leftmost 128 bits of AA*BB
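    # As before, BB0 = 0, so the DD products are zero and (BB0+BB1) = BB1;
    # only the CC and EE products are computed below.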

___

my ($AA1,$AA0,$BB1,$CC1,$CC0,$EE1,$EE0) = ($E0,$D0,$qp_low,$A0,$A1,$C0,$B0);

$code .= <<___;

    @{[rv64_clmul   $CC0,$AA1,$BB1]}
    @{[rv64_clmulh  $CC1,$AA1,$BB1]}

    #clmul          DD0,AA0,BB0   # BB0 = 0 so DD0 = 0
    #clmulh         DD1,AA0,BB0   # BB0 = 0 so DD1 = 0

    xor             $TEMP,$AA0,$AA1
    #xor            TEMP2,BB0,BB1 # BB0 = 0 so TEMP2 == BB1 == qp_low

    @{[rv64_clmul   $EE0,$TEMP,$BB1]}
    @{[rv64_clmulh  $EE1,$TEMP,$BB1]}

    # Need the DD1+CC0+DD0+EE0 term to shift its leftmost bit into the
    # intermediate result.
    # This is just CC0+EE0, store it in TEMP
    xor             $TEMP,$CC0,$EE0

    # Result is [CC1:CC0+CC1+EE1:(a single bit)]<<1
    # Combine into [CC1:CC0]
    xor             $CC0,$CC0,$CC1
    xor             $CC0,$CC0,$EE1

    # Shift 128-bit quantity, xor in [C1,E1] and store
    slli            $CC1,$CC1,1
    srli            $TEMP2,$CC0,63
    or              $CC1,$CC1,$TEMP2
    # xor in C1
    xor             $CC1,$CC1,$C1
    @{[rv64_rev8    $CC1,$CC1]}

    slli            $CC0,$CC0,1
    srli            $TEMP,$TEMP,63
    or              $CC0,$CC0,$TEMP
    # xor in E1
    xor             $CC0,$CC0,$E1
    @{[rv64_rev8    $CC0,$CC0]}
    sd              $CC1,0(a0)
    sd              $CC0,8(a0)

    ret
___

}

print $code;

close STDOUT or die "error closing STDOUT: $!";