#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file requires the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input:	H: 128-bit H - secret parameter E(K, 0^128)
# output:	Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
#                       gcm_ghash_rv64i_zvkb_zvbc
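#
# The preprocessing below multiplies H by x, i.e. it shifts the 128-bit
# value left by one bit and, if a bit was shifted out, reduces the result
# by XORing in the constant stored at Lpolymod. The multiplication
# routines below rely on this shifted representation of H.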
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $H, $H, 8
    li $TMP0, -8
    li $TMP1, 63
    la $TMP2, Lpolymod

    @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v  $V1, $H, $TMP0]}    # vlse64.v v1, (a1), t0
    @{[vle64_v $V2, $TMP2]}          # vle64.v v2, (t2)

    # Shift left by one and get the carry bits.
    @{[vsrl_vx $V3, $V1, $TMP1]}     # vsrl.vx v3, v1, t1
    @{[vsll_vi $V1, $V1, 1]}         # vsll.vi v1, v1, 1

    # Use the fact that the polynomial degree is no more than 128,
    # i.e. only the LSB of the upper half could be set.
    # Thanks to this we don't need to do the full reduction here.
    # Instead simply subtract the reduction polynomial.
    # This idea was taken from the x86 ghash implementation in OpenSSL.
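    # Concretely: only the bit shifted out of the 128-bit value can trigger
    # a reduction, and then it is enough to XOR the Lpolymod constant into
    # the shifted result.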
    @{[vslideup_vi $V4, $V3, 1]}     # vslideup.vi v4, v3, 1
    @{[vslidedown_vi $V3, $V3, 1]}   # vslidedown.vi v3, v3, 1

    @{[vmv_v_i $V0, 2]}              # vmv.v.i v0, 2
    @{[vor_vv_v0t $V1, $V1, $V4]}    # vor.vv v1, v1, v4, v0.t

    # The mask needs to be set to 3 if the carry bit is set.
    @{[vmv_v_v $V0, $V3]}            # vmv.v.v v0, v3
    @{[vmv_v_i $V3, 0]}              # vmv.v.i v3, 0
    @{[vmerge_vim $V3, $V3, 3]}      # vmerge.vim v3, v3, 3, v0
    @{[vmv_v_v $V0, $V3]}            # vmv.v.v v0, v3

    @{[vxor_vv_v0t $V1, $V1, $V2]}   # vxor.vv v1, v1, v2, v0.t

    @{[vse64_v $V1, $Htable]}        # vse64.v v1, (a0)
    ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input:	Xi: current hash value
#		Htable: preprocessed H
# output:	Xi: next hash value Xi = (Xi * H mod f)
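#
# The multiplication is carry-less over GF(2^128); f is the GCM polynomial
# x^128 + x^7 + x^2 + x + 1.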
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $Xi, $Xi, 8
    li $TMP4, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $TMP4]}    # vlse64.v v5, (a0), t4
    @{[vrev8_v $V5, $V5]}            # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l
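    #
    # This is the carry-less schoolbook expansion
    #   A*B = a1b1*2^128 + (a1b0 + a0b1)*2^64 + a0b0,
    # where each 64x64 product is 128 bits and (..)l/(..)h denote its low
    # and high 64-bit halves.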

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}   # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}  # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}   # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}  # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    @{[vslideup_vi $V5, $V3, 1]}     # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}     # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}   # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}   # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}              # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}   # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}   # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}              # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}   # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}   # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.
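    #
    # P below denotes the upper 64 bits of the Lpolymod constant
    # (0xc200000000000000), which was loaded into t3 above.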

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2
    @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}   # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is the final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0
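    #
    # The masked vclmul below computes only (c1 * P)l, which is then slid
    # down into the low element; a single unmasked vclmulh produces both
    # (c1 * P)h and (c0 * P)h.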

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}  # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}              # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}   # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}       # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}   # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}       # vxor.vv v2, v2, v1

    @{[vrev8_v $V2, $V2]}            # vrev8.v v2, v2
    @{[vsse64_v $V2, $Xi, $TMP4]}    # vsse64.v v2, (a0), t4
    ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input:	Xi: current hash value
#		Htable: preprocessed H
#		inp: pointer to input data
#		len: length of input data in bytes (multiple of block size)
# output:	Xi: next hash value
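#
# The input is processed in 16-byte blocks: for each block B the code
# computes Xi = (Xi xor B) * H, using the same multiplication and
# reduction steps as gcm_gmult_rv64i_zvkb_zvbc.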
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $Xi, $Xi, 8
    add $inp, $inp, 8
    li $M8, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $M8]}      # vlse64.v v5, (a0), t4

Lstep:
    # Read input data
    @{[vlse64_v $Vinp, $inp, $M8]}   # vlse64.v v7, (a2), t4
    add $inp, $inp, 16
    add $len, $len, -16
    # XOR it into Xi
    @{[vxor_vv $V5, $V5, $Vinp]}     # vxor.vv v5, v5, v7

    @{[vrev8_v $V5, $V5]}            # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l
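    #
    # This is the carry-less schoolbook expansion
    #   A*B = a1b1*2^128 + (a1b0 + a0b1)*2^64 + a0b0,
    # where each 64x64 product is 128 bits and (..)l/(..)h denote its low
    # and high 64-bit halves.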

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}   # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}  # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}   # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}  # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    @{[vslideup_vi $V5, $V3, 1]}     # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}     # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}   # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}   # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}              # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}   # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}   # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}              # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}   # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}   # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.
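    #
    # P below denotes the upper 64 bits of the Lpolymod constant
    # (0xc200000000000000), which was loaded into t3 above.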

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2
    @{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}   # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is the final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0
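    #
    # The masked vclmul below computes only (c1 * P)l, which is then slid
    # down into the low element; a single unmasked vclmulh produces both
    # (c1 * P)h and (c0 * P)h.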

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}  # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}              # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}   # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}       # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}   # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}       # vxor.vv v2, v2, v1

    @{[vrev8_v $V5, $V2]}            # vrev8.v v5, v2

    bnez $len, Lstep

    @{[vsse64_v $V5, $Xi, $M8]}      # vsse64.v v5, (a0), t4
    ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}

$code .= <<___;
.p2align 4
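# The reduction constant used above (referred to as P in the comments):
# low dword 0x0000000000000001, high dword 0xc200000000000000.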
Lpolymod:
        .dword 0x0000000000000001
        .dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";