#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
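# (e.g. a hypothetical invocation "ghash-riscv64.pl linux64 ghash-riscv64.S"
# would shift "linux64" into $flavour and pop "ghash-riscv64.S" into $output)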
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
# void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
#
# input:  H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
#                 gcm_ghash_rv64i_zbc*
#
# All callers of this function reverse the byte order unconditionally
# on little-endian machines, so we need to reverse it back.
# Additionally, we reverse the bits of each byte.
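#
# Note: reversing the bits within each byte and then reversing the byte
# order of a 64-bit doubleword is equivalent to reversing all 64 bits of
# that doubleword, so each half of Htable holds the full 64-bit bit
# reversal of the corresponding half of H, which is the form the
# clmul-based gcm_gmult/gcm_ghash routines below expect ("already
# bit-reversed").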

{
my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc
.type gcm_init_rv64i_zbc,\@function
gcm_init_rv64i_zbc:
    ld      $VAL0,0($H)
    ld      $VAL1,8($H)
    @{[brev8_rv64i   $VAL0, $TMP0, $TMP1, $TMP2]}
    @{[brev8_rv64i   $VAL1, $TMP0, $TMP1, $TMP2]}
    @{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
    @{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
    ret
.size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
___
}

{
my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc__zbb
.type gcm_init_rv64i_zbc__zbb,\@function
gcm_init_rv64i_zbc__zbb:
    ld      $VAL0,0($H)
    ld      $VAL1,8($H)
    @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
    @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
    @{[rev8 $VAL0, $VAL0]}
    @{[rev8 $VAL1, $VAL1]}
    sd      $VAL0,0($Htable)
    sd      $VAL1,8($Htable)
    ret
.size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
___
}

{
my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zbc__zbkb
.type gcm_init_rv64i_zbc__zbkb,\@function
gcm_init_rv64i_zbc__zbkb:
    ld      $TMP0,0($H)
    ld      $TMP1,8($H)
    @{[brev8 $TMP0, $TMP0]}
    @{[brev8 $TMP1, $TMP1]}
    @{[rev8 $TMP0, $TMP0]}
    @{[rev8 $TMP1, $TMP1]}
    sd      $TMP0,0($Htable)
    sd      $TMP1,8($Htable)
    ret
.size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
___
}

################################################################################
# void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
# void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
#
# input:  Xi: current hash value
#         Htable: preprocessed H (see gcm_init_rv64i_zbc*)
# output: Xi: next hash value
#
# Compute GMULT (Xi*H mod f) using the Zbc (clmul) and Zbb (basic bit
# manipulation) extensions. We use the schoolbook (no-Karatsuba) approach
# and clmul for the final reduction, which minimizes the instruction count.
# HW with clmul latencies higher than 2 cycles might observe a performance
# improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
# might observe a performance improvement with additionally converting the
# reduction to shift&xor. For a full discussion of these estimates see
# https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
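#
# Rough sketch of the computation below, with Xi = x1:x0 and H = y1:y0
# split into 64-bit halves and '*' denoting carry-less multiplication:
#
#   z3:z2  = x1 * y1
#   z1:z0  = x0 * y0
#   z2:z1 ^= x0 * y1 ^ x1 * y0
#
# This yields the 256-bit product z3:z2:z1:z0. The upper 128 bits (z3:z2)
# are then folded back into the lower 128 bits with two clmulh/clmul pairs
# against the constant 0x87, which encodes the low-order terms
# x^7 + x^2 + x + 1 of the GHASH polynomial x^128 + x^7 + x^2 + x + 1.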
{
my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zbc
.type gcm_gmult_rv64i_zbc,\@function
gcm_gmult_rv64i_zbc:
    # Load Xi and bit-reverse it
    ld        $x0, 0($Xi)
    ld        $x1, 8($Xi)
    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
    @{[brev8_rv64i $x1, $z0, $z1, $z2]}

    # Load the key (already bit-reversed)
    ld        $y0, 0($Htable)
    ld        $y1, 8($Htable)

    # Load the reduction constant
    la        $polymod, Lpolymod
    lbu       $polymod, 0($polymod)

    # Multiplication (without Karatsuba)
    @{[clmulh $z3, $x1, $y1]}
    @{[clmul  $z2, $x1, $y1]}
    @{[clmulh $t1, $x0, $y1]}
    @{[clmul  $z1, $x0, $y1]}
    xor       $z2, $z2, $t1
    @{[clmulh $t1, $x1, $y0]}
    @{[clmul  $t0, $x1, $y0]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $x0, $y0]}
    @{[clmul  $z0, $x0, $y0]}
    xor       $z1, $z1, $t1

    # Reduction with clmul
    @{[clmulh $t1, $z3, $polymod]}
    @{[clmul  $t0, $z3, $polymod]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $z2, $polymod]}
    @{[clmul  $t0, $z2, $polymod]}
    xor       $x1, $z1, $t1
    xor       $x0, $z0, $t0

    # Bit-reverse Xi back and store it
    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
    @{[brev8_rv64i $x1, $z0, $z1, $z2]}
    sd        $x0, 0($Xi)
    sd        $x1, 8($Xi)
    ret
.size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
___
}

{
my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.p2align 3
.globl gcm_gmult_rv64i_zbc__zbkb
.type gcm_gmult_rv64i_zbc__zbkb,\@function
gcm_gmult_rv64i_zbc__zbkb:
    # Load Xi and bit-reverse it
    ld        $x0, 0($Xi)
    ld        $x1, 8($Xi)
    @{[brev8  $x0, $x0]}
    @{[brev8  $x1, $x1]}

    # Load the key (already bit-reversed)
    ld        $y0, 0($Htable)
    ld        $y1, 8($Htable)

    # Load the reduction constant
    la        $polymod, Lpolymod
    lbu       $polymod, 0($polymod)

    # Multiplication (without Karatsuba)
    @{[clmulh $z3, $x1, $y1]}
    @{[clmul  $z2, $x1, $y1]}
    @{[clmulh $t1, $x0, $y1]}
    @{[clmul  $z1, $x0, $y1]}
    xor       $z2, $z2, $t1
    @{[clmulh $t1, $x1, $y0]}
    @{[clmul  $t0, $x1, $y0]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $x0, $y0]}
    @{[clmul  $z0, $x0, $y0]}
    xor       $z1, $z1, $t1

    # Reduction with clmul
    @{[clmulh $t1, $z3, $polymod]}
    @{[clmul  $t0, $z3, $polymod]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $z2, $polymod]}
    @{[clmul  $t0, $z2, $polymod]}
    xor       $x1, $z1, $t1
    xor       $x0, $z0, $t0

    # Bit-reverse Xi back and store it
    @{[brev8  $x0, $x0]}
    @{[brev8  $x1, $x1]}
    sd        $x0, 0($Xi)
    sd        $x1, 8($Xi)
    ret
.size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
___
}

################################################################################
# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
#                          const u8 *inp, size_t len);
# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input:  Xi: current hash value
#         Htable: preprocessed H (see gcm_init_rv64i_zbc*)
#         inp: pointer to input data
#         len: length of input data in bytes (multiple of block size)
# output: Xi: next hash value (after processing all input blocks)
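#
# Each 16-byte block is folded into the running hash as
#   Xi = (Xi xor block) * H
# using the same carry-less multiplication and reduction as gcm_gmult
# above. Note that the loop below is a do-while, so it expects len to be
# a non-zero multiple of 16.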
{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc
.type gcm_ghash_rv64i_zbc,\@function
gcm_ghash_rv64i_zbc:
    # Load Xi and bit-reverse it
    ld        $x0, 0($Xi)
    ld        $x1, 8($Xi)
    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
    @{[brev8_rv64i $x1, $z0, $z1, $z2]}

    # Load the key (already bit-reversed)
    ld        $y0, 0($Htable)
    ld        $y1, 8($Htable)

    # Load the reduction constant
    la        $polymod, Lpolymod
    lbu       $polymod, 0($polymod)

Lstep:
    # Load the input data, bit-reverse them, and XOR them with Xi
    ld        $t0, 0($inp)
    ld        $t1, 8($inp)
    add       $inp, $inp, 16
    add       $len, $len, -16
    @{[brev8_rv64i $t0, $z0, $z1, $z2]}
    @{[brev8_rv64i $t1, $z0, $z1, $z2]}
    xor       $x0, $x0, $t0
    xor       $x1, $x1, $t1

    # Multiplication (without Karatsuba)
    @{[clmulh $z3, $x1, $y1]}
    @{[clmul  $z2, $x1, $y1]}
    @{[clmulh $t1, $x0, $y1]}
    @{[clmul  $z1, $x0, $y1]}
    xor       $z2, $z2, $t1
    @{[clmulh $t1, $x1, $y0]}
    @{[clmul  $t0, $x1, $y0]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $x0, $y0]}
    @{[clmul  $z0, $x0, $y0]}
    xor       $z1, $z1, $t1

    # Reduction with clmul
    @{[clmulh $t1, $z3, $polymod]}
    @{[clmul  $t0, $z3, $polymod]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $z2, $polymod]}
    @{[clmul  $t0, $z2, $polymod]}
    xor       $x1, $z1, $t1
    xor       $x0, $z0, $t0

    # Iterate over all blocks
    bnez      $len, Lstep

    # Bit-reverse final Xi back and store it
    @{[brev8_rv64i $x0, $z0, $z1, $z2]}
    @{[brev8_rv64i $x1, $z0, $z1, $z2]}
    sd        $x0, 0($Xi)
    sd        $x1, 8($Xi)
    ret
.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
___
}

{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc__zbkb
.type gcm_ghash_rv64i_zbc__zbkb,\@function
gcm_ghash_rv64i_zbc__zbkb:
    # Load Xi and bit-reverse it
    ld        $x0, 0($Xi)
    ld        $x1, 8($Xi)
    @{[brev8  $x0, $x0]}
    @{[brev8  $x1, $x1]}

    # Load the key (already bit-reversed)
    ld        $y0, 0($Htable)
    ld        $y1, 8($Htable)

    # Load the reduction constant
    la        $polymod, Lpolymod
    lbu       $polymod, 0($polymod)

Lstep_zbkb:
    # Load the input data, bit-reverse them, and XOR them with Xi
    ld        $t0, 0($inp)
    ld        $t1, 8($inp)
    add       $inp, $inp, 16
    add       $len, $len, -16
    @{[brev8  $t0, $t0]}
    @{[brev8  $t1, $t1]}
    xor       $x0, $x0, $t0
    xor       $x1, $x1, $t1

    # Multiplication (without Karatsuba)
    @{[clmulh $z3, $x1, $y1]}
    @{[clmul  $z2, $x1, $y1]}
    @{[clmulh $t1, $x0, $y1]}
    @{[clmul  $z1, $x0, $y1]}
    xor       $z2, $z2, $t1
    @{[clmulh $t1, $x1, $y0]}
    @{[clmul  $t0, $x1, $y0]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $x0, $y0]}
    @{[clmul  $z0, $x0, $y0]}
    xor       $z1, $z1, $t1

    # Reduction with clmul
    @{[clmulh $t1, $z3, $polymod]}
    @{[clmul  $t0, $z3, $polymod]}
    xor       $z2, $z2, $t1
    xor       $z1, $z1, $t0
    @{[clmulh $t1, $z2, $polymod]}
    @{[clmul  $t0, $z2, $polymod]}
    xor       $x1, $z1, $t1
    xor       $x0, $z0, $t0

    # Iterate over all blocks
    bnez      $len, Lstep_zbkb

    # Bit-reverse final Xi back and store it
    @{[brev8  $x0, $x0]}
    @{[brev8  $x1, $x1]}
    sd        $x0, 0($Xi)
    sd        $x1, 8($Xi)
    ret
.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
___
}

$code .= <<___;
.p2align 3
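# Mask constants (alternating bits, bit pairs, nibbles) for the byte-wise
# bit reversal, presumably consumed by the brev8_rv64i helper in riscv.pm.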
Lbrev8_const:
    .dword  0xAAAAAAAAAAAAAAAA
    .dword  0xCCCCCCCCCCCCCCCC
    .dword  0xF0F0F0F0F0F0F0F0
.size Lbrev8_const,.-Lbrev8_const

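# Reduction constant: 0x87 encodes x^7 + x^2 + x + 1, the low-order terms
# of the GHASH polynomial x^128 + x^7 + x^2 + x + 1.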
Lpolymod:
    .byte 0x87
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";