#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Basic Bit-manipulation extension ('Zbb')
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)
# Optional:
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

my $use_zvkb = $flavour && $flavour =~ /zvkb/i ? 1 : 0;
my $isaext = "_v_zbb" . ( $use_zvkb ? "_zvkb" : "" );
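# The exported symbol carries the ISA suffix: ChaCha20_ctr32_v_zbb, or
# ChaCha20_ctr32_v_zbb_zvkb when a zvkb flavour is requested.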

$output and open STDOUT, ">$output";

my $code = <<___;
.text
___

# void ChaCha20_ctr32@{[$isaext]}(unsigned char *out, const unsigned char *inp,
#                                 size_t len, const unsigned int key[8],
#                                 const unsigned int counter[4]);
################################################################################
my ( $OUTPUT, $INPUT, $LEN, $KEY, $COUNTER ) = ( "a0", "a1", "a2", "a3", "a4" );
my ( $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3 ) = ( "a5", "a6",
  "a7", "s0" );
my ( $KEY0, $KEY1, $KEY2, $KEY3, $KEY4, $KEY5, $KEY6, $KEY7, $COUNTER0,
     $COUNTER1, $NONCE0, $NONCE1) = ( "s1", "s2", "s3", "s4", "s5", "s6", "s7",
  "s8", "s9", "s10", "s11", "t0" );
my ( $STATE0, $STATE1, $STATE2, $STATE3,
     $STATE4, $STATE5, $STATE6, $STATE7,
     $STATE8, $STATE9, $STATE10, $STATE11,
     $STATE12, $STATE13, $STATE14, $STATE15) = (
     $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3,
     $KEY0, $KEY1, $KEY2, $KEY3,
     $KEY4, $KEY5, $KEY6, $KEY7,
     $COUNTER0, $COUNTER1, $NONCE0, $NONCE1 );
my ( $VL ) = ( "t1" );
my ( $CURRENT_COUNTER ) = ( "t2" );
my ( $T0 ) = ( "t3" );
my ( $T1 ) = ( "t4" );
my ( $T2 ) = ( "t5" );
my ( $T3 ) = ( "t6" );
my (
    $V0,  $V1,  $V2,  $V3,  $V4,  $V5,  $V6,  $V7,  $V8,  $V9,  $V10,
    $V11, $V12, $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21,
    $V22, $V23, $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map( "v$_", ( 0 .. 31 ) );

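# The vector unit computes $VL blocks in parallel while the scalar ALU
# computes one extra block in $STATE0~$STATE15, which alias the
# const/key/counter/nonce registers above.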
sub chacha_sub_round {
    my (
        $A0, $B0, $C0,
        $A1, $B1, $C1,
        $A2, $B2, $C2,
        $A3, $B3, $C3,

        $S_A0, $S_B0, $S_C0,
        $S_A1, $S_B1, $S_C1,
        $S_A2, $S_B2, $S_C2,
        $S_A3, $S_B3, $S_C3,

        $ROL_SHIFT,

        $V_T0, $V_T1, $V_T2, $V_T3,
    ) = @_;

    # a += b; c ^= a;
    my $code = <<___;
    @{[vadd_vv $A0, $A0, $B0]}
    add $S_A0, $S_A0, $S_B0
    @{[vadd_vv $A1, $A1, $B1]}
    add $S_A1, $S_A1, $S_B1
    @{[vadd_vv $A2, $A2, $B2]}
    add $S_A2, $S_A2, $S_B2
    @{[vadd_vv $A3, $A3, $B3]}
    add $S_A3, $S_A3, $S_B3
    @{[vxor_vv $C0, $C0, $A0]}
    xor $S_C0, $S_C0, $S_A0
    @{[vxor_vv $C1, $C1, $A1]}
    xor $S_C1, $S_C1, $S_A1
    @{[vxor_vv $C2, $C2, $A2]}
    xor $S_C2, $S_C2, $S_A2
    @{[vxor_vv $C3, $C3, $A3]}
    xor $S_C3, $S_C3, $S_A3
___

    # c <<<= $ROL_SHIFT;
    if ($use_zvkb) {
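        # Zvkb: rotate natively with vror.vi (a rotate-left by N is a
        # rotate-right by 32 - N); the scalar half uses Zbb's roriw.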
        my $ror_part = <<___;
        @{[vror_vi $C0, $C0, 32 - $ROL_SHIFT]}
        @{[roriw $S_C0, $S_C0, 32 - $ROL_SHIFT]}
        @{[vror_vi $C1, $C1, 32 - $ROL_SHIFT]}
        @{[roriw $S_C1, $S_C1, 32 - $ROL_SHIFT]}
        @{[vror_vi $C2, $C2, 32 - $ROL_SHIFT]}
        @{[roriw $S_C2, $S_C2, 32 - $ROL_SHIFT]}
        @{[vror_vi $C3, $C3, 32 - $ROL_SHIFT]}
        @{[roriw $S_C3, $S_C3, 32 - $ROL_SHIFT]}
___

        $code .= $ror_part;
    } else {
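        # No Zvkb: emulate the vector rotate with shift-left, shift-right
        # and or; the scalar half still uses Zbb's roriw.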
        my $ror_part = <<___;
        @{[vsll_vi $V_T0, $C0, $ROL_SHIFT]}
        @{[vsll_vi $V_T1, $C1, $ROL_SHIFT]}
        @{[vsll_vi $V_T2, $C2, $ROL_SHIFT]}
        @{[vsll_vi $V_T3, $C3, $ROL_SHIFT]}
        @{[vsrl_vi $C0, $C0, 32 - $ROL_SHIFT]}
        @{[vsrl_vi $C1, $C1, 32 - $ROL_SHIFT]}
        @{[vsrl_vi $C2, $C2, 32 - $ROL_SHIFT]}
        @{[vsrl_vi $C3, $C3, 32 - $ROL_SHIFT]}
        @{[vor_vv $C0, $C0, $V_T0]}
        @{[roriw $S_C0, $S_C0, 32 - $ROL_SHIFT]}
        @{[vor_vv $C1, $C1, $V_T1]}
        @{[roriw $S_C1, $S_C1, 32 - $ROL_SHIFT]}
        @{[vor_vv $C2, $C2, $V_T2]}
        @{[roriw $S_C2, $S_C2, 32 - $ROL_SHIFT]}
        @{[vor_vv $C3, $C3, $V_T3]}
        @{[roriw $S_C3, $S_C3, 32 - $ROL_SHIFT]}
___

        $code .= $ror_part;
    }

    return $code;
}

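# Apply one full ChaCha quarter-round (rotations 16, 12, 8, 7) to four
# independent (a, b, c, d) tuples at once, in both the vector and the
# scalar domain.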
sub chacha_quad_round_group {
    my (
        $A0, $B0, $C0, $D0,
        $A1, $B1, $C1, $D1,
        $A2, $B2, $C2, $D2,
        $A3, $B3, $C3, $D3,

        $S_A0, $S_B0, $S_C0, $S_D0,
        $S_A1, $S_B1, $S_C1, $S_D1,
        $S_A2, $S_B2, $S_C2, $S_D2,
        $S_A3, $S_B3, $S_C3, $S_D3,

        $V_T0, $V_T1, $V_T2, $V_T3,
    ) = @_;

    my $code = <<___;
    # a += b; d ^= a; d <<<= 16;
    @{[chacha_sub_round
      $A0, $B0, $D0,
      $A1, $B1, $D1,
      $A2, $B2, $D2,
      $A3, $B3, $D3,
      $S_A0, $S_B0, $S_D0,
      $S_A1, $S_B1, $S_D1,
      $S_A2, $S_B2, $S_D2,
      $S_A3, $S_B3, $S_D3,
      16,
      $V_T0, $V_T1, $V_T2, $V_T3]}
    # c += d; b ^= c; b <<<= 12;
    @{[chacha_sub_round
      $C0, $D0, $B0,
      $C1, $D1, $B1,
      $C2, $D2, $B2,
      $C3, $D3, $B3,
      $S_C0, $S_D0, $S_B0,
      $S_C1, $S_D1, $S_B1,
      $S_C2, $S_D2, $S_B2,
      $S_C3, $S_D3, $S_B3,
      12,
      $V_T0, $V_T1, $V_T2, $V_T3]}
    # a += b; d ^= a; d <<<= 8;
    @{[chacha_sub_round
      $A0, $B0, $D0,
      $A1, $B1, $D1,
      $A2, $B2, $D2,
      $A3, $B3, $D3,
      $S_A0, $S_B0, $S_D0,
      $S_A1, $S_B1, $S_D1,
      $S_A2, $S_B2, $S_D2,
      $S_A3, $S_B3, $S_D3,
      8,
      $V_T0, $V_T1, $V_T2, $V_T3]}
    # c += d; b ^= c; b <<<= 7;
    @{[chacha_sub_round
      $C0, $D0, $B0,
      $C1, $D1, $B1,
      $C2, $D2, $B2,
      $C3, $D3, $B3,
      $S_C0, $S_D0, $S_B0,
      $S_C1, $S_D1, $S_B1,
      $S_C2, $S_D2, $S_B2,
      $S_C3, $S_D3, $S_B3,
      7,
      $V_T0, $V_T1, $V_T2, $V_T3]}
___

    return $code;
}

$code .= <<___;
.p2align 3
.globl ChaCha20_ctr32@{[$isaext]}
.type ChaCha20_ctr32@{[$isaext]},\@function
ChaCha20_ctr32@{[$isaext]}:
    addi sp, sp, -96
    sd s0, 0(sp)
    sd s1, 8(sp)
    sd s2, 16(sp)
    sd s3, 24(sp)
    sd s4, 32(sp)
    sd s5, 40(sp)
    sd s6, 48(sp)
    sd s7, 56(sp)
    sd s8, 64(sp)
    sd s9, 72(sp)
    sd s10, 80(sp)
    sd s11, 88(sp)
    addi sp, sp, -64
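    # another 64 bytes of stack scratch to spill the scalar block's keystream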

    lw $CURRENT_COUNTER, 0($COUNTER)

.Lblock_loop:
    # We will use the scalar ALU for 1 chacha block.
    srli $T0, $LEN, 6
    @{[vsetvli $VL, $T0, "e32", "m1", "ta", "ma"]}
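    # $VL blocks (at most the number of full 64-byte blocks) go to the
    # vector unit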
    slli $T1, $VL, 6
    bltu $T1, $LEN, 1f
    # The vector unit would consume all remaining blocks, so shrink VL by
    # one and leave that block to the scalar ALU.
    addi $T1, $VL, -1
    @{[vsetvli $VL, $T1, "e32", "m1", "ta", "ma"]}
1:

    #### chacha block data
    # init chacha const states into $V0~$V3
    # "expa" little endian
    li $CONST_DATA0, 0x61707865
    @{[vmv_v_x $V0, $CONST_DATA0]}
    # "nd 3" little endian
    li $CONST_DATA1, 0x3320646e
    @{[vmv_v_x $V1, $CONST_DATA1]}
    # "2-by" little endian
    li $CONST_DATA2, 0x79622d32
    @{[vmv_v_x $V2, $CONST_DATA2]}
    # "te k" little endian
    li $CONST_DATA3, 0x6b206574
    lw $KEY0, 0($KEY)
    @{[vmv_v_x $V3, $CONST_DATA3]}

    # init chacha key states into $V4~$V11
    lw $KEY1, 4($KEY)
    @{[vmv_v_x $V4, $KEY0]}
    lw $KEY2, 8($KEY)
    @{[vmv_v_x $V5, $KEY1]}
    lw $KEY3, 12($KEY)
    @{[vmv_v_x $V6, $KEY2]}
    lw $KEY4, 16($KEY)
    @{[vmv_v_x $V7, $KEY3]}
    lw $KEY5, 20($KEY)
    @{[vmv_v_x $V8, $KEY4]}
    lw $KEY6, 24($KEY)
    @{[vmv_v_x $V9, $KEY5]}
    lw $KEY7, 28($KEY)
    @{[vmv_v_x $V10, $KEY6]}
    @{[vmv_v_x $V11, $KEY7]}

    # init chacha counter states into $V12~$V13
    lw $COUNTER1, 4($COUNTER)
    @{[vid_v $V12]}
    lw $NONCE0, 8($COUNTER)
    @{[vadd_vx $V12, $V12, $CURRENT_COUNTER]}
    lw $NONCE1, 12($COUNTER)
    @{[vmv_v_x $V13, $COUNTER1]}
    add $COUNTER0, $CURRENT_COUNTER, $VL

    # init chacha nonce states into $V14~$V15
    @{[vmv_v_x $V14, $NONCE0]}
    @{[vmv_v_x $V15, $NONCE1]}

    li $T0, 64
    # load the top-half of input data into $V16~$V23
    @{[vlsseg_nf_e32_v 8, $V16, $INPUT, $T0]}
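    # the 64-byte stride transposes the input: element i of $V16+j holds
    # word j of block i (i.e. the first 32 bytes of each block)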

    # so far in block_loop, we have used:
    # - $V0~$V15 for chacha states.
    # - $V16~$V23 for top-half of input data.
    # - $V24~$V31 haven't been used yet.

    # 10 iterations of the double round: 20 rounds in total
    li $T0, 10
.Lround_loop:
    # we can use $V24~$V31 as temporary registers in round_loop.
    addi $T0, $T0, -1
    @{[chacha_quad_round_group
      $V0, $V4, $V8, $V12,
      $V1, $V5, $V9, $V13,
      $V2, $V6, $V10, $V14,
      $V3, $V7, $V11, $V15,
      $STATE0, $STATE4, $STATE8, $STATE12,
      $STATE1, $STATE5, $STATE9, $STATE13,
      $STATE2, $STATE6, $STATE10, $STATE14,
      $STATE3, $STATE7, $STATE11, $STATE15,
      $V24, $V25, $V26, $V27]}
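    # the first group mixed the columns; the second mixes the diagonals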
    @{[chacha_quad_round_group
      $V3, $V4, $V9, $V14,
      $V0, $V5, $V10, $V15,
      $V1, $V6, $V11, $V12,
      $V2, $V7, $V8, $V13,
      $STATE3, $STATE4, $STATE9, $STATE14,
      $STATE0, $STATE5, $STATE10, $STATE15,
      $STATE1, $STATE6, $STATE11, $STATE12,
      $STATE2, $STATE7, $STATE8, $STATE13,
      $V24, $V25, $V26, $V27]}
    bnez $T0, .Lround_loop

    li $T0, 64
    # load the bottom-half of input data into $V24~$V31
    addi $T1, $INPUT, 32
    @{[vlsseg_nf_e32_v 8, $V24, $T1, $T0]}

    # all vector registers are live now; none are free until the top-half
    # output has been stored.

    # add chacha top-half initial block states
    # "expa" little endian
    li $T0, 0x61707865
    @{[vadd_vx $V0, $V0, $T0]}
    add $STATE0, $STATE0, $T0
    # "nd 3" little endian
    li $T1, 0x3320646e
    @{[vadd_vx $V1, $V1, $T1]}
    add $STATE1, $STATE1, $T1
    lw $T0, 0($KEY)
    # "2-by" little endian
    li $T2, 0x79622d32
    @{[vadd_vx $V2, $V2, $T2]}
    add $STATE2, $STATE2, $T2
    lw $T1, 4($KEY)
    # "te k" little endian
    li $T3, 0x6b206574
    @{[vadd_vx $V3, $V3, $T3]}
    add $STATE3, $STATE3, $T3
    lw $T2, 8($KEY)
    @{[vadd_vx $V4, $V4, $T0]}
    add $STATE4, $STATE4, $T0
    lw $T3, 12($KEY)
    @{[vadd_vx $V5, $V5, $T1]}
    add $STATE5, $STATE5, $T1
    @{[vadd_vx $V6, $V6, $T2]}
    add $STATE6, $STATE6, $T2
    @{[vadd_vx $V7, $V7, $T3]}
    add $STATE7, $STATE7, $T3

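    # the scalar keystream words are spilled to the stack below; the tail
    # loop xors them with the input later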
    # xor with the top-half input
    @{[vxor_vv $V16, $V16, $V0]}
    sw $STATE0, 0(sp)
    sw $STATE1, 4(sp)
    @{[vxor_vv $V17, $V17, $V1]}
    sw $STATE2, 8(sp)
    sw $STATE3, 12(sp)
    @{[vxor_vv $V18, $V18, $V2]}
    sw $STATE4, 16(sp)
    sw $STATE5, 20(sp)
    @{[vxor_vv $V19, $V19, $V3]}
    sw $STATE6, 24(sp)
    sw $STATE7, 28(sp)
    @{[vxor_vv $V20, $V20, $V4]}
    lw $T0, 16($KEY)
    @{[vxor_vv $V21, $V21, $V5]}
    lw $T1, 20($KEY)
    @{[vxor_vv $V22, $V22, $V6]}
    lw $T2, 24($KEY)
    @{[vxor_vv $V23, $V23, $V7]}

    # save the top-half of output from $V16~$V23
    li $T3, 64
    @{[vssseg_nf_e32_v 8, $V16, $OUTPUT, $T3]}

    # add chacha bottom-half initial block states
    @{[vadd_vx $V8, $V8, $T0]}
    add $STATE8, $STATE8, $T0
    lw $T3, 28($KEY)
    @{[vadd_vx $V9, $V9, $T1]}
    add $STATE9, $STATE9, $T1
    lw $T0, 4($COUNTER)
    @{[vadd_vx $V10, $V10, $T2]}
    add $STATE10, $STATE10, $T2
    lw $T1, 8($COUNTER)
    @{[vadd_vx $V11, $V11, $T3]}
    add $STATE11, $STATE11, $T3
    lw $T2, 12($COUNTER)
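    # re-add the per-lane counters ($CURRENT_COUNTER + lane id from vid);
    # the scalar block's counter is $CURRENT_COUNTER + $VL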
    @{[vid_v $V0]}
    add $STATE12, $STATE12, $CURRENT_COUNTER
    @{[vadd_vx $V12, $V12, $CURRENT_COUNTER]}
    add $STATE12, $STATE12, $VL
    @{[vadd_vx $V13, $V13, $T0]}
    add $STATE13, $STATE13, $T0
    @{[vadd_vx $V14, $V14, $T1]}
    add $STATE14, $STATE14, $T1
    @{[vadd_vx $V15, $V15, $T2]}
    add $STATE15, $STATE15, $T2
    @{[vadd_vv $V12, $V12, $V0]}
    # xor with the bottom-half input
    @{[vxor_vv $V24, $V24, $V8]}
    sw $STATE8, 32(sp)
    @{[vxor_vv $V25, $V25, $V9]}
    sw $STATE9, 36(sp)
    @{[vxor_vv $V26, $V26, $V10]}
    sw $STATE10, 40(sp)
    @{[vxor_vv $V27, $V27, $V11]}
    sw $STATE11, 44(sp)
    @{[vxor_vv $V29, $V29, $V13]}
    sw $STATE12, 48(sp)
    @{[vxor_vv $V28, $V28, $V12]}
    sw $STATE13, 52(sp)
    @{[vxor_vv $V30, $V30, $V14]}
    sw $STATE14, 56(sp)
    @{[vxor_vv $V31, $V31, $V15]}
    sw $STATE15, 60(sp)

    # save the bottom-half of output from $V24~$V31
    li $T0, 64
    addi $T1, $OUTPUT, 32
    @{[vssseg_nf_e32_v 8, $V24, $T1, $T0]}

    # advance past the bytes processed by the vector unit: 64 * VL
    slli $T0, $VL, 6

    add $INPUT, $INPUT, $T0
    add $OUTPUT, $OUTPUT, $T0
    sub $LEN, $LEN, $T0
    add $CURRENT_COUNTER, $CURRENT_COUNTER, $VL

    # process the scalar data block
    addi $CURRENT_COUNTER, $CURRENT_COUNTER, 1
    li $T0, 64
    @{[minu $T1, $LEN, $T0]}
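    # the scalar block covers min(LEN, 64) bytes; the tail may be partial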
    sub $LEN, $LEN, $T1
    mv $T2, sp
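    # $T2 walks the scalar keystream spilled at sp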
.Lscalar_data_loop:
    @{[vsetvli $VL, $T1, "e8", "m8", "ta", "ma"]}
    # from here on, vector registers are grouped with LMUL = 8
    @{[vle8_v $V8, $INPUT]}
    @{[vle8_v $V16, $T2]}
    @{[vxor_vv $V8, $V8, $V16]}
    @{[vse8_v $V8, $OUTPUT]}
    add $INPUT, $INPUT, $VL
    add $OUTPUT, $OUTPUT, $VL
    add $T2, $T2, $VL
    sub $T1, $T1, $VL
    bnez $T1, .Lscalar_data_loop

    bnez $LEN, .Lblock_loop

    addi sp, sp, 64
    ld s0, 0(sp)
    ld s1, 8(sp)
    ld s2, 16(sp)
    ld s3, 24(sp)
    ld s4, 32(sp)
    ld s5, 40(sp)
    ld s6, 48(sp)
    ld s7, 56(sp)
    ld s8, 64(sp)
    ld s9, 72(sp)
    ld s10, 80(sp)
    ld s11, 88(sp)
    addi sp, sp, 96

    ret
.size ChaCha20_ctr32@{[$isaext]},.-ChaCha20_ctr32@{[$isaext]}
___

print $code;

close STDOUT or die "error closing STDOUT: $!";