1#! /usr/bin/env perl 2# This file is dual-licensed, meaning that you can use it under your 3# choice of either of the following two licenses: 4# 5# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved. 6# 7# Licensed under the Apache License 2.0 (the "License"). You may not use 8# this file except in compliance with the License. You can obtain a copy 9# in the file LICENSE in the source distribution or at 10# https://www.openssl.org/source/license.html 11# 12# or 13# 14# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com> 15# All rights reserved. 16# 17# Redistribution and use in source and binary forms, with or without 18# modification, are permitted provided that the following conditions 19# are met: 20# 1. Redistributions of source code must retain the above copyright 21# notice, this list of conditions and the following disclaimer. 22# 2. Redistributions in binary form must reproduce the above copyright 23# notice, this list of conditions and the following disclaimer in the 24# documentation and/or other materials provided with the distribution. 25# 26# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 30# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 38# - RV64I 39# - RISC-V Vector ('V') with VLEN >= 128 40# - RISC-V Basic Bit-manipulation extension ('Zbb') 41# - RISC-V Zicclsm(Main memory supports misaligned loads/stores) 42# Optional: 43# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') 44 45use strict; 46use warnings; 47 48use FindBin qw($Bin); 49use lib "$Bin"; 50use lib "$Bin/../../perlasm"; 51use riscv; 52 53# $output is the last argument if it looks like a file (it has an extension) 54# $flavour is the first argument if it doesn't look like a file 55my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 56my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 57 58my $use_zvkb = $flavour && $flavour =~ /zvkb/i ? 1 : 0; 59my $isaext = "_v_zbb" . ( $use_zvkb ? "_zvkb" : "" ); 60 61$output and open STDOUT, ">$output"; 62 63my $code = <<___; 64.text 65___ 66 67# void ChaCha20_ctr32@{[$isaext]}(unsigned char *out, const unsigned char *inp, 68# size_t len, const unsigned int key[8], 69# const unsigned int counter[4]); 70################################################################################ 71my ( $OUTPUT, $INPUT, $LEN, $KEY, $COUNTER ) = ( "a0", "a1", "a2", "a3", "a4" ); 72my ( $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3 ) = ( "a5", "a6", 73 "a7", "s0" ); 74my ( $KEY0, $KEY1, $KEY2, $KEY3, $KEY4, $KEY5, $KEY6, $KEY7, $COUNTER0, 75 $COUNTER1, $NONCE0, $NONCE1) = ( "s1", "s2", "s3", "s4", "s5", "s6", "s7", 76 "s8", "s9", "s10", "s11", "t0" ); 77my ( $STATE0, $STATE1, $STATE2, $STATE3, 78 $STATE4, $STATE5, $STATE6, $STATE7, 79 $STATE8, $STATE9, $STATE10, $STATE11, 80 $STATE12, $STATE13, $STATE14, $STATE15) = ( 81 $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3, 82 $KEY0, $KEY1, $KEY2, $KEY3, 83 $KEY4, $KEY5, $KEY6, $KEY7, 84 $COUNTER0, $COUNTER1, $NONCE0, $NONCE1 ); 85my ( $VL ) = ( "t1" ); 86my ( $CURRENT_COUNTER ) = ( "t2" ); 87my ( $T0 ) = ( "t3" ); 88my ( $T1 ) = ( "t4" ); 89my ( $T2 ) = ( "t5" ); 90my ( $T3 ) = ( "t6" ); 91my ( 92 $V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, $V8, $V9, $V10, 93 $V11, $V12, $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21, 94 $V22, $V23, $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, 95) = map( "v$_", ( 0 .. 31 ) ); 96 97sub chacha_sub_round { 98 my ( 99 $A0, $B0, $C0, 100 $A1, $B1, $C1, 101 $A2, $B2, $C2, 102 $A3, $B3, $C3, 103 104 $S_A0, $S_B0, $S_C0, 105 $S_A1, $S_B1, $S_C1, 106 $S_A2, $S_B2, $S_C2, 107 $S_A3, $S_B3, $S_C3, 108 109 $ROL_SHIFT, 110 111 $V_T0, $V_T1, $V_T2, $V_T3, 112 ) = @_; 113 114 # a += b; c ^= a; 115 my $code = <<___; 116 @{[vadd_vv $A0, $A0, $B0]} 117 add $S_A0, $S_A0, $S_B0 118 @{[vadd_vv $A1, $A1, $B1]} 119 add $S_A1, $S_A1, $S_B1 120 @{[vadd_vv $A2, $A2, $B2]} 121 add $S_A2, $S_A2, $S_B2 122 @{[vadd_vv $A3, $A3, $B3]} 123 add $S_A3, $S_A3, $S_B3 124 @{[vxor_vv $C0, $C0, $A0]} 125 xor $S_C0, $S_C0, $S_A0 126 @{[vxor_vv $C1, $C1, $A1]} 127 xor $S_C1, $S_C1, $S_A1 128 @{[vxor_vv $C2, $C2, $A2]} 129 xor $S_C2, $S_C2, $S_A2 130 @{[vxor_vv $C3, $C3, $A3]} 131 xor $S_C3, $S_C3, $S_A3 132___ 133 134 # c <<<= $ROL_SHIFT; 135 if ($use_zvkb) { 136 my $ror_part = <<___; 137 @{[vror_vi $C0, $C0, 32 - $ROL_SHIFT]} 138 @{[roriw $S_C0, $S_C0, 32 - $ROL_SHIFT]} 139 @{[vror_vi $C1, $C1, 32 - $ROL_SHIFT]} 140 @{[roriw $S_C1, $S_C1, 32 - $ROL_SHIFT]} 141 @{[vror_vi $C2, $C2, 32 - $ROL_SHIFT]} 142 @{[roriw $S_C2, $S_C2, 32 - $ROL_SHIFT]} 143 @{[vror_vi $C3, $C3, 32 - $ROL_SHIFT]} 144 @{[roriw $S_C3, $S_C3, 32 - $ROL_SHIFT]} 145___ 146 147 $code .= $ror_part; 148 } else { 149 my $ror_part = <<___; 150 @{[vsll_vi $V_T0, $C0, $ROL_SHIFT]} 151 @{[vsll_vi $V_T1, $C1, $ROL_SHIFT]} 152 @{[vsll_vi $V_T2, $C2, $ROL_SHIFT]} 153 @{[vsll_vi $V_T3, $C3, $ROL_SHIFT]} 154 @{[vsrl_vi $C0, $C0, 32 - $ROL_SHIFT]} 155 @{[vsrl_vi $C1, $C1, 32 - $ROL_SHIFT]} 156 @{[vsrl_vi $C2, $C2, 32 - $ROL_SHIFT]} 157 @{[vsrl_vi $C3, $C3, 32 - $ROL_SHIFT]} 158 @{[vor_vv $C0, $C0, $V_T0]} 159 @{[roriw $S_C0, $S_C0, 32 - $ROL_SHIFT]} 160 @{[vor_vv $C1, $C1, $V_T1]} 161 @{[roriw $S_C1, $S_C1, 32 - $ROL_SHIFT]} 162 @{[vor_vv $C2, $C2, $V_T2]} 163 @{[roriw $S_C2, $S_C2, 32 - $ROL_SHIFT]} 164 @{[vor_vv $C3, $C3, $V_T3]} 165 @{[roriw $S_C3, $S_C3, 32 - $ROL_SHIFT]} 166___ 167 168 $code .= $ror_part; 169 } 170 171 return $code; 172} 173 174sub chacha_quad_round_group { 175 my ( 176 $A0, $B0, $C0, $D0, 177 $A1, $B1, $C1, $D1, 178 $A2, $B2, $C2, $D2, 179 $A3, $B3, $C3, $D3, 180 181 $S_A0, $S_B0, $S_C0, $S_D0, 182 $S_A1, $S_B1, $S_C1, $S_D1, 183 $S_A2, $S_B2, $S_C2, $S_D2, 184 $S_A3, $S_B3, $S_C3, $S_D3, 185 186 $V_T0, $V_T1, $V_T2, $V_T3, 187 ) = @_; 188 189 my $code = <<___; 190 # a += b; d ^= a; d <<<= 16; 191 @{[chacha_sub_round 192 $A0, $B0, $D0, 193 $A1, $B1, $D1, 194 $A2, $B2, $D2, 195 $A3, $B3, $D3, 196 $S_A0, $S_B0, $S_D0, 197 $S_A1, $S_B1, $S_D1, 198 $S_A2, $S_B2, $S_D2, 199 $S_A3, $S_B3, $S_D3, 200 16, 201 $V_T0, $V_T1, $V_T2, $V_T3]} 202 # c += d; b ^= c; b <<<= 12; 203 @{[chacha_sub_round 204 $C0, $D0, $B0, 205 $C1, $D1, $B1, 206 $C2, $D2, $B2, 207 $C3, $D3, $B3, 208 $S_C0, $S_D0, $S_B0, 209 $S_C1, $S_D1, $S_B1, 210 $S_C2, $S_D2, $S_B2, 211 $S_C3, $S_D3, $S_B3, 212 12, 213 $V_T0, $V_T1, $V_T2, $V_T3]} 214 # a += b; d ^= a; d <<<= 8; 215 @{[chacha_sub_round 216 $A0, $B0, $D0, 217 $A1, $B1, $D1, 218 $A2, $B2, $D2, 219 $A3, $B3, $D3, 220 $S_A0, $S_B0, $S_D0, 221 $S_A1, $S_B1, $S_D1, 222 $S_A2, $S_B2, $S_D2, 223 $S_A3, $S_B3, $S_D3, 224 8, 225 $V_T0, $V_T1, $V_T2, $V_T3]} 226 # c += d; b ^= c; b <<<= 7; 227 @{[chacha_sub_round 228 $C0, $D0, $B0, 229 $C1, $D1, $B1, 230 $C2, $D2, $B2, 231 $C3, $D3, $B3, 232 $S_C0, $S_D0, $S_B0, 233 $S_C1, $S_D1, $S_B1, 234 $S_C2, $S_D2, $S_B2, 235 $S_C3, $S_D3, $S_B3, 236 7, 237 $V_T0, $V_T1, $V_T2, $V_T3]} 238___ 239 240 return $code; 241} 242 243$code .= <<___; 244.p2align 3 245.globl ChaCha20_ctr32@{[$isaext]} 246.type ChaCha20_ctr32@{[$isaext]},\@function 247ChaCha20_ctr32@{[$isaext]}: 248 addi sp, sp, -96 249 sd s0, 0(sp) 250 sd s1, 8(sp) 251 sd s2, 16(sp) 252 sd s3, 24(sp) 253 sd s4, 32(sp) 254 sd s5, 40(sp) 255 sd s6, 48(sp) 256 sd s7, 56(sp) 257 sd s8, 64(sp) 258 sd s9, 72(sp) 259 sd s10, 80(sp) 260 sd s11, 88(sp) 261 addi sp, sp, -64 262 263 lw $CURRENT_COUNTER, 0($COUNTER) 264 265.Lblock_loop: 266 # We will use the scalar ALU for 1 chacha block. 267 srli $T0, $LEN, 6 268 @{[vsetvli $VL, $T0, "e32", "m1", "ta", "ma"]} 269 slli $T1, $VL, 6 270 bltu $T1, $LEN, 1f 271 # Since there is no more chacha block existed, we need to split 1 block 272 # from vector ALU. 273 addi $T1, $VL, -1 274 @{[vsetvli $VL, $T1, "e32", "m1", "ta", "ma"]} 2751: 276 277 #### chacha block data 278 # init chacha const states into $V0~$V3 279 # "expa" little endian 280 li $CONST_DATA0, 0x61707865 281 @{[vmv_v_x $V0, $CONST_DATA0]} 282 # "nd 3" little endian 283 li $CONST_DATA1, 0x3320646e 284 @{[vmv_v_x $V1, $CONST_DATA1]} 285 # "2-by" little endian 286 li $CONST_DATA2, 0x79622d32 287 @{[vmv_v_x $V2, $CONST_DATA2]} 288 # "te k" little endian 289 li $CONST_DATA3, 0x6b206574 290 lw $KEY0, 0($KEY) 291 @{[vmv_v_x $V3, $CONST_DATA3]} 292 293 # init chacha key states into $V4~$V11 294 lw $KEY1, 4($KEY) 295 @{[vmv_v_x $V4, $KEY0]} 296 lw $KEY2, 8($KEY) 297 @{[vmv_v_x $V5, $KEY1]} 298 lw $KEY3, 12($KEY) 299 @{[vmv_v_x $V6, $KEY2]} 300 lw $KEY4, 16($KEY) 301 @{[vmv_v_x $V7, $KEY3]} 302 lw $KEY5, 20($KEY) 303 @{[vmv_v_x $V8, $KEY4]} 304 lw $KEY6, 24($KEY) 305 @{[vmv_v_x $V9, $KEY5]} 306 lw $KEY7, 28($KEY) 307 @{[vmv_v_x $V10, $KEY6]} 308 @{[vmv_v_x $V11, $KEY7]} 309 310 # init chacha key states into $V12~$V13 311 lw $COUNTER1, 4($COUNTER) 312 @{[vid_v $V12]} 313 lw $NONCE0, 8($COUNTER) 314 @{[vadd_vx $V12, $V12, $CURRENT_COUNTER]} 315 lw $NONCE1, 12($COUNTER) 316 @{[vmv_v_x $V13, $COUNTER1]} 317 add $COUNTER0, $CURRENT_COUNTER, $VL 318 319 # init chacha nonce states into $V14~$V15 320 @{[vmv_v_x $V14, $NONCE0]} 321 @{[vmv_v_x $V15, $NONCE1]} 322 323 li $T0, 64 324 # load the top-half of input data into $V16~$V23 325 @{[vlsseg_nf_e32_v 8, $V16, $INPUT, $T0]} 326 327 # till now in block_loop, we used: 328 # - $V0~$V15 for chacha states. 329 # - $V16~$V23 for top-half of input data. 330 # - $V24~$V31 haven't been used yet. 331 332 # 20 round groups 333 li $T0, 10 334.Lround_loop: 335 # we can use $V24~$V31 as temporary registers in round_loop. 336 addi $T0, $T0, -1 337 @{[chacha_quad_round_group 338 $V0, $V4, $V8, $V12, 339 $V1, $V5, $V9, $V13, 340 $V2, $V6, $V10, $V14, 341 $V3, $V7, $V11, $V15, 342 $STATE0, $STATE4, $STATE8, $STATE12, 343 $STATE1, $STATE5, $STATE9, $STATE13, 344 $STATE2, $STATE6, $STATE10, $STATE14, 345 $STATE3, $STATE7, $STATE11, $STATE15, 346 $V24, $V25, $V26, $V27]} 347 @{[chacha_quad_round_group 348 $V3, $V4, $V9, $V14, 349 $V0, $V5, $V10, $V15, 350 $V1, $V6, $V11, $V12, 351 $V2, $V7, $V8, $V13, 352 $STATE3, $STATE4, $STATE9, $STATE14, 353 $STATE0, $STATE5, $STATE10, $STATE15, 354 $STATE1, $STATE6, $STATE11, $STATE12, 355 $STATE2, $STATE7, $STATE8, $STATE13, 356 $V24, $V25, $V26, $V27]} 357 bnez $T0, .Lround_loop 358 359 li $T0, 64 360 # load the bottom-half of input data into $V24~$V31 361 addi $T1, $INPUT, 32 362 @{[vlsseg_nf_e32_v 8, $V24, $T1, $T0]} 363 364 # now, there are no free vector registers until the round_loop exits. 365 366 # add chacha top-half initial block states 367 # "expa" little endian 368 li $T0, 0x61707865 369 @{[vadd_vx $V0, $V0, $T0]} 370 add $STATE0, $STATE0, $T0 371 # "nd 3" little endian 372 li $T1, 0x3320646e 373 @{[vadd_vx $V1, $V1, $T1]} 374 add $STATE1, $STATE1, $T1 375 lw $T0, 0($KEY) 376 # "2-by" little endian 377 li $T2, 0x79622d32 378 @{[vadd_vx $V2, $V2, $T2]} 379 add $STATE2, $STATE2, $T2 380 lw $T1, 4($KEY) 381 # "te k" little endian 382 li $T3, 0x6b206574 383 @{[vadd_vx $V3, $V3, $T3]} 384 add $STATE3, $STATE3, $T3 385 lw $T2, 8($KEY) 386 @{[vadd_vx $V4, $V4, $T0]} 387 add $STATE4, $STATE4, $T0 388 lw $T3, 12($KEY) 389 @{[vadd_vx $V5, $V5, $T1]} 390 add $STATE5, $STATE5, $T1 391 @{[vadd_vx $V6, $V6, $T2]} 392 add $STATE6, $STATE6, $T2 393 @{[vadd_vx $V7, $V7, $T3]} 394 add $STATE7, $STATE7, $T3 395 396 # xor with the top-half input 397 @{[vxor_vv $V16, $V16, $V0]} 398 sw $STATE0, 0(sp) 399 sw $STATE1, 4(sp) 400 @{[vxor_vv $V17, $V17, $V1]} 401 sw $STATE2, 8(sp) 402 sw $STATE3, 12(sp) 403 @{[vxor_vv $V18, $V18, $V2]} 404 sw $STATE4, 16(sp) 405 sw $STATE5, 20(sp) 406 @{[vxor_vv $V19, $V19, $V3]} 407 sw $STATE6, 24(sp) 408 sw $STATE7, 28(sp) 409 @{[vxor_vv $V20, $V20, $V4]} 410 lw $T0, 16($KEY) 411 @{[vxor_vv $V21, $V21, $V5]} 412 lw $T1, 20($KEY) 413 @{[vxor_vv $V22, $V22, $V6]} 414 lw $T2, 24($KEY) 415 @{[vxor_vv $V23, $V23, $V7]} 416 417 # save the top-half of output from $V16~$V23 418 li $T3, 64 419 @{[vssseg_nf_e32_v 8, $V16, $OUTPUT, $T3]} 420 421 # add chacha bottom-half initial block states 422 @{[vadd_vx $V8, $V8, $T0]} 423 add $STATE8, $STATE8, $T0 424 lw $T3, 28($KEY) 425 @{[vadd_vx $V9, $V9, $T1]} 426 add $STATE9, $STATE9, $T1 427 lw $T0, 4($COUNTER) 428 @{[vadd_vx $V10, $V10, $T2]} 429 add $STATE10, $STATE10, $T2 430 lw $T1, 8($COUNTER) 431 @{[vadd_vx $V11, $V11, $T3]} 432 add $STATE11, $STATE11, $T3 433 lw $T2, 12($COUNTER) 434 @{[vid_v $V0]} 435 add $STATE12, $STATE12, $CURRENT_COUNTER 436 @{[vadd_vx $V12, $V12, $CURRENT_COUNTER]} 437 add $STATE12, $STATE12, $VL 438 @{[vadd_vx $V13, $V13, $T0]} 439 add $STATE13, $STATE13, $T0 440 @{[vadd_vx $V14, $V14, $T1]} 441 add $STATE14, $STATE14, $T1 442 @{[vadd_vx $V15, $V15, $T2]} 443 add $STATE15, $STATE15, $T2 444 @{[vadd_vv $V12, $V12, $V0]} 445 # xor with the bottom-half input 446 @{[vxor_vv $V24, $V24, $V8]} 447 sw $STATE8, 32(sp) 448 @{[vxor_vv $V25, $V25, $V9]} 449 sw $STATE9, 36(sp) 450 @{[vxor_vv $V26, $V26, $V10]} 451 sw $STATE10, 40(sp) 452 @{[vxor_vv $V27, $V27, $V11]} 453 sw $STATE11, 44(sp) 454 @{[vxor_vv $V29, $V29, $V13]} 455 sw $STATE12, 48(sp) 456 @{[vxor_vv $V28, $V28, $V12]} 457 sw $STATE13, 52(sp) 458 @{[vxor_vv $V30, $V30, $V14]} 459 sw $STATE14, 56(sp) 460 @{[vxor_vv $V31, $V31, $V15]} 461 sw $STATE15, 60(sp) 462 463 # save the bottom-half of output from $V24~$V31 464 li $T0, 64 465 addi $T1, $OUTPUT, 32 466 @{[vssseg_nf_e32_v 8, $V24, $T1, $T0]} 467 468 # the computed vector parts: `64 * VL` 469 slli $T0, $VL, 6 470 471 add $INPUT, $INPUT, $T0 472 add $OUTPUT, $OUTPUT, $T0 473 sub $LEN, $LEN, $T0 474 add $CURRENT_COUNTER, $CURRENT_COUNTER, $VL 475 476 # process the scalar data block 477 addi $CURRENT_COUNTER, $CURRENT_COUNTER, 1 478 li $T0, 64 479 @{[minu $T1, $LEN, $T0]} 480 sub $LEN, $LEN, $T1 481 mv $T2, sp 482.Lscalar_data_loop: 483 @{[vsetvli $VL, $T1, "e8", "m8", "ta", "ma"]} 484 # from this on, vector registers are grouped with lmul = 8 485 @{[vle8_v $V8, $INPUT]} 486 @{[vle8_v $V16, $T2]} 487 @{[vxor_vv $V8, $V8, $V16]} 488 @{[vse8_v $V8, $OUTPUT]} 489 add $INPUT, $INPUT, $VL 490 add $OUTPUT, $OUTPUT, $VL 491 add $T2, $T2, $VL 492 sub $T1, $T1, $VL 493 bnez $T1, .Lscalar_data_loop 494 495 bnez $LEN, .Lblock_loop 496 497 addi sp, sp, 64 498 ld s0, 0(sp) 499 ld s1, 8(sp) 500 ld s2, 16(sp) 501 ld s3, 24(sp) 502 ld s4, 32(sp) 503 ld s5, 40(sp) 504 ld s6, 48(sp) 505 ld s7, 56(sp) 506 ld s8, 64(sp) 507 ld s9, 72(sp) 508 ld s10, 80(sp) 509 ld s11, 88(sp) 510 addi sp, sp, 96 511 512 ret 513.size ChaCha20_ctr32@{[$isaext]},.-ChaCha20_ctr32@{[$isaext]} 514___ 515 516print $code; 517 518close STDOUT or die "error closing STDOUT: $!"; 519