#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES Block Cipher extension ('Zvkned')
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)

# Reference: https://github.com/riscv/riscv-crypto/issues/192#issuecomment-1270447575
#
# Assume we have 12 GCM blocks and we parallelize the GCM computation over
# 4 blocks:
#   Tag = M0*H^12 + M1*H^11 + M2*H^10 + M3*H^9 +
#         M4*H^8  + M5*H^7  + M6*H^6  + M7*H^5 +
#         M8*H^4  + M9*H^3  + M10*H^2 + M11*H^1
# We can rewrite the formula into four independent lanes:
#   T0 = 0
#   T1 = (T0+M0)*H^4   T2  = (T0+M1)*H^4   T3  = (T0+M2)*H^4    T4  = (T0+M3)*H^4
#   T5 = (T1+M4)*H^4   T6  = (T2+M5)*H^4   T7  = (T3+M6)*H^4    T8  = (T4+M7)*H^4
#   T9 = (T5+M8)*H^4   T10 = (T6+M9)*H^3   T11 = (T7+M10)*H^2   T12 = (T8+M11)*H^1
#   Tag = T9 + T10 + T11 + T12
#
# We multiply with [H^4, H^4, H^4, H^4] in each step except the last
# iteration. The last iteration multiplies with [H^4, H^3, H^2, H^1].
# (A standalone numeric cross-check of this schedule is sketched in a comment
# at the end of this file.)

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
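
# Typical perlasm invocation (illustrative only; the output file name and the
# flavour string below are placeholders supplied by the build system, they are
# not defined by this script):
#
#   perl aes-gcm-riscv64-zvkb-zvkg-zvkned.pl "$flavour" aes-gcm-riscv64.S
#
# The trailing argument is taken as the output file because it has an
# extension; a leading argument without a dot is taken as the flavour.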

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

{
my ($INP, $OUTP, $LEN, $KEYP, $IVP, $XIP) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($PADDING_LEN32) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($FULL_BLOCK_LEN32) = ("a6");
my ($ORIGINAL_LEN32) = ("a7");
my ($PROCESSED_LEN) = ("a0");
my ($CTR_MASK) = ("v0");
my ($INPUT_PADDING_MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
    $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
    $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
    $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));

# Do aes-128 enc.
sub aes_128_cipher_body {
    my $code=<<___;
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesz_vs $V28, $V1]}
    @{[vaesem_vs $V28, $V2]}
    @{[vaesem_vs $V28, $V3]}
    @{[vaesem_vs $V28, $V4]}
    @{[vaesem_vs $V28, $V5]}
    @{[vaesem_vs $V28, $V6]}
    @{[vaesem_vs $V28, $V7]}
    @{[vaesem_vs $V28, $V8]}
    @{[vaesem_vs $V28, $V9]}
    @{[vaesem_vs $V28, $V10]}
    @{[vaesef_vs $V28, $V11]}
___

    return $code;
}

# Do aes-192 enc.
sub aes_192_cipher_body {
    my $TMP_REG = shift;

    my $code=<<___;
    # Load key 4
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 48
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesz_vs $V28, $V1]}
    @{[vaesem_vs $V28, $V2]}
    @{[vaesem_vs $V28, $V3]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 8
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 112
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V4]}
    @{[vaesem_vs $V28, $V5]}
    @{[vaesem_vs $V28, $V6]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 13
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 192
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V7]}
    @{[vaesem_vs $V28, $V8]}
    @{[vaesem_vs $V28, $V9]}
    @{[vaesem_vs $V28, $V10]}
    @{[vaesef_vs $V28, $V11]}
___

    return $code;
}
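
# Round key `i` (1-based) is stored at byte offset 16*(i-1) from KEYP, which
# is where the `addi` immediates above and below come from (AES-192: key 4 at
# 48, key 8 at 112, key 13 at 192; AES-256: key 3 at 32, key 6 at 80, key 9 at
# 128, key 12 at 176, key 15 at 224).  A hypothetical helper expressing that
# rule (illustrative only; the subs keep the offsets inline):
#
#   sub round_key_offset {
#       my ($key_index) = @_;      # 1-based round-key index
#       return 16 * ($key_index - 1);
#   }
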
# Do aes-256 enc.
sub aes_256_cipher_body {
    my $TMP_REG = shift;

    my $code=<<___;
    # Load key 3
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 32
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesz_vs $V28, $V1]}
    @{[vaesem_vs $V28, $V2]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 6
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 80
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V3]}
    @{[vaesem_vs $V28, $V4]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 9
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 128
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V5]}
    @{[vaesem_vs $V28, $V6]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 12
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 176
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V7]}
    @{[vaesem_vs $V28, $V8]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 15
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 224
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V9]}
    @{[vaesem_vs $V28, $V10]}
    @{[vaesef_vs $V28, $V11]}
___

    return $code;
}

sub handle_padding_in_first_round {
    my $TMP_REG = shift;

    my $code=<<___;
    bnez $PADDING_LEN32, 1f

    ## without padding
    # Store ciphertext/plaintext
    @{[vse32_v $V28, $OUTP]}
    j 2f

    ## with padding
1:
    # Store ciphertext/plaintext using mask
    @{[vse32_v $V28, $OUTP, $INPUT_PADDING_MASK]}

    # Fill zeros into the padding blocks
    @{[vsetvli "zero", $PADDING_LEN32, "e32", "m4", "tu", "ma"]}
    @{[vmv_v_i $V28, 0]}

    # We used the mask register for `INPUT_PADDING_MASK` above, so we need to
    # set up the ctr mask again.
    # ctr mask : [000100010001....]
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
    li $TMP_REG, 0b10001000
    @{[vmv_v_x $CTR_MASK, $TMP_REG]}
2:

___

    return $code;
}


# Do aes-128 enc for first round.
sub aes_128_first_round {
    my $PTR_OFFSET_REG = shift;
    my $TMP_REG = shift;

    my $code=<<___;
    # Load all 11 aes round keys to v1-v11 registers.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V1, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V2, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V3, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V4, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V5, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V6, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V7, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V8, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V9, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V10, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V11, $KEYP]}

    # We already have the ciphertext/plaintext and ctr data for the first round.
    @{[aes_128_cipher_body]}

    # Compute AES ctr result.
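    # v24 holds the input loaded in aes_gcm_init (plaintext when encrypting,
    # ciphertext when decrypting) and v28 now holds the encrypted counter
    # blocks, so the XOR below produces the CTR output for this first
    # (possibly partial) group of blocks.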
    @{[vxor_vv $V28, $V28, $V24]}

    @{[handle_padding_in_first_round $TMP_REG]}

    add $INP, $INP, $PTR_OFFSET_REG
    add $OUTP, $OUTP, $PTR_OFFSET_REG
___

    return $code;
}

# Do aes-192 enc for first round.
sub aes_192_first_round {
    my $PTR_OFFSET_REG = shift;
    my $TMP_REG = shift;

    my $code=<<___;
    # There are not enough vector registers to keep all 13 round keys
    # resident, so we preserve only some of them and load the rest inside the
    # aes body.
    # Resident round keys: 1, 2, 3, 5, 6, 7, 9, 10, 11 and 12.
    # Loaded inside the aes body: 4, 8 and 13.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    # key 1
    @{[vle32_v $V1, $KEYP]}
    # key 2
    addi $TMP_REG, $KEYP, 16
    @{[vle32_v $V2, $TMP_REG]}
    # key 3
    addi $TMP_REG, $KEYP, 32
    @{[vle32_v $V3, $TMP_REG]}
    # key 5
    addi $TMP_REG, $KEYP, 64
    @{[vle32_v $V4, $TMP_REG]}
    # key 6
    addi $TMP_REG, $KEYP, 80
    @{[vle32_v $V5, $TMP_REG]}
    # key 7
    addi $TMP_REG, $KEYP, 96
    @{[vle32_v $V6, $TMP_REG]}
    # key 9
    addi $TMP_REG, $KEYP, 128
    @{[vle32_v $V7, $TMP_REG]}
    # key 10
    addi $TMP_REG, $KEYP, 144
    @{[vle32_v $V8, $TMP_REG]}
    # key 11
    addi $TMP_REG, $KEYP, 160
    @{[vle32_v $V9, $TMP_REG]}
    # key 12
    addi $TMP_REG, $KEYP, 176
    @{[vle32_v $V10, $TMP_REG]}

    # We already have the ciphertext/plaintext and ctr data for the first round.
    @{[aes_192_cipher_body $TMP_REG]}

    # Compute AES ctr result.
    @{[vxor_vv $V28, $V28, $V24]}

    @{[handle_padding_in_first_round $TMP_REG]}

    add $INP, $INP, $PTR_OFFSET_REG
    add $OUTP, $OUTP, $PTR_OFFSET_REG
___

    return $code;
}

# Do aes-256 enc for first round.
sub aes_256_first_round {
    my $PTR_OFFSET_REG = shift;
    my $TMP_REG = shift;

    my $code=<<___;
    # There are not enough vector registers to keep all 15 round keys
    # resident, so we preserve only some of them and load the rest inside the
    # aes body.
    # Resident round keys: 1, 2, 4, 5, 7, 8, 10, 11, 13 and 14.
    # Loaded inside the aes body: 3, 6, 9, 12 and 15.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    # key 1
    @{[vle32_v $V1, $KEYP]}
    # key 2
    addi $TMP_REG, $KEYP, 16
    @{[vle32_v $V2, $TMP_REG]}
    # key 4
    addi $TMP_REG, $KEYP, 48
    @{[vle32_v $V3, $TMP_REG]}
    # key 5
    addi $TMP_REG, $KEYP, 64
    @{[vle32_v $V4, $TMP_REG]}
    # key 7
    addi $TMP_REG, $KEYP, 96
    @{[vle32_v $V5, $TMP_REG]}
    # key 8
    addi $TMP_REG, $KEYP, 112
    @{[vle32_v $V6, $TMP_REG]}
    # key 10
    addi $TMP_REG, $KEYP, 144
    @{[vle32_v $V7, $TMP_REG]}
    # key 11
    addi $TMP_REG, $KEYP, 160
    @{[vle32_v $V8, $TMP_REG]}
    # key 13
    addi $TMP_REG, $KEYP, 192
    @{[vle32_v $V9, $TMP_REG]}
    # key 14
    addi $TMP_REG, $KEYP, 208
    @{[vle32_v $V10, $TMP_REG]}

    # We already have the ciphertext/plaintext and ctr data for the first round.
    @{[aes_256_cipher_body $TMP_REG]}

    # Compute AES ctr result.
    @{[vxor_vv $V28, $V28, $V24]}

    @{[handle_padding_in_first_round $TMP_REG]}

    add $INP, $INP, $PTR_OFFSET_REG
    add $OUTP, $OUTP, $PTR_OFFSET_REG
___

    return $code;
}

sub aes_gcm_init {
    my $code=<<___;
    # Compute the AES-GCM full-block e32 length for `LMUL=4`. We will handle
    # multiple AES-GCM blocks at the same time within one `LMUL=4` register
    # group.
    # The AES-GCM SEW is e32 and the EGW is 128 bits.
    #   FULL_BLOCK_LEN32 = (VLEN*LMUL)/EGW * (EGW/SEW) = (VLEN*4)/(32*4) * 4
    #                    = (VLEN*4)/32
    # We get this element count from the VL value returned by `vsetvli` with
    # `e32, m4`.
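    # For example, with VLEN=128 this gives FULL_BLOCK_LEN32 = (128*4)/32 = 16
    # e32 elements, i.e. 4 GCM blocks per iteration; with VLEN=256 it is 32
    # elements (8 blocks).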
    @{[vsetvli $FULL_BLOCK_LEN32, "zero", "e32", "m4", "ta", "ma"]}
    # If `LEN32 % FULL_BLOCK_LEN32` is not zero, we pad the first group with
    # zeros so that every iteration handles exactly FULL_BLOCK_LEN32 e32
    # elements.

    ## Prepare the H^n multiplier in v16 for the GCM multiplication. `n` is
    ## the number of GCM blocks in a LMUL=4 register group.
    ##   n = (VLEN*LMUL)/(32*4) = (VLEN*4)/(32*4)
    ##     = VLEN/32
    ## We use vsetvli with `e32, m1` to compute `n`.
    @{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}

    # The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
    addi $T1, $XIP, 32
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V31, $T1]}

    # Compute the H^n
    li $T1, 1
1:
    @{[vgmul_vv $V31, $V31]}
    slli $T1, $T1, 1
    bltu $T1, $T0, 1b

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vmv_v_i $V16, 0]}
    @{[vaesz_vs $V16, $V31]}

    #### Load plaintext into v24 and handle padding. We also load the init tag
    #### data into v20 and prepare the AES ctr input data into v12 and v28.
    @{[vmv_v_i $V20, 0]}

    ## Prepare the AES ctr input data into v12.
    # Setup ctr input mask.
    # ctr mask : [000100010001....]
    # Note: The actual vl should be `FULL_BLOCK_LEN32/4 * 2`, but we just use
    # `FULL_BLOCK_LEN32` here.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
    li $T0, 0b10001000
    @{[vmv_v_x $CTR_MASK, $T0]}
    # Load IV.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V31, $IVP]}
    # Convert the big-endian counter into little-endian.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
    @{[vrev8_v $V31, $V31, $CTR_MASK]}
    # Splat the single IV block across v12.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vmv_v_i $V12, 0]}
    @{[vaesz_vs $V12, $V31]}
    # Prepare the ctr counter into v8
    # v8: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
    @{[viota_m $V8, $CTR_MASK, $CTR_MASK]}
    # Merge IV and ctr counter into v12.
    # v12: [x, x, x, count+0, x, x, x, count+1, ...]
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vadd_vv $V12, $V12, $V8, $CTR_MASK]}

    li $PADDING_LEN32, 0
    # Get the number of e32 elements in the first (possibly partial) round.
    # If `LEN32 & (FULL_BLOCK_LEN32-1)` is non-zero, the first round has
    # leading zero padding.
    addi $T0, $FULL_BLOCK_LEN32, -1
    and $T0, $T0, $LEN32
    beqz $T0, 1f

    ## with padding
    sub $LEN32, $LEN32, $T0
    sub $PADDING_LEN32, $FULL_BLOCK_LEN32, $T0
    # padding block size
    srli $T1, $PADDING_LEN32, 2
    # padding byte size
    slli $T2, $PADDING_LEN32, 2

    # Adjust the ctr counter to make the counter start from `counter+0` for the
    # first non-padding block.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vsub_vx $V12, $V12, $T1, $CTR_MASK]}
    # Prepare the AES ctr input into v28.
    # The ctr data uses big-endian form.
    @{[vmv_v_v $V28, $V12]}
    @{[vrev8_v $V28, $V28, $CTR_MASK]}

    # Prepare the mask for input loading in the first round. We use
    # `VL=FULL_BLOCK_LEN32` with the mask in the first round.
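    # For example, with VLEN=128 (FULL_BLOCK_LEN32=16, 4 blocks per group) and
    # a 10-block message (LEN32=40): T0 = 40 & 15 = 8, so PADDING_LEN32 = 8
    # (two padding blocks, 32 padding bytes). INP/OUTP are biased back by 32
    # bytes, elements 0..7 of v24 stay zero under the mask, elements 8..15
    # receive the first two real blocks, and every later iteration handles a
    # full 4 blocks.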
    # Adjust input ptr.
    sub $INP, $INP, $T2
    # Adjust output ptr.
    sub $OUTP, $OUTP, $T2
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e16", "m2", "ta", "ma"]}
    @{[vid_v $V2]}
    # We don't use the `vmsgeu.vx` pseudoinstruction here; use `vmsgtu.vx`
    # with (PADDING_LEN32 - 1) instead. The equivalent code would be:
    #   vmsgeu.vx $INPUT_PADDING_MASK, $V2, $PADDING_LEN32
    addi $T0, $PADDING_LEN32, -1
    @{[vmsgtu_vx $INPUT_PADDING_MASK, $V2, $T0]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vmv_v_i $V24, 0]}
    # Load the input for length FULL_BLOCK_LEN32 with mask.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vle32_v $V24, $INP, $INPUT_PADDING_MASK]}

    # Load the init `Xi` data to v20 with preceding zero padding.
    # Adjust Xi ptr.
    sub $T0, $XIP, $T2
    # Load for length `zero-padding-e32-length + 4`.
    addi $T1, $PADDING_LEN32, 4
    @{[vsetvli "zero", $T1, "e32", "m4", "tu", "mu"]}
    @{[vle32_v $V20, $T0, $INPUT_PADDING_MASK]}
    j 2f

1:
    ## without padding
    sub $LEN32, $LEN32, $FULL_BLOCK_LEN32

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vle32_v $V24, $INP]}

    # Load the init Xi data to v20.
    @{[vsetivli "zero", 4, "e32", "m1", "tu", "ma"]}
    @{[vle32_v $V20, $XIP]}

    # Prepare the AES ctr input into v28.
    # The ctr data uses big-endian form.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vmv_v_v $V28, $V12]}
    @{[vrev8_v $V28, $V28, $CTR_MASK]}
2:
___

    return $code;
}

sub prepare_input_and_ctr {
    my $PTR_OFFSET_REG = shift;

    my $code=<<___;
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    # Increase ctr in v12.
    @{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
    sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
    # Load plaintext into v24
    @{[vsetvli "zero", "zero", "e32", "m4", "ta", "ma"]}
    @{[vle32_v $V24, $INP]}
    # Prepare the AES ctr input into v28.
    # The ctr data uses big-endian form.
    @{[vmv_v_v $V28, $V12]}
    add $INP, $INP, $PTR_OFFSET_REG
    @{[vsetvli "zero", "zero", "e32", "m4", "ta", "mu"]}
    @{[vrev8_v $V28, $V28, $CTR_MASK]}
___

    return $code;
}

# Store the current CTR back to the IV buffer.
sub store_current_ctr {
    my $code=<<___;
    @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
    # Update the current ctr value in v12.
    @{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
    # Convert ctr to a big-endian counter.
    @{[vrev8_v $V12, $V12, $CTR_MASK]}
    @{[vse32_v $V12, $IVP, $CTR_MASK]}
___

    return $code;
}

# Compute the final tag into v0 from the partial tags in v20.
sub compute_final_tag {
    my $TMP_REG = shift;

    my $code=<<___;
    # The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
    # Load H to v1
    addi $TMP_REG, $XIP, 32
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V1, $TMP_REG]}
    # Multiply H for each partial tag and XOR them together.
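    # v20 holds FULL_BLOCK_LEN32/4 partial tags. Lane j still needs its
    # trailing H^(n-j) factor (the per-lane multiply that the main loop
    # skipped in its last iteration), so the loop below evaluates
    #   tag0*H^n + tag1*H^(n-1) + ... + tag_(n-1)*H
    # in Horner form using vghsh.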
    # Handle 1st partial tag
    @{[vmv_v_v $V0, $V20]}
    @{[vgmul_vv $V0, $V1]}
    # Handle 2nd to N-th partial tags
    li $TMP_REG, 4
1:
    @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
    @{[vslidedown_vx $V4, $V20, $TMP_REG]}
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vghsh_vv $V0, $V1, $V4]}
    addi $TMP_REG, $TMP_REG, 4
    blt $TMP_REG, $FULL_BLOCK_LEN32, 1b
___

    return $code;
}

################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt(const unsigned char *in,
#                                               unsigned char *out, size_t len,
#                                               const void *key,
#                                               unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt:
    srli $T0, $LEN, 4
    beqz $T0, .Lenc_end
    slli $LEN32, $T0, 2

    mv $ORIGINAL_LEN32, $LEN32

    @{[aes_gcm_init]}

    # Load the number of rounds
    lwu $T0, 240($KEYP)
    li $T1, 14
    li $T2, 12
    li $T3, 10

    beq $T0, $T1, aes_gcm_enc_blocks_256
    beq $T0, $T2, aes_gcm_enc_blocks_192
    beq $T0, $T3, aes_gcm_enc_blocks_128

.Lenc_end:
    li $PROCESSED_LEN, 0
    ret

.size rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_128:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_128_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Lenc_blocks_128:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Lenc_blocks_128_end
    @{[vghsh_vv $V20, $V16, $V28]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_128_cipher_body]}

    # Compute AES ctr ciphertext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store ciphertext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Lenc_blocks_128
.Lenc_blocks_128_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V28]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_enc_blocks_128,.-aes_gcm_enc_blocks_128
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_192:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_192_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Lenc_blocks_192:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Lenc_blocks_192_end
    @{[vghsh_vv $V20, $V16, $V28]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_192_cipher_body $T1]}

    # Compute AES ctr ciphertext result.
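    # v24 holds the plaintext loaded by prepare_input_and_ctr and v28 holds
    # the encrypted counters; the previous iteration's ciphertext in v28 was
    # already folded into the partial tags at the top of this loop.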
    @{[vxor_vv $V28, $V28, $V24]}

    # Store ciphertext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Lenc_blocks_192
.Lenc_blocks_192_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V28]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_enc_blocks_192,.-aes_gcm_enc_blocks_192
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_256:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_256_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Lenc_blocks_256:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Lenc_blocks_256_end
    @{[vghsh_vv $V20, $V16, $V28]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_256_cipher_body $T1]}

    # Compute AES ctr ciphertext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store ciphertext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Lenc_blocks_256
.Lenc_blocks_256_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V28]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_enc_blocks_256,.-aes_gcm_enc_blocks_256
___

}

################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt(const unsigned char *in,
#                                               unsigned char *out, size_t len,
#                                               const void *key,
#                                               unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt:
    srli $T0, $LEN, 4
    beqz $T0, .Ldec_end
    slli $LEN32, $T0, 2

    mv $ORIGINAL_LEN32, $LEN32

    @{[aes_gcm_init]}

    # Load the number of rounds
    lwu $T0, 240($KEYP)
    li $T1, 14
    li $T2, 12
    li $T3, 10

    beq $T0, $T1, aes_gcm_dec_blocks_256
    beq $T0, $T2, aes_gcm_dec_blocks_192
    beq $T0, $T3, aes_gcm_dec_blocks_128

.Ldec_end:
    li $PROCESSED_LEN, 0
    ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_128:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_128_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Ldec_blocks_128:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Ldec_blocks_128_end
    @{[vghsh_vv $V20, $V16, $V24]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_128_cipher_body]}

    # Compute AES ctr plaintext result.
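    # Note that for decryption the GHASH update at the top of this loop uses
    # v24 (the ciphertext exactly as loaded), whereas the encryption path
    # hashes v28 (the ciphertext it has just produced).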
    @{[vxor_vv $V28, $V28, $V24]}

    # Store plaintext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Ldec_blocks_128
.Ldec_blocks_128_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V24]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_dec_blocks_128,.-aes_gcm_dec_blocks_128
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_192:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_192_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Ldec_blocks_192:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Ldec_blocks_192_end
    @{[vghsh_vv $V20, $V16, $V24]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_192_cipher_body $T1]}

    # Compute AES ctr plaintext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store plaintext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Ldec_blocks_192
.Ldec_blocks_192_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V24]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_dec_blocks_192,.-aes_gcm_dec_blocks_192
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_256:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_256_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Ldec_blocks_256:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Ldec_blocks_256_end
    @{[vghsh_vv $V20, $V16, $V24]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_256_cipher_body $T1]}

    # Compute AES ctr plaintext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store plaintext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Ldec_blocks_256
.Ldec_blocks_256_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V24]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_dec_blocks_256,.-aes_gcm_dec_blocks_256
___

}
}

print $code;

close STDOUT or die "error closing STDOUT: $!";
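
# A minimal standalone cross-check of the 4-lane folding schedule described at
# the top of this file (illustrative only; it is never executed here). It
# models the GHASH field with integers modulo a stand-in modulus instead of
# GF(2^128): only associativity and distributivity matter for the lane
# schedule, so the numeric model is enough to sanity-check the rewrite.
#
#   use strict; use warnings;
#   my $P  = 1000003;                              # stand-in modulus
#   my $H  = 123457;                               # stand-in hash key
#   my @M  = map { ($_ * 97 + 13) % $P } 0 .. 11;  # 12 stand-in blocks
#   my $pw = sub { my ($b, $e) = @_; my $r = 1; $r = $r * $b % $P for 1 .. $e; $r };
#   # Reference: Tag = sum_{i=0..11} M[i] * H^(12-i)
#   my $ref = 0;
#   $ref = ($ref + $M[$_] * $pw->($H, 12 - $_)) % $P for 0 .. 11;
#   # 4-lane folding: two iterations with H^4, then [H^4, H^3, H^2, H^1]
#   my @T = (0, 0, 0, 0);
#   $T[$_] = ($T[$_] + $M[$_])     * $pw->($H, 4)      % $P for 0 .. 3;
#   $T[$_] = ($T[$_] + $M[$_ + 4]) * $pw->($H, 4)      % $P for 0 .. 3;
#   $T[$_] = ($T[$_] + $M[$_ + 8]) * $pw->($H, 4 - $_) % $P for 0 .. 3;
#   my $tag = ($T[0] + $T[1] + $T[2] + $T[3]) % $P;
#   die "lane schedule mismatch" unless $tag == $ref;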