#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input:  H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
#                 gcm_ghash_rv64i_zvkb_zvbc
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $H, $H, 8
    li $TMP0, -8
    li $TMP1, 63
    la $TMP2, Lpolymod

    @{[vsetivli__x0_2_e64_m1_tu_mu]}        # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V1, $H, $TMP0]}            # vlse64.v v1, (a1), t0
    @{[vle64_v $V2, $TMP2]}                 # vle64.v v2, (t2)

    # Shift one left and get the carry bits.
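    # v3 receives bit 63 of each 64-bit element, i.e. the bit that the
    # left shift below pushes out of that element. The vslideup/vslidedown
    # pair then moves the carry out of element 0 into bit 0 of element 1,
    # and the carry out of element 1 decides, via the mask, whether the
    # reduction polynomial from Lpolymod is XORed in.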
    @{[vsrl_vx $V3, $V1, $TMP1]}            # vsrl.vx v3, v1, t1
    @{[vsll_vi $V1, $V1, 1]}                # vsll.vi v1, v1, 1

    # Use the fact that the polynomial degree is no more than 128,
    # i.e. only the LSB of the upper half could be set.
    # Thanks to this we don't need to do the full reduction here.
    # Instead simply subtract the reduction polynomial.
    # This idea was taken from the x86 ghash implementation in OpenSSL.
    @{[vslideup_vi $V4, $V3, 1]}            # vslideup.vi v4, v3, 1
    @{[vslidedown_vi $V3, $V3, 1]}          # vslidedown.vi v3, v3, 1

    @{[vmv_v_i $V0, 2]}                     # vmv.v.i v0, 2
    @{[vor_vv_v0t $V1, $V1, $V4]}           # vor.vv v1, v1, v4, v0.t

    # Set the mask to 3 if the carry bit is set.
    @{[vmv_v_v $V0, $V3]}                   # vmv.v.v v0, v3
    @{[vmv_v_i $V3, 0]}                     # vmv.v.i v3, 0
    @{[vmerge_vim $V3, $V3, 3]}             # vmerge.vim v3, v3, 3, v0
    @{[vmv_v_v $V0, $V3]}                   # vmv.v.v v0, v3

    @{[vxor_vv_v0t $V1, $V1, $V2]}          # vxor.vv v1, v1, v2, v0.t

    @{[vse64_v $V1, $Htable]}               # vse64.v v1, (a0)
    ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input:  Xi: current hash value
#         Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $Xi, $Xi, 8
    li $TMP4, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]}        # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $TMP4]}           # vlse64.v v5, (a0), t4
    @{[vrev8_v $V5, $V5]}                   # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}          # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}         # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}          # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}         # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
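    # The slides realign the partial products: vslideup moves element 0 of
    # v3/v4 up into element 1 of v5/v6, vslidedown moves element 1 of v3/v4
    # down to element 0. After that, each cross term sits in the 64-bit lane
    # it has to be XORed into, which the masked vxor.vv instructions below
    # apply one lane at a time (mask 1 = element 0, mask 2 = element 1).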
    @{[vslideup_vi $V5, $V3, 1]}            # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}            # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}          # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}          # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}                     # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}          # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}          # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}                     # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}          # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}          # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2
    @{[vslideup_vi_v0t $V3, $V1, 1]}        # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]}      # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}          # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is the final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]}      # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}         # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}                     # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}          # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}              # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}          # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}              # vxor.vv v2, v2, v1

    @{[vrev8_v $V2, $V2]}                   # vrev8.v v2, v2
    @{[vsse64_v $V2, $Xi, $TMP4]}           # vsse64.v v2, (a0), t4
    ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input:  Xi: current hash value
#         Htable: preprocessed H
#         inp: pointer to input data
#         len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
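    # Loading 64-bit elements with a stride of -8 starting at address+8
    # swaps the two 8-byte words of a 16-byte block; vrev8.v later reverses
    # the bytes within each word, so together they byte-reverse the whole
    # 128-bit value (vrev8.v/vsse64.v undo this again on the way out).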
    add $Xi, $Xi, 8
    add $inp, $inp, 8
    li $M8, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]}        # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $M8]}             # vlse64.v v5, (a0), t4

Lstep:
    # Read input data
    @{[vlse64_v $Vinp, $inp, $M8]}          # vlse64.v v7, (a2), t4
    add $inp, $inp, 16
    add $len, $len, -16
    # XOR it into Xi
    @{[vxor_vv $V5, $V5, $Vinp]}            # vxor.vv v5, v5, v7

    @{[vrev8_v $V5, $V5]}                   # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}          # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}         # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}          # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}         # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    @{[vslideup_vi $V5, $V3, 1]}            # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}            # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}          # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}          # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}                     # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}          # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}          # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}                     # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}          # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}          # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.
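    # Both folding steps below multiply by the reflected GHASH constant
    # 0xc200000000000000, i.e. the upper dword of Lpolymod preloaded into
    # t3. They leave m1,m0 (see the second-step comment) in v1, and the
    # final vxor.vv v2, v2, v1 adds them to c3,c2 to form d1,d0.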

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2
    @{[vslideup_vi_v0t $V3, $V1, 1]}        # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]}      # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}          # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is the final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]}      # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}         # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}                     # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}          # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}              # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}          # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}              # vxor.vv v2, v2, v1

    @{[vrev8_v $V5, $V2]}                   # vrev8.v v5, v2

    bnez $len, Lstep

    @{[vsse64_v $V5, $Xi, $M8]}             # vsse64.v v5, (a0), t4
    ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}

$code .= <<___;
.p2align 4
Lpolymod:
        .dword 0x0000000000000001
        .dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";