#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# (see the example invocation sketched at the end of this file)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input:  H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
#                 gcm_ghash_rv64i_zvkb_zvbc
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $H, $H, 8
    li $TMP0, -8
    li $TMP1, 63
    la $TMP2, Lpolymod

    @{[vsetivli__x0_2_e64_m1_tu_mu]}   # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V1, $H, $TMP0]}       # vlse64.v v1, (a1), t0
    @{[vle64_v $V2, $TMP2]}            # vle64.v v2, (t2)

    # Shift one left and get the carry bits.
    @{[vsrl_vx $V3, $V1, $TMP1]}       # vsrl.vx v3, v1, t1
    @{[vsll_vi $V1, $V1, 1]}           # vsll.vi v1, v1, 1

    # Use the fact that the polynomial degree is no more than 128,
    # i.e. only the LSB of the upper half could be set.
    # Thanks to this we don't need to do the full reduction here.
    # Instead simply subtract the reduction polynomial.
    # This idea was taken from the x86 ghash implementation in OpenSSL.
    @{[vslideup_vi $V4, $V3, 1]}       # vslideup.vi v4, v3, 1
    @{[vslidedown_vi $V3, $V3, 1]}     # vslidedown.vi v3, v3, 1

    @{[vmv_v_i $V0, 2]}                # vmv.v.i v0, 2
    @{[vor_vv_v0t $V1, $V1, $V4]}      # vor.vv v1, v1, v4, v0.t

    # Need to set the mask to 3, if the carry bit is set.
    @{[vmv_v_v $V0, $V3]}              # vmv.v.v v0, v3
    @{[vmv_v_i $V3, 0]}                # vmv.v.i v3, 0
    @{[vmerge_vim $V3, $V3, 3]}        # vmerge.vim v3, v3, 3, v0
    @{[vmv_v_v $V0, $V3]}              # vmv.v.v v0, v3

    @{[vxor_vv_v0t $V1, $V1, $V2]}     # vxor.vv v1, v1, v2, v0.t

    @{[vse64_v $V1, $Htable]}          # vse64.v v1, (a0)
    ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input:  Xi: current hash value
#         Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) =
   ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");

$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $Xi, $Xi, 8
    li $TMP4, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]}   # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $TMP4]}      # vlse64.v v5, (a0), t4
    @{[vrev8_v $V5, $V5]}              # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}     # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}    # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}     # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}    # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    @{[vslideup_vi $V5, $V3, 1]}       # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}       # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}     # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}     # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}                # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}     # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}     # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}                # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}     # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}     # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.
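
    # A short sketch of the constant used here (assuming the standard
    # bit-reflected GHASH folding of Gueron-style reductions): P is the
    # 64-bit value 0xc200000000000000, i.e. the upper dword of Lpolymod,
    # loaded into t3 in the prologue.  The first step adds (c0 * P)l
    # into c1; the second step adds the remaining (c0 * P)h and (c1 * P)
    # terms, yielding the final 128-bit result D = d1,d0 described below.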

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2
    @{[vslideup_vi_v0t $V3, $V1, 1]}   # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}     # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}    # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}                # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}     # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}         # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}     # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}         # vxor.vv v2, v2, v1

    @{[vrev8_v $V2, $V2]}              # vrev8.v v2, v2
    @{[vsse64_v $V2, $Xi, $TMP4]}      # vsse64.v v2, (a0), t4
    ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}

################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
#                                const u8 *inp, size_t len);
#
# input:  Xi: current hash value
#         Htable: preprocessed H
#         inp: pointer to input data
#         len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) =
   ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) =
   ("v0","v1","v2","v3","v4","v5","v6","v7");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
    ld $TMP0, ($Htable)
    ld $TMP1, 8($Htable)
    li $TMP2, 63
    la $TMP3, Lpolymod
    ld $TMP3, 8($TMP3)

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add $Xi, $Xi, 8
    add $inp, $inp, 8
    li $M8, -8

    @{[vsetivli__x0_2_e64_m1_tu_mu]}   # vsetivli x0, 2, e64, m1, tu, mu

    @{[vlse64_v $V5, $Xi, $M8]}        # vlse64.v v5, (a0), t4

Lstep:
    # Read input data
    @{[vlse64_v $Vinp, $inp, $M8]}     # vlse64.v v7, (a2), t4
    add $inp, $inp, 16
    add $len, $len, -16
    # XOR them into Xi
    @{[vxor_vv $V5, $V5, $Vinp]}       # vxor.vv v5, v5, v7

    @{[vrev8_v $V5, $V5]}              # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)l

    # v1 = (a0b1)l,(a0b0)l
    @{[vclmul_vx $V1, $V5, $TMP0]}     # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    @{[vclmulh_vx $V3, $V5, $TMP0]}    # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    @{[vclmul_vx $V4, $V5, $TMP1]}     # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    @{[vclmulh_vx $V2, $V5, $TMP1]}    # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    @{[vslideup_vi $V5, $V3, 1]}       # vslideup.vi v5, v3, 1
    @{[vslideup_vi $V6, $V4, 1]}       # vslideup.vi v6, v4, 1
    @{[vslidedown_vi $V3, $V3, 1]}     # vslidedown.vi v3, v3, 1
    @{[vslidedown_vi $V4, $V4, 1]}     # vslidedown.vi v4, v4, 1

    @{[vmv_v_i $V0, 1]}                # vmv.v.i v0, 1
    # v2 += (a0b1)h
    @{[vxor_vv_v0t $V2, $V2, $V3]}     # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    @{[vxor_vv_v0t $V2, $V2, $V4]}     # vxor.vv v2, v2, v4, v0.t

    @{[vmv_v_i $V0, 2]}                # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    @{[vxor_vv_v0t $V1, $V1, $V5]}     # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    @{[vxor_vv_v0t $V1, $V1, $V6]}     # vxor.vv v1, v1, v6, v0.t

    # Now the 256-bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of Gueron's Montgomery reduction.
    # The difference is that the order of some operations has been changed
    # to make better use of the vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2
    @{[vslideup_vi_v0t $V3, $V1, 1]}   # vslideup.vi v3, v1, 1, v0.t
    @{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
    @{[vxor_vv_v0t $V1, $V1, $V3]}     # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    # v3 = (c1 * P)l, 0
    @{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
    # v4 = (c1 * P)h, (c0 * P)h
    @{[vclmulh_vx $V4, $V1, $TMP3]}    # vclmulh.vx v4, v1, t3

    @{[vmv_v_i $V0, 1]}                # vmv.v.i v0, 1
    @{[vslidedown_vi $V3, $V3, 1]}     # vslidedown.vi v3, v3, 1

    @{[vxor_vv $V1, $V1, $V4]}         # vxor.vv v1, v1, v4
    @{[vxor_vv_v0t $V1, $V1, $V3]}     # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper part of the product
    @{[vxor_vv $V2, $V2, $V1]}         # vxor.vv v2, v2, v1

    @{[vrev8_v $V5, $V2]}              # vrev8.v v5, v2

    bnez $len, Lstep

    @{[vsse64_v $V5, $Xi, $M8]}        # vsse64.v v5, (a0), t4
    ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}

$code .= <<___;
.p2align 4
Lpolymod:
        .dword 0x0000000000000001
        .dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
___

print $code;

close STDOUT or die "error closing STDOUT: $!";
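
# Example invocation (a sketch; the flavour and file names below are
# illustrative assumptions only, as the real values are supplied by the
# OpenSSL build system using the argument convention described at the top
# of this file):
#
#   perl ghash-riscv64-zvkb-zvbc.pl linux64 ghash-riscv64-zvkb-zvbc.S
#
# With no output file argument the generated assembly is written to stdout.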