#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# perlasm generator for the GHASH routines using the RV64 Zbb (basic bit
# manipulation) and Zbc (carry-less multiply) extensions.  It emits the
# assembly for gcm_init_clmul_rv64i_zbb_zbc / gcm_gmult_clmul_rv64i_zbb_zbc
# on STDOUT (or into $output when a filename argument is given).

use strict;
use warnings;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

# Three-arg open with an error check; two-arg open with an interpolated
# filename is a mode-injection hazard and silently ignores failures.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Canonical register names x0..x31 and their ABI aliases, in matching order,
# so both spellings map to the same canonical "xN" name.
my @regs = map("x$_", (0 .. 31));
my @regaliases = ('zero', 'ra', 'sp', 'gp', 'tp', 't0', 't1', 't2', 's0', 's1',
                  map("a$_", (0 .. 7)),
                  map("s$_", (2 .. 11)),
                  map("t$_", (3 .. 6))
);

my %reglookup;
@reglookup{@regs}       = @regs;
@reglookup{@regaliases} = @regs;

# Takes a register name, possibly an ABI alias, and converts it to a
# register index from 0 to 31.  Dies on an unrecognized name.
sub read_reg {
    my $reg = lc shift;
    if (!exists($reglookup{$reg})) {
        die("Unknown register " . $reg);
    }
    my $regstr = $reglookup{$reg};
    if (!($regstr =~ /^x([0-9]+)$/)) {
        die("Could not process register " . $reg);
    }
    return $1;
}

# Emit a ".word" directive encoding "rev8 rd, rs" (RV64, Zbb byte-reverse),
# for assemblers that do not know the Zbb mnemonics.
sub rv64_rev8 {
    # Encoding for rev8 rd, rs instruction on RV64
    #               XXXXXXXXXXXXX_ rs  _XXX_ rd  _XXXXXXX
    my $template = 0b011010111000_00000_101_00000_0010011;
    my $rd = read_reg shift;
    my $rs = read_reg shift;

    return ".word " . ($template | ($rs << 15) | ($rd << 7));
}

# Emit a ".word" directive encoding "clmul rd, rs1, rs2" (RV64, Zbc
# carry-less multiply, low half).
sub rv64_clmul {
    # Encoding for clmul rd, rs1, rs2 instruction on RV64
    #               XXXXXXX_ rs2 _ rs1 _XXX_ rd  _XXXXXXX
    my $template = 0b0000101_00000_00000_001_00000_0110011;
    my $rd  = read_reg shift;
    my $rs1 = read_reg shift;
    my $rs2 = read_reg shift;

    return ".word " . ($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
}

# Emit a ".word" directive encoding "clmulh rd, rs1, rs2" (RV64, Zbc
# carry-less multiply, high half).
sub rv64_clmulh {
    # Encoding for clmulh rd, rs1, rs2 instruction on RV64
    #               XXXXXXX_ rs2 _ rs1 _XXX_ rd  _XXXXXXX
    my $template = 0b0000101_00000_00000_011_00000_0110011;
    my $rd  = read_reg shift;
    my $rs1 = read_reg shift;
    my $rs2 = read_reg shift;

    return ".word " . ($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
}

# Accumulates the generated assembly text; printed at the end.
my $code = "";

################################################################################
# gcm_init_clmul_rv64i_zbb_zbc(u128 Htable[16], const u64 Xi[2])
# Initialization function for clmul-based implementation of GMULT
# This function is used in tandem with gcm_gmult_clmul_rv64i_zbb_zbc
################################################################################
{
my ($Haddr,$Xi,$TEMP) = ("a0","a1","a2");

$code .= <<___;
.text
.balign 16
.globl gcm_init_clmul_rv64i_zbb_zbc
.type gcm_init_clmul_rv64i_zbb_zbc,\@function
# Initialize clmul-based implementation of galois field multiplication routine.
# gcm_init_clmul_rv64i_zbb_zbc(ctx->Htable, ctx->H.u)
gcm_init_clmul_rv64i_zbb_zbc:
    # argument 0 = ctx->Htable (store H here)
    # argument 1 = H.u[] (2x 64-bit words) [H_high64, H_low64]

    # Simply store [H_high64, H_low64] for later
    ld $TEMP,0($Xi)
    sd $TEMP,0($Haddr)
    ld $TEMP,8($Xi)
    sd $TEMP,8($Haddr)

    ret

___

}

################################################################################
# gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
# Compute GMULT (X*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
# extensions, and the Modified Barrett Reduction technique
################################################################################
{
my ($Xi,$Haddr,$A1,$A0,$B1,$B0,$C1,$C0,$D1,$D0,$E1,$E0,$TEMP,$TEMP2,$qp_low) =
    ("a0","a1","a2","a3","a4","a5","a6","a7","t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.text
.balign 16
.globl gcm_gmult_clmul_rv64i_zbb_zbc
.type gcm_gmult_clmul_rv64i_zbb_zbc,\@function
# static void gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
# Computes product of X*H mod f
gcm_gmult_clmul_rv64i_zbb_zbc:

    # Load X and H (H is saved previously in gcm_init_clmul_rv64i_zbb_zbc)
    ld $A1,0($Xi)
    ld $A0,8($Xi)

    ld $B1,0($Haddr)
    ld $B0,8($Haddr)

    li $qp_low,0xe100000000000000

    # Perform Katratsuba Multiplication to generate a 255-bit intermediate
    # A = [A1:A0]
    # B = [B1:B0]
    # Let:
    # [C1:C0] = A1*B1
    # [D1:D0] = A0*B0
    # [E1:E0] = (A0+A1)*(B0+B1)
    # Then:
    # A*B = [C1:C0+C1+D1+E1:D1+C0+D0+E0:D0]

    @{[rv64_rev8 $A1, $A1]}
    @{[rv64_clmul $C0,$A1,$B1]}
    @{[rv64_clmulh $C1,$A1,$B1]}

    @{[rv64_rev8 $A0,$A0]}
    @{[rv64_clmul $D0,$A0,$B0]}
    @{[rv64_clmulh $D1,$A0,$B0]}

    xor $TEMP,$A0,$A1
    xor $TEMP2,$B0,$B1

    @{[rv64_clmul $E0,$TEMP,$TEMP2]}
    @{[rv64_clmulh $E1,$TEMP,$TEMP2]}

    # 0th term is just C1

    # Construct term 1 in E1 (E1 only appears in dword 1)
    xor $E1,$E1,$D1
    xor $E1,$E1,$C1
    xor $E1,$E1,$C0

    # Term 1 is E1

    # Construct term 2 in E0 (E0 only appears in dword 2)
    xor $E0,$E0,$D0
    xor $E0,$E0,$C0
    xor $E0,$E0,$D1

    # Term 2 is E0

    # final term is just D0

    # X*H is now stored in [C1,E1,E0,D0]

    # Left-justify
    slli $C1,$C1,1
    # Or in the high bit of E1
    srli $TEMP,$E1,63
    or $C1,$C1,$TEMP

    slli $E1,$E1,1
    # Or in the high bit of E0
    srli $TEMP2,$E0,63
    or $E1,$E1,$TEMP2

    slli $E0,$E0,1
    # Or in the high bit of D0
    srli $TEMP,$D0,63
    or $E0,$E0,$TEMP

    slli $D0,$D0,1

    # Barrett Reduction
    # c = [E0, D0]
    # We want the top 128 bits of the result of c*f
    # We'll get this by computing the low-half (most significant 128 bits in
    # the reflected domain) of clmul(c,fs)<<1 first, then
    # xor in c to complete the calculation

    # AA = [AA1:AA0] = [E0,D0] = c
    # BB = [BB1:BB0] = [qp_low,0]
    # [CC1:CC0] = AA1*BB1
    # [DD1:DD0] = AA0*BB0
    # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
    # Then:
    # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
    # We only need CC0,DD1,DD0,EE0 to compute the low 128 bits of c * qp_low
___

# Reuse scratch registers for the first Barrett multiplication; scoped in its
# own block so the remapped aliases do not shadow lexicals in this scope.
{
my ($CC0,$EE0,$AA1,$AA0,$BB1) = ($A0,$B1,$E0,$D0,$qp_low);

$code .= <<___;

    @{[rv64_clmul $CC0,$AA1,$BB1]}
    #clmul DD0,AA0,BB0 # BB0 is 0, so DD0 = 0
    #clmulh DD1,AA0,BB0 # BB0 is 0, so DD1 = 0
    xor $TEMP,$AA0,$AA1
    #xor TEMP2,BB0,BB1 # TEMP2 = BB1 = qp_low
    @{[rv64_clmul $EE0,$TEMP,$BB1]}

    # Result is [N/A:N/A:DD1+CC0+DD0+EE0:DD0]
    # Simplifying: [CC0+EE0:0]
    xor $TEMP2,$CC0,$EE0
    # Shift left by 1 to correct for bit reflection
    slli $TEMP2,$TEMP2,1

    # xor into c = [E0,D0]
    # Note that only E0 is affected
    xor $E0,$E0,$TEMP2

    # Now, q = [E0,D0]

    # The final step is to compute clmul(q,[qp_low:0])<<1
    # The leftmost 128 bits are the reduced result.
    # Once again, we use Karatsuba multiplication, but many of the terms
    # simplify or cancel out.
    # AA = [AA1:AA0] = [E0,D0] = c
    # BB = [BB1:BB0] = [qp_low,0]
    # [CC1:CC0] = AA1*BB1
    # [DD1:DD0] = AA0*BB0
    # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
    # Then:
    # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
    # We need CC1,CC0,DD0,DD1,EE1,EE0 to compute the leftmost 128 bits of AA*BB

___
}

# Second Barrett multiplication with a fresh register remapping, again in its
# own block to avoid "my masks earlier declaration" warnings under strict.
{
my ($AA1,$AA0,$BB1,$CC1,$CC0,$EE1,$EE0) = ($E0,$D0,$qp_low,$A0,$A1,$C0,$B0);

$code .= <<___;

    @{[rv64_clmul $CC0,$AA1,$BB1]}
    @{[rv64_clmulh $CC1,$AA1,$BB1]}

    #clmul DD0,AA0,BB0 # BB0 = 0 so DD0 = 0
    #clmulh DD1,AA0,BB0 # BB0 = 0 so DD1 = 0

    xor $TEMP,$AA0,$AA1
    #xor TEMP2,BB0,BB1 # BB0 = 0 to TEMP2 == BB1 == qp_low

    @{[rv64_clmul $EE0,$TEMP,$BB1]}
    @{[rv64_clmulh $EE1,$TEMP,$BB1]}

    # Need the DD1+CC0+DD0+EE0 term to shift its leftmost bit into the
    # intermediate result.
    # This is just CC0+EE0, store it in TEMP
    xor $TEMP,$CC0,$EE0

    # Result is [CC1:CC0+CC1+EE1:(a single bit)]<<1
    # Combine into [CC1:CC0]
    xor $CC0,$CC0,$CC1
    xor $CC0,$CC0,$EE1

    # Shift 128-bit quantity, xor in [C1,E1] and store
    slli $CC1,$CC1,1
    srli $TEMP2,$CC0,63
    or $CC1,$CC1,$TEMP2
    # xor in C1
    xor $CC1,$CC1,$C1
    @{[rv64_rev8 $CC1,$CC1]}

    slli $CC0,$CC0,1
    srli $TEMP,$TEMP,63
    or $CC0,$CC0,$TEMP
    # xor in E1
    xor $CC0,$CC0,$E1
    @{[rv64_rev8 $CC0,$CC0]}
    sd $CC1,0(a0)
    sd $CC0,8(a0)

    ret
___
}

}

print $code;

close STDOUT or die "error closing STDOUT: $!";