#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though the subroutines
# have a _4bit suffix, they are not using any tables, but rely on
# hardware Galois Field Multiply support. Streamed GHASH processes
# a byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
# comparing apples vs. oranges, but the compiler surely could have done
# better, because theoretical [though not necessarily achievable]
# estimate for "4-bit" table-driven implementation is ~12 cycles.

use strict;
use warnings;

# Generator script: builds the assembly text in $code and prints it.
# The last command-line argument, when present and true, names the file
# the generated assembly is written to; otherwise the text goes to the
# STDOUT inherited from the caller.
my $output = pop;
if ($output) {
    # Three-argument open so the name is never parsed for mode characters,
    # and a failure is reported here rather than surfacing only at close.
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Registers in which the generated routines expect their arguments.
my ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments

# A-side working set: A16..A27.
my ($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
# B-side working set: B16..B27.  $H0y is never referenced by the generated
# code; it only keeps the remaining names aligned with their registers.
my ($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
# Constants built at run time: upper byte mask and reduction polynomial.
my ($FF000000,$E10000)=("B30","B31");
# The names below deliberately reuse argument registers once the
# arguments have been consumed.
my ($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
my  $xia="A9";
my ($rem,$res)=("B4","B5");		# $rem zaps $Htable

my $code = '';

# Prologue of both entry points.  _gcm_gmult_4bit sets B0 to 1 ("take a
# single spin") and branches to ghash_loop?, while _gcm_ghash_4bit sets
# B0 to $len>>4 and falls through to it.  The _gcm_gmult_1bit entry is
# wrapped in ".if 0" and therefore not assembled.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
	.endif

	.asg	B3,RA

	.if	0
	.global	_gcm_gmult_1bit
_gcm_gmult_1bit:
	ADDAD	$Htable,2,$Htable
	.endif
	.global	_gcm_gmult_4bit
_gcm_gmult_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
||	LDBU	*++${xip}[15],$x1	; Xi[15]
	MVK	0xFF,$FF000000
||	LDBU	*--${xip},$x0		; Xi[14]
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	BNOP	ghash_loop?
||	MVK	1,B0			; take a single spin

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
||	ZERO	$Z1:$Z0
	SHRU2	$xia,8,$H01u
||	ZERO	$Z3:$Z2
	.endasmfunc

	.global	_gcm_ghash_4bit
_gcm_ghash_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
||	SHRU	$len,4,B0		; reassign len
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
	MVK	0xFF,$FF000000
|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
||	LDDW	*${xip}[1],$Z1:$Z0
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	LDDW	*${xip}[0],$Z3:$Z2

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
	SHRU2	$xia,8,$H01u

|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
	.if	.LITTLE_ENDIAN
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}

ghash_loop?:
	SPLOOPD	6			; 6*16+7
||	MVC	B1,ILC
|| [B0]	SUB	B0,1,B0
||	ZERO	A0
||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
||	SHL	$x1,1,$xia
___

# Functional-unit occupancy of the software-pipelined loop body emitted
# below.  Rows are cycles; a new iteration starts every 6 cycles, so the
# column groups show overlapping iterations.
########____________________________
#  0    D2.     M1          M2      |
#  1            M1                  |
#  2            M1          M2      |
#  3        D1. M1          M2      |
#  4        S1. L1                  |
#  5    S2  S1x L1          D2  L2  |____________________________
#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
#  7/1          L1  S1  D1x S2  M2  |        M1                  |
#  8/2              S1  L1x S2      |        M1          M2      |
#  9/3              S1  L1x         |    D1. M1          M2      |
# 10/4                  D1x         |    S1. L1                  |
# 11/5                              |S2  S1x L1          D2  L2  |____________
# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
#    7/1                                     L1  S1  D1x S2  M2  |        ....
#    8/2                                         S1  L1x S2      |        ....
#####...                                         ................|............
$code.=<<___;
	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
||	XORMPY	$H01u,$xib,$H01y
|| [A0]	LDBU	*--${xip},$x0
	XORMPY	$H1,$xia,$H1x		; 1
	XORMPY	$H2,$xia,$H2x		; 2
||	XORMPY	$H2u,$xib,$H2y
	XORMPY	$H3,$xia,$H3x		; 3
||	XORMPY	$H3u,$xib,$H3y
||[!A0]	MVK.D	15,A0				; *--${xip} counter
	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
|| [A0]	SUB.S	A0,1,A0
	XOR.L	$H1x,$Z1,$Z1		; 5
||	AND.D	$H01y,$FF000000,$H0z
||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
||	SHL	$x0,1,$xib
||	SHL	$x0,1,$xia

	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
||	SHL	$Z0,1,$rem		;	; rem=Z<<1
||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
||	AND.L	$H1y,$FF000000,$H1z
	XOR.L	$H3x,$Z3,$Z3		; 7/1
||	SHRMB.S	$Z2,$Z1,$Z1
||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
||	AND.S	$H2y,$FF000000,$H2z
||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
	XOR.L	$H1z,$Z1,$Z1		; 8/2
||	SHRMB.S	$Z3,$Z2,$Z2
||	AND.S	$H3y,$FF000000,$H3z
	XOR.L	$H2z,$Z2,$Z2		; 9/3
||	SHRU	$Z3,8,$Z3
	XOR.D	$H3z,$Z3,$Z3		; 10/4
	NOP				; 11/5

	SPKERNEL 0,2
||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res

	; input pre-fetch is possible where D1 slot is available...
   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
	NOP				; 10/-
	.if	.LITTLE_ENDIAN
	SWAP2	$Z0,$Z1			; 11/-
||	SWAP4	$Z1,$Z0
	SWAP4	$Z1,$Z1			; 12/-
||	SWAP2	$Z0,$Z0
	SWAP2	$Z2,$Z3
||	SWAP4	$Z3,$Z2
||[!B0]	BNOP	RA
	SWAP4	$Z3,$Z3
||	SWAP2	$Z2,$Z2
|| [B0]	BNOP	ghash_loop?
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
  [!B0]	BNOP	RA			; 11/-
   [B0]	BNOP	ghash_loop?		; 12/-
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}
	.endasmfunc

	.sect	.const
	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Emit the assembled text; buffered write errors only show up at close,
# hence the explicit check.
print $code;
close STDOUT or die "error closing STDOUT: $!";