#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.

$output = pop and open STDOUT,">$output";

($CTXA,$INP,$NUM) = ("A4","B4","A6");           # arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
        =map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
        =map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");                        # circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");
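
# For reference only (a plain-Perl sketch, unused by the code generator): the
# FIPS 180-4 round functions that the assembly below computes; the 32-bit
# right rotations appear in the assembly as left rotations by 32-n. The
# helper names here are illustrative only.
sub ROTR32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
sub Ch32   { my ($e,$f,$g)=@_; (($e&$f)^(~$e&$g))&0xffffffff }         # Ch(e,f,g)
sub Maj32  { my ($a,$b,$c)=@_; ((($a|$b)&$c)|($a&$b))&0xffffffff }     # Maj(a,b,c)
sub Sigma0 { my ($a)=@_; ROTR32($a,2)^ROTR32($a,13)^ROTR32($a,22) }    # ROTL 30,19,10
sub Sigma1 { my ($e)=@_; ROTR32($e,6)^ROTR32($e,11)^ROTR32($e,25) }    # ROTL 26,21,7
sub sigma0 { my ($x)=@_; ROTR32($x,7)^ROTR32($x,18)^($x>>3) }          # ROTL 25,14 + SHRU 3
sub sigma1 { my ($x)=@_; ROTR32($x,17)^ROTR32($x,19)^($x>>10) }        # ROTL 15,13 + SHRU 10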

$code.=<<___;
        .text

        .if     .ASSEMBLER_VERSION<7000000
        .asg    0,__TI_EABI__
        .endif
        .if     __TI_EABI__
        .nocmp
        .asg    sha256_block_data_order,_sha256_block_data_order
        .endif

        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP

        .if     .BIG_ENDIAN
        .asg    SWAP2,MV
        .asg    SWAP4,MV
        .endif

        .global _sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                         ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                              ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]                    ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
   [A0] ADDKPC  __sha256_block,B2
|| [A0] AND     B0,SP,SP                        ; align stack at 64 bytes
        .if     __TI_EABI__
   [A0] MVK     0x00404,B1
|| [A0] MVKL    \$PCR_OFFSET(K256,__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    \$PCR_OFFSET(K256,__sha256_block),$K256
        .else
   [A0] MVK     0x00404,B1
|| [A0] MVKL    (K256-__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    (K256-__sha256_block),$K256
        .endif
   [A0] MVC     B1,AMR                          ; setup circular addressing
|| [A0] MV      SP,$Xia
   [A0] MV      SP,$Xib
|| [A0] ADD     B2,$K256,$K256
|| [A0] MV      $CTXA,$CTXB
|| [A0] SUBAW   SP,2,SP                         ; reserve two words above buffer
        LDW     *${CTXA}[0],$A                  ; load ctx
||      LDW     *${CTXB}[4],$E
        LDW     *${CTXA}[1],$B
||      LDW     *${CTXB}[5],$F
        LDW     *${CTXA}[2],$C
||      LDW     *${CTXB}[6],$G
        LDW     *${CTXA}[3],$D
||      LDW     *${CTXB}[7],$H

        LDNW    *$INP++,$Xn                     ; pre-fetch input
        LDW     *$K256++,$K                     ; pre-fetch K256[0]
        MVK     14,B0                           ; loop counters
        MVK     47,B1
||      ADDAW   $Xia,9,$Xia
outerloop?:
        SUB     A0,1,A0
||      MV      $A,$Actx
||      MV      $E,$Ectx
||      MVD     $B,$Bctx
||      MVD     $F,$Fctx
        MV      $C,$Cctx
||      MV      $G,$Gctx
||      MVD     $D,$Dctx
||      MVD     $H,$Hctx
||      SWAP4   $Xn,$X0

        SPLOOPD 8                               ; BODY_00_14
||      MVC     B0,ILC
||      SWAP2   $X0,$X0

        LDNW    *$INP++,$Xn
||      ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1                       ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H                         ; h = g
||      MV      $F,$G                           ; g = f
||      MV      $X0,$X14
||      SWAP4   $Xn,$X0
        SWAP2   $X0,$X0
||      MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
        MV      $B,$C                           ; c = b
||      MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2
        SPKERNEL
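
;; Round 15 is peeled out of the software-pipelined loop above: besides the
;; usual round computation it starts the modulo-scheduled ring-buffer loads
;; and the sigma0/sigma1 rotations that feed the message-schedule expansion
;; performed in rounds 16-63 below.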
        ROTL    $A,30,$S0                       ; BODY_15
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      LDW     *${Xib}[1],$Xn                  ; modulo-scheduled
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
||      LDW     *${Xib}[2],$X1                  ; modulo-scheduled
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1                       ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H                         ; h = g
||      MV      $F,$G                           ; g = f
||      MV      $X0,$X15
        MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
||      MV      $Xn,$X0                         ; modulo-scheduled
||      LDW     *$Xia,$X9                       ; modulo-scheduled
||      ROTL    $X1,25,$t0e                     ; modulo-scheduled
||      ROTL    $X14,15,$t0a                    ; modulo-scheduled
        SHRU    $X1,3,$s0                       ; modulo-scheduled
||      SHRU    $X14,10,$s1                     ; modulo-scheduled
||      ROTL    $B,0,$C                         ; c = b
||      MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2

        SPLOOPD 10                              ; BODY_16_63
||      MVC     B1,ILC
||      ROTL    $X1,14,$t1e                     ; modulo-scheduled
||      ROTL    $X14,13,$t1a                    ; modulo-scheduled

        XOR     $t0e,$s0,$s0
||      XOR     $t0a,$s1,$s1
||      MV      $X15,$X14
||      MV      $X1,$Xn
        XOR     $t1e,$s0,$s0                    ; sigma0(X[i+1])
||      XOR     $t1a,$s1,$s1                    ; sigma1(X[i+14])
||      LDW     *${Xib}[2],$X1                  ; modulo-scheduled
        ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      ADD     $X9,$X0,$X0                     ; X[i] += X[i+9]
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
||      ADD     $s0,$X0,$X0                     ; X[i] += sigma0(X[i+1])
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $H,$K,$T1                       ; T1 = h + K256[i]
||      ADD     $s1,$X0,$X0                     ; X[i] += sigma1(X[i+14])
        XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
||      ADD     $X0,$T1,$T1                     ; T1 += X[i]
||      STW     $X0,*$Xib++
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
||      MV      $X0,$X15
||      ROTL    $G,0,$H                         ; h = g
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      MV      $F,$G                           ; g = f
||      MV      $Xn,$X0                         ; modulo-scheduled
||      LDW     *++$Xia,$X9                     ; modulo-scheduled
||      ROTL    $X1,25,$t0e                     ; modulo-scheduled
||      ROTL    $X14,15,$t0a                    ; modulo-scheduled
        ROTL    $X1,14,$t1e                     ; modulo-scheduled
||      ROTL    $X14,13,$t1a                    ; modulo-scheduled
||      MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
||      MV      $B,$C                           ; c = b
        MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2
||      SHRU    $X1,3,$s0                       ; modulo-scheduled
||      SHRU    $X14,10,$s1                     ; modulo-scheduled
        SPKERNEL

   [A0] B       outerloop?
|| [A0] LDNW    *$INP++,$Xn                     ; pre-fetch input
|| [A0] ADDK    -260,$K256                      ; rewind K256
||      ADD     $Actx,$A,$A                     ; accumulate ctx
||      ADD     $Ectx,$E,$E
||      ADD     $Bctx,$B,$B
        ADD     $Fctx,$F,$F
||      ADD     $Cctx,$C,$C
||      ADD     $Gctx,$G,$G
||      ADD     $Dctx,$D,$D
||      ADD     $Hctx,$H,$H
|| [A0] LDW     *$K256++,$K                     ; pre-fetch K256[0]

  [!A0] BNOP    RA
||[!A0] MV      $CTXA,$CTXB
  [!A0] MV      FP,SP                           ; restore stack pointer
||[!A0] LDW     *FP[0],FP                       ; restore frame pointer
  [!A0] STW     $A,*${CTXA}[0]                  ; save ctx
||[!A0] STW     $E,*${CTXB}[4]
||[!A0] MVK     0,B0
  [!A0] STW     $B,*${CTXA}[1]
||[!A0] STW     $F,*${CTXB}[5]
||[!A0] MVC     B0,AMR                          ; clear AMR
        STW     $C,*${CTXA}[2]
||      STW     $G,*${CTXB}[6]
        STW     $D,*${CTXA}[3]
||      STW     $H,*${CTXB}[7]
        .endasmfunc

        .if     __TI_EABI__
        .sect   ".text:sha_asm.const"
        .else
        .sect   ".const:sha_asm"
        .endif
        .align  128
K256:
        .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
        .uword  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
        .uword  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
        .uword  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
        .uword  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
        .uword  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
        .uword  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
        .uword  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
        .uword  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
        .uword  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
        .uword  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
        .uword  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
        .uword  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
        .uword  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
        .uword  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
        .uword  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
        .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4

___

print $code;
close STDOUT or die "error closing STDOUT: $!";
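
# Typical invocation (the file names below are only an example): the last
# command-line argument names the output file to which the generated
# assembly is written, e.g.
#
#   perl sha256-c64xplus.pl sha256-c64xplus.S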