#! /usr/bin/env perl
# Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================

# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
# X[16] vector is packed to 8 64-bit registers and as result nothing
# is spilled on stack. In addition input data is loaded in compact
# instruction sequence, thus minimizing the window when the code is
# subject to [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# amount of active threads exceeds the number of physical cores.

# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. Multi-process benchmark saturates at 11x
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.

# The last command-line argument, when present, names the output file.
# Without it the generated assembly goes to the inherited STDOUT.
# A failed open is fatal instead of being silently ignored.
$output=pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Register allocation for the software code path.
# @X holds the 16-word message schedule packed two 32-bit words per
# 64-bit register; the word loaded first sits in the upper half.
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";	# set to 0x0000000100000001 at .Lsoftware, see Xupdate
$tmp64="%g3";	# ninth input doubleword, used for unaligned input only
$Xi="%g4";	# scratch: upper-half word extracted from a packed @X register
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);	# indexed as @K[$i/20]

$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";

# BODY_00_15($i,$a,$b,$c,$d,$e)
#
# Appends to $code the instructions for round $i (0..15).  $a..$e are
# the register names currently playing the roles of the five working
# variables.  The emitted code adds to $e the round constant, $a
# rotated left by 5 (as separate sll/srl parts), ($b&$c)|(~$b&$d) and
# the message word, and rotates $b left by 30 in place.
#
# Even rounds take the message word from $Xi, which must already hold
# the upper half of the packed register; odd rounds add the packed
# register itself and, except for the last one, preload $Xi with the
# upper half of the next packed register.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;

$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
	add	$tmp0,$e,$e
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
if ($i&1 && $i<15) {
	$code.=
	"	srlx	@X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
	add	$tmp1,$e,$e
___
}

# Xupdate($i,$a,$b,$c,$d,$e)
#
# Appends the start of round $i (>=16): the first three round
# instructions (tagged "!!" in the output), which leave $a<<5 in $tmp0
# and $a>>27 in $tmp1 and add the round constant to $e.
#
# On even $i those instructions are interleaved with the message
# schedule update for the packed pair @X[$j%8], $j=$i/2.  Both words
# are updated at once: the register is xor-ed with @X[($j+1)%8],
# @X[($j+4)%8] and the 64-bit value straddling @X[($j+6)%8] (low half)
# and @X[($j+7)%8] (high half), then each 32-bit half is rotated left
# by one with the help of the $rot1m mask.  $Xi is clobbered.
# On odd $i the pair has already been updated, so only the three round
# instructions are emitted.
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;

if ($i&1) {
$code.=<<___;
	sll	$a,5,$tmp0		!! $i
	add	@K[$i/20],$e,$e
	srl	$a,27,$tmp1
___
} else {
$code.=<<___;
	sllx	@X[($j+6)%8],32,$Xi	! Xupdate($i)
	xor	@X[($j+1)%8],@X[$j%8],@X[$j%8]
	srlx	@X[($j+7)%8],32,$tmp1
	xor	@X[($j+4)%8],@X[$j%8],@X[$j%8]
	sll	$a,5,$tmp0		!! $i
	or	$tmp1,$Xi,$Xi
	add	@K[$i/20],$e,$e		!!
	xor	$Xi,@X[$j%8],@X[$j%8]
	srlx	@X[$j%8],31,$Xi
	add	@X[$j%8],@X[$j%8],@X[$j%8]
	and	$Xi,$rot1m,$Xi
	andn	@X[$j%8],$rot1m,@X[$j%8]
	srl	$a,27,$tmp1		!!
	or	$Xi,@X[$j%8],@X[$j%8]
___
}
}

# BODY_16_19($i,$a,$b,$c,$d,$e)
#
# Appends round $i (16..19): Xupdate() followed by the remainder of the
# round using the same boolean function as BODY_00_15,
# ($b&$c)|(~$b&$d).  On even $i the message word is first extracted
# from the upper half of the packed register into $Xi.
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;		# lexical, as in the sibling BODY_* subs

	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	add	$xi,$e,$e
	andn	$d,$b,$tmp1
	srl	$b,2,$b
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends round $i using the parity function $b^$c^$d.  Called for
# rounds 20..39 and again for rounds 60..79; the round constant is
# picked by Xupdate() through @K[$i/20], so the same body serves both.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	xor	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	xor	$d,$tmp0,$tmp1
	srl	$b,2,$b
	add	$tmp1,$e,$e
	or	$tmp2,$b,$b
	add	$xi,$e,$e
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e)
#
# Appends round $i (40..59) using the majority function, computed as
# ($b&$c)|($d&($b|$c)).
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
	&Xupdate(@_);
    if ($i&1) {
	$xi=@X[($i/2)%8];
    } else {
	$xi=$Xi;
	$code.="\tsrlx	@X[($i/2)%8],32,$xi\n";
    }
$code.=<<___;
	add	$tmp0,$e,$e		!!
	and	$c,$b,$tmp0
	add	$tmp1,$e,$e
	sll	$b,30,$tmp2
	or	$c,$b,$tmp1
	srl	$b,2,$b
	and	$d,$tmp1,$tmp1
	add	$xi,$e,$e
	or	$tmp1,$tmp0,$tmp1
	or	$tmp2,$b,$b
	add	$tmp1,$e,$e
___
}

# Function prologue.  The entry code tests CFR_SHA1 in
# OPENSSL_sparcv9cap_P[1] and either runs the hardware path (the SHA1
# opcode is emitted as a raw .word, with separate loops for 8-byte
# aligned and unaligned input) or branches to .Lsoftware.  The software
# path sets up $rot1m and the round constants, then loads one 64-byte
# block per .Lloop iteration, shifting the doublewords into place when
# the input pointer is not 8-byte aligned.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.align	32
.globl	sha1_block_data_order
sha1_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA1, %g0
	be	.Lsoftware
	nop

	ld	[%o0 + 0x00], %f0	! load context
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x0c], %f3
	bne,pn	%icc, .Lhwunaligned
	 ld	[%o0 + 0x10], %f4

.Lhw_loop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02820		! SHA1

	bne,pt	SIZE_T_CC, .Lhw_loop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	retl
	st	%f4, [%o0 + 0x10]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02820		! SHA1

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop

.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME,%sp
	sllx	$len,6,$len
	add	$inp,$len,$len

	or	%g0,1,$rot1m
	sllx	$rot1m,32,$rot1m
	or	$rot1m,1,$rot1m

	ld	[$ctx+0],$A
	ld	[$ctx+4],$B
	ld	[$ctx+8],$C
	ld	[$ctx+12],$D
	ld	[$ctx+16],$E
	andn	$inp,7,$tmp0

	sethi	%hi(0x5a827999),$K_00_19
	or	$K_00_19,%lo(0x5a827999),$K_00_19
	sethi	%hi(0x6ed9eba1),$K_20_39
	or	$K_20_39,%lo(0x6ed9eba1),$K_20_39
	sethi	%hi(0x8f1bbcdc),$K_40_59
	or	$K_40_59,%lo(0x8f1bbcdc),$K_40_59
	sethi	%hi(0xca62c1d6),$K_60_79
	or	$K_60_79,%lo(0xca62c1d6),$K_60_79

.Lloop:
	ldx	[$tmp0+0],@X[0]
	ldx	[$tmp0+16],@X[2]
	ldx	[$tmp0+32],@X[4]
	ldx	[$tmp0+48],@X[6]
	and	$inp,7,$tmp1
	ldx	[$tmp0+8],@X[1]
	sll	$tmp1,3,$tmp1
	ldx	[$tmp0+24],@X[3]
	subcc	%g0,$tmp1,$tmp2	! should be 64-$tmp1, but -$tmp1 works too
	ldx	[$tmp0+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$tmp0+56],@X[7]

	sllx	@X[0],$tmp1,@X[0]
	ldx	[$tmp0+64],$tmp64
___
# Unaligned input: shift every doubleword left by the byte offset (in
# bits) and pull in the leading bits of its successor.
for($i=0;$i<7;$i++)
{   $code.=<<___;
	srlx	@X[$i+1],$tmp2,$Xi
	sllx	@X[$i+1],$tmp1,@X[$i+1]
	or	$Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
	srlx	$tmp64,$tmp2,$tmp64
	or	$tmp64,@X[7],@X[7]
.Laligned:
	srlx	@X[0],32,$Xi
___
# The 80 rounds, fully unrolled.  Instead of moving data between the
# working registers, the register names in @V are rotated after every
# round.  Rounds 60..79 reuse BODY_20_39.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++)	{ &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add the working variables into the context, store it back,
# and loop until the input pointer reaches the end pointer.
$code.=<<___;

	ld	[$ctx+0],@X[0]
	ld	[$ctx+4],@X[1]
	ld	[$ctx+8],@X[2]
	ld	[$ctx+12],@X[3]
	add	$inp,64,$inp
	ld	[$ctx+16],@X[4]
	cmp	$inp,$len

	add	$A,@X[0],$A
	st	$A,[$ctx+0]
	add	$B,@X[1],$B
	st	$B,[$ctx+4]
	add	$C,@X[2],$C
	st	$C,[$ctx+8]
	add	$D,@X[3],$D
	st	$D,[$ctx+12]
	add	$E,@X[4],$E
	st	$E,[$ctx+16]

	bne	SIZE_T_CC,.Lloop
	andn	$inp,7,$tmp0

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.

# unvis($mnemonic,$rs1,$rs2,$rd)
#
# Returns a ".word 0x... !<original instruction>" line encoding the
# three-operand VIS instruction, or the plain "mnemonic\trs1,rs2,rd"
# text when the mnemonic is not in the table below or an operand
# cannot be encoded (not an %fN register, or an odd-numbered register
# >= 32).
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Parenthesised on purpose: "my $ref,$opf;" would make only $ref
# lexical and leave $opf a package global.
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# $_ aliases each operand in turn, so the register names are
	# replaced in place by their 5-bit field values.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unalignaddr($mnemonic,$rs1,$rs2,$rd)
#
# Returns a ".word 0x... !<original instruction>" line encoding an
# alignaddr on integer registers, or the plain instruction text when an
# operand is not of the form %g/%o/%l/%i followed by 0-7.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    # Map each register name to its number: window bias plus index.
    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/)	{ $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}

# Post-process the generated text line by line: evaluate any
# `...` expressions, replace VIS instructions with their .word
# encodings, and print the result.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";