#! /usr/bin/env perl
# Copyright 2007-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================

# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.

# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times physical cores threads and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
#     in-order, i.e. load instruction has to complete prior next
#     instruction in given thread is executed, even if the latter is
#     not dependent on load result! This means that on T1 two 32-bit
#     loads are always slower than one 64-bit load. Once again this
#     is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
#     2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.

# $output is the last argument if it looks like a file (it has an extension)
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# NOTE(review): 2-arg open of a build-time output path; mode characters in
# $output would be interpreted by open().  Fine for a generator script.
$output and open STDOUT,">$output";

# Select SHA-256 vs SHA-512 parameters from the requested output file name.
# Everything below ($SZ, load/store/shift mnemonics, rotation/shift constants,
# round count, register assignments) is keyed off this choice.
# NOTE(review): if $output is undef the match warns under -w; harmless here
# since warnings are not enabled, and undef selects the SHA-256 branch.
if ($output =~ /512/) {
    $label="512";
    $SZ=8;                      # word size in bytes
    $LD="ldx";                  # load from memory
    $ST="stx";                  # store to memory
    $SLL="sllx";                # shift left logical
    $SRL="srlx";                # shift right logical
    @Sigma0=(28,34,39);         # big-Sigma0 rotate amounts (FIPS 180-4)
    @Sigma1=(14,18,41);         # big-Sigma1 rotate amounts
    @sigma0=( 7, 1, 8);         # right shift first
    @sigma1=( 6,19,61);         # right shift first
    $lastK=0x817;               # low 12 bits of final K constant (loop sentinel)
    $rounds=80;
    $align=4;

    $locals=16*$SZ;             # X[16] kept on stack for SHA-512

    # Working variables a..h live in %o0-%o5, %g1, %o7.
    $A="%o0";
    $B="%o1";
    $C="%o2";
    $D="%o3";
    $E="%o4";
    $F="%o5";
    $G="%g1";
    $H="%o7";
    @V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
    $label="256";
    $SZ=4;                      # word size in bytes
    $LD="ld";                   # load from memory
    $ST="st";                   # store to memory
    $SLL="sll";                 # shift left logical
    $SRL="srl";                 # shift right logical
    @Sigma0=( 2,13,22);
    @Sigma1=( 6,11,25);
    @sigma0=( 3, 7,18);         # right shift first
    @sigma1=(10,17,19);         # right shift first
    $lastK=0x8f2;               # low 12 bits of final K constant (loop sentinel)
    $rounds=64;
    $align=8;

    $locals=0;                  # X[16] is register resident
    # Two 32-bit X words are packed per 64-bit register (see module header).
    @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");

    # Working variables a..h live in the local registers.
    $A="%l0";
    $B="%l1";
    $C="%l2";
    $D="%l3";
    $E="%l4";
    $F="%l5";
    $G="%l6";
    $H="%l7";
    @V=($A,$B,$C,$D,$E,$F,$G,$H);
}
# Scratch temporaries shared by all round code.
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";

# Function arguments per SPARC calling convention (after `save`).
$ctx="%i0";                     # hash context (state words)
$inp="%i1";                     # input block pointer
$len="%i2";                     # number of blocks (converted to end pointer)
$Ktbl="%i3";                    # K constant table pointer
$tmp31="%i4";                   # misalignment of $inp, in bits
$tmp32="%i5";                   # complementary shift amount

########### SHA256
# $Xload->(i, a..h): emit code for round i<16 that produces h+X[i] in $T1.
# On i==0 it also emits the bulk input load: eight 64-bit loads (two message
# words each) plus a shift/or fix-up sequence for misaligned input.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i==0) {
$code.=<<___;
	ldx	[$inp+0],@X[0]
	ldx	[$inp+16],@X[2]
	ldx	[$inp+32],@X[4]
	ldx	[$inp+48],@X[6]
	ldx	[$inp+8],@X[1]
	ldx	[$inp+24],@X[3]
	subcc	%g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
	ldx	[$inp+40],@X[5]
	bz,pt	%icc,.Laligned
	ldx	[$inp+56],@X[7]

	sllx	@X[0],$tmp31,@X[0]
	ldx	[$inp+64],$T1
___
# Misaligned case: funnel-shift each adjacent register pair into place.
for($j=0;$j<7;$j++)
{   $code.=<<___;
	srlx	@X[$j+1],$tmp32,$tmp1
	sllx	@X[$j+1],$tmp31,@X[$j+1]
	or	$tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
	srlx	$T1,$tmp32,$T1
	or	$T1,@X[7],@X[7]
.Laligned:
___
    }

    # Odd i: X[i] is the low half of the packed register; even i: high half.
    if ($i&1) {
        $code.="\tadd\t@X[$i/2],$h,$T1\n";
    } else {
        $code.="\tsrlx\t@X[$i/2],32,$T1\n\tadd\t$h,$T1,$T1\n";
    }
} if ($SZ==4);

########### SHA512
# $Xload->(i, a..h): SHA-512 flavour.  Input is read as 32-bit halves
# (32-/64-bit ABI duality, see header), assembled into a 64-bit word in
# $tmp2, stored to the stack-resident X[16], and h+X[i] left in $T1.
# @pair maps round i to the %l registers staging the current halves.
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));

$code.=<<___ if ($i==0);
	ld	[$inp+0],%l0
	ld	[$inp+4],%l1
	ld	[$inp+8],%l2
	ld	[$inp+12],%l3
	ld	[$inp+16],%l4
	ld	[$inp+20],%l5
	ld	[$inp+24],%l6
	cmp	$tmp31,0
	ld	[$inp+28],%l7
___
$code.=<<___ if ($i<15);
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	sllx	@pair[0],$tmp0,$tmp1
	`"ld	[$inp+".eval(32+0+$i*8)."],@pair[0]"	if ($i<12)`
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	or	@pair[1],$tmp2,$tmp2
	`"ld	[$inp+".eval(32+4+$i*8)."],@pair[1]"	if ($i<12)`
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
# Round 12: annulled conditional load covers the misaligned tail word.
$code.=<<___ if ($i==12);
	bnz,a,pn	%icc,.+8
	ld	[$inp+128],%l0
___
# Round 15: interleave the Xload with the stack reloads that prime the
# %l registers for the first Xupdate round (i+1, i+9, i+14, i+0 slots).
$code.=<<___ if ($i==15);
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	@pair[1],$tmp31,$tmp2	! Xload($i)
	add	$tmp31,32,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	sllx	@pair[0],$tmp0,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	srlx	@pair[2],$tmp32,@pair[1]
	or	$tmp1,$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
	or	@pair[1],$tmp2,$tmp2
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	add	$h,$tmp2,$T1
	$ST	$tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

########### common
# One SHA round: T1 += h + Sigma1(e) + Ch(e,f,g) + K[i]; h = Sigma0(a) +
# Maj(a,b,c); d += T1; h += T1.  For i<16, $Xload supplies h+X[i] in $T1;
# for i>=16 the caller ($BODY_16_XX) has already computed X[i] into $T1.
# Rotations are synthesized as shift pairs via $SRL/$SLL so the same text
# serves both 32- and 64-bit variants.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    if ($i<16) {
        &$Xload(@_);
    } else {
        $code.="\tadd\t$h,$T1,$T1\n";
    }

$code.=<<___;
	$SRL	$e,@Sigma1[0],$h	!! $i
	xor	$f,$g,$tmp2
	$SLL	$e,`$SZ*8-@Sigma1[2]`,$tmp1
	and	$e,$tmp2,$tmp2
	$SRL	$e,@Sigma1[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$e,@Sigma1[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$e,`$SZ*8-@Sigma1[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$g,$tmp2,$tmp2		! Ch(e,f,g)
	xor	$tmp1,$h,$tmp0		! Sigma1(e)

	$SRL	$a,@Sigma0[0],$h
	add	$tmp2,$T1,$T1
	$LD	[$Ktbl+`$i*$SZ`],$tmp2	! K[$i]
	$SLL	$a,`$SZ*8-@Sigma0[2]`,$tmp1
	add	$tmp0,$T1,$T1
	$SRL	$a,@Sigma0[1],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[1]`,$tmp1
	xor	$tmp0,$h,$h
	$SRL	$a,@Sigma0[2],$tmp0
	xor	$tmp1,$h,$h
	$SLL	$a,`$SZ*8-@Sigma0[0]`,$tmp1
	xor	$tmp0,$h,$h
	xor	$tmp1,$h,$h		! Sigma0(a)

	or	$a,$b,$tmp0
	and	$a,$b,$tmp1
	and	$c,$tmp0,$tmp0
	or	$tmp0,$tmp1,$tmp1	! Maj(a,b,c)
	add	$tmp2,$T1,$T1		! +=K[$i]
	add	$tmp1,$h,$h

	add	$T1,$d,$d
	add	$T1,$h,$h
___
}

########### SHA256
# Rounds 16..63 for SHA-256: compute the message-schedule update
# X[i] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i] into $T1 and the
# packed @X register, then fall through to the common round body.  The
# even/odd branches pick the high/low 32-bit half of the packed registers.
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;

    if ($i&1) {
        $xi=$tmp32;
        $code.="\tsrlx\t@X[(($i+1)/2)%8],32,$xi\n";
    } else {
        $xi=@X[(($i+1)/2)%8];
    }
$code.=<<___;
	srl	$xi,@sigma0[0],$T1	!! Xupdate($i)
	sll	$xi,`32-@sigma0[2]`,$tmp1
	srl	$xi,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sll	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srl	$xi,@sigma0[2],$tmp0
	xor	$tmp1,$T1,$T1
___
    if ($i&1) {
        $xi=@X[(($i+14)/2)%8];
    } else {
        $xi=$tmp32;
        $code.="\tsrlx\t@X[(($i+14)/2)%8],32,$xi\n";
    }
$code.=<<___;
	srl	$xi,@sigma1[0],$tmp2
	xor	$tmp0,$T1,$T1		! T1=sigma0(X[i+1])
	sll	$xi,`32-@sigma1[2]`,$tmp1
	srl	$xi,@sigma1[1],$tmp0
	xor	$tmp1,$tmp2,$tmp2
	sll	$tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
	xor	$tmp0,$tmp2,$tmp2
	srl	$xi,@sigma1[2],$tmp0
	xor	$tmp1,$tmp2,$tmp2
___
    # Merge the new X[i] back into its packed register half.
    if ($i&1) {
        $xi=@X[($i/2)%8];
$code.=<<___;
	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
	xor	$tmp0,$tmp2,$tmp2	! sigma1(X[i+14])
	srl	@X[($i/2)%8],0,$tmp0
	add	$tmp2,$tmp1,$tmp1
	add	$xi,$T1,$T1		! +=X[i]
	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	srl	$T1,0,$T1
	or	$T1,@X[($i/2)%8],@X[($i/2)%8]
___
    } else {
        $xi=@X[(($i+9)/2)%8];
$code.=<<___;
	srlx	@X[($i/2)%8],32,$tmp1	! X[i]
	xor	$tmp0,$tmp2,$tmp2	! sigma1(X[i+14])
	add	$xi,$T1,$T1		! +=X[i+9]
	add	$tmp2,$tmp1,$tmp1
	srl	@X[($i/2)%8],0,@X[($i/2)%8]
	add	$tmp1,$T1,$T1

	sllx	$T1,32,$tmp0
	or	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
    }
    &BODY_00_15(@_);
} if ($SZ==4);

########### SHA512
# Rounds 16..79 for SHA-512: X[16] lives on the stack as 32-bit halves;
# 64-bit values are assembled with sllx/or in registers, the updated X[i]
# is stored back, and the loads for the next round's operands are
# interleaved with the arithmetic to hide L2 latency (see header notes).
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));

$code.=<<___;
	sllx	%l2,32,$tmp0		!! Xupdate($i)
	or	%l3,$tmp0,$tmp0

	srlx	$tmp0,@sigma0[0],$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
	sllx	$tmp0,`64-@sigma0[2]`,$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
	srlx	$tmp0,@sigma0[1],$tmp0
	xor	$tmp1,$T1,$T1
	sllx	$tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
	xor	$tmp0,$T1,$T1
	srlx	$tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
	xor	$tmp1,$T1,$T1
	sllx	%l6,32,$tmp2
	xor	$tmp0,$T1,$T1		! sigma0(X[$i+1])
	or	%l7,$tmp2,$tmp2

	srlx	$tmp2,@sigma1[0],$tmp1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
	sllx	$tmp2,`64-@sigma1[2]`,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
	srlx	$tmp2,@sigma1[1],$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	$tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
	xor	$tmp2,$tmp1,$tmp1
	srlx	$tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
	xor	$tmp0,$tmp1,$tmp1
	sllx	%l4,32,$tmp0
	xor	$tmp2,$tmp1,$tmp1	! sigma1(X[$i+14])
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
	or	%l5,$tmp0,$tmp0
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

	sllx	%l0,32,$tmp2
	add	$tmp1,$T1,$T1
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
	or	%l1,$tmp2,$tmp2
	add	$tmp0,$T1,$T1		! +=X[$i+9]
	ld	[%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
	add	$tmp2,$T1,$T1		! +=X[$i]
	$ST	$T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
    &BODY_00_15(@_);
} if ($SZ==8);

# Assembly prologue.  STACK_BIAS/STACK_FRAME/SIZE_T_CC/CFR_SHA* and the
# SPARC_* macros come from crypto/sparc_arch.h at assembly time.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
K${label}:
.type	K${label},#object
___
# FIPS 180-4 round-constant table; 64-bit constants are emitted as .long
# pairs so the table assembles identically for 32- and 64-bit targets.
if ($SZ==4) {
$code.=<<___;
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
	.long	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
	.long	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
	.long	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
	.long	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
	.long	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
	.long	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
	.long	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
	.long	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
	.long	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
	.long	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
	.long	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
	.long	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
	.long	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
	.long	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
	.long	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
	.long	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
	.long	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
	.long	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
	.long	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
	.long	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
	.long	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
	.long	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
	.long	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
	.long	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
	.long	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
	.long	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
	.long	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
	.long	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
	.long	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
	.long	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
	.long	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
	.long	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
	.long	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
	.long	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
	.long	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
	.long	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
	.long	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
	.long	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
	.long	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
	.long	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
# Function entry: probe the capability word and branch to the software
# implementation unless the T4 SHA opcode is available.
$code.=<<___;
.size	K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	sha${label}_block_data_order
.align	32
sha${label}_block_data_order:
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+4],%g1		! OPENSSL_sparcv9cap_P[1]

	andcc	%g1, CFR_SHA${label}, %g0
	be	.Lsoftware
	nop
___
# T4 hardware path, SHA-512: context in %f0-%f14, one 128-byte block per
# iteration of the ".word 0x81b02860" (SHA512) opcode; a separate loop
# handles 8-byte-misaligned input via alignaddr/faligndata.
$code.=<<___ if ($SZ==8); 		# SHA512
	ldd	[%o0 + 0x00], %f0	! load context
	ldd	[%o0 + 0x08], %f2
	ldd	[%o0 + 0x10], %f4
	ldd	[%o0 + 0x18], %f6
	ldd	[%o0 + 0x20], %f8
	ldd	[%o0 + 0x28], %f10
	andcc	%o1, 0x7, %g0
	ldd	[%o0 + 0x30], %f12
	bne,pn	%icc, .Lhwunaligned
	ldd	[%o0 + 0x38], %f14

.Lhwaligned_loop:
	ldd	[%o1 + 0x00], %f16
	ldd	[%o1 + 0x08], %f18
	ldd	[%o1 + 0x10], %f20
	ldd	[%o1 + 0x18], %f22
	ldd	[%o1 + 0x20], %f24
	ldd	[%o1 + 0x28], %f26
	ldd	[%o1 + 0x30], %f28
	ldd	[%o1 + 0x38], %f30
	ldd	[%o1 + 0x40], %f32
	ldd	[%o1 + 0x48], %f34
	ldd	[%o1 + 0x50], %f36
	ldd	[%o1 + 0x58], %f38
	ldd	[%o1 + 0x60], %f40
	ldd	[%o1 + 0x68], %f42
	ldd	[%o1 + 0x70], %f44
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x78], %f46
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwaligned_loop
	nop

.Lhwfinish:
	std	%f0, [%o0 + 0x00]	! store context
	std	%f2, [%o0 + 0x08]
	std	%f4, [%o0 + 0x10]
	std	%f6, [%o0 + 0x18]
	std	%f8, [%o0 + 0x20]
	std	%f10, [%o0 + 0x28]
	std	%f12, [%o0 + 0x30]
	retl
	std	%f14, [%o0 + 0x38]

.align	16
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f18
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f20
	ldd	[%o1 + 0x10], %f22
	ldd	[%o1 + 0x18], %f24
	ldd	[%o1 + 0x20], %f26
	ldd	[%o1 + 0x28], %f28
	ldd	[%o1 + 0x30], %f30
	ldd	[%o1 + 0x38], %f32
	ldd	[%o1 + 0x40], %f34
	ldd	[%o1 + 0x48], %f36
	ldd	[%o1 + 0x50], %f38
	ldd	[%o1 + 0x58], %f40
	ldd	[%o1 + 0x60], %f42
	ldd	[%o1 + 0x68], %f44
	ldd	[%o1 + 0x70], %f46
	ldd	[%o1 + 0x78], %f48
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x80], %f50
	add	%o1, 0x80, %o1
	prefetch [%o1 + 63], 20
	prefetch [%o1 + 64+63], 20

	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22
	faligndata %f26, %f28, %f24
	faligndata %f28, %f30, %f26
	faligndata %f30, %f32, %f28
	faligndata %f32, %f34, %f30
	faligndata %f34, %f36, %f32
	faligndata %f36, %f38, %f34
	faligndata %f38, %f40, %f36
	faligndata %f40, %f42, %f38
	faligndata %f42, %f44, %f40
	faligndata %f44, %f46, %f42
	faligndata %f46, %f48, %f44
	faligndata %f48, %f50, %f46

	.word	0x81b02860		! SHA512

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f50, %f50, %f18	! %f18=%f50

	ba	.Lhwfinish
	nop
___
# T4 hardware path, SHA-256: context in %f0-%f7, 64-byte blocks, opcode
# ".word 0x81b02840" (SHA256); same aligned/unaligned split as above.
$code.=<<___ if ($SZ==4); 		# SHA256
	ld	[%o0 + 0x00], %f0
	ld	[%o0 + 0x04], %f1
	ld	[%o0 + 0x08], %f2
	ld	[%o0 + 0x0c], %f3
	ld	[%o0 + 0x10], %f4
	ld	[%o0 + 0x14], %f5
	andcc	%o1, 0x7, %g0
	ld	[%o0 + 0x18], %f6
	bne,pn	%icc, .Lhwunaligned
	ld	[%o0 + 0x1c], %f7

.Lhwloop:
	ldd	[%o1 + 0x00], %f8
	ldd	[%o1 + 0x08], %f10
	ldd	[%o1 + 0x10], %f12
	ldd	[%o1 + 0x18], %f14
	ldd	[%o1 + 0x20], %f16
	ldd	[%o1 + 0x28], %f18
	ldd	[%o1 + 0x30], %f20
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x38], %f22
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwloop
	nop

.Lhwfinish:
	st	%f0, [%o0 + 0x00]	! store context
	st	%f1, [%o0 + 0x04]
	st	%f2, [%o0 + 0x08]
	st	%f3, [%o0 + 0x0c]
	st	%f4, [%o0 + 0x10]
	st	%f5, [%o0 + 0x14]
	st	%f6, [%o0 + 0x18]
	retl
	st	%f7, [%o0 + 0x1c]

.align	8
.Lhwunaligned:
	alignaddr %o1, %g0, %o1

	ldd	[%o1 + 0x00], %f10
.Lhwunaligned_loop:
	ldd	[%o1 + 0x08], %f12
	ldd	[%o1 + 0x10], %f14
	ldd	[%o1 + 0x18], %f16
	ldd	[%o1 + 0x20], %f18
	ldd	[%o1 + 0x28], %f20
	ldd	[%o1 + 0x30], %f22
	ldd	[%o1 + 0x38], %f24
	subcc	%o2, 1, %o2		! done yet?
	ldd	[%o1 + 0x40], %f26
	add	%o1, 0x40, %o1
	prefetch [%o1 + 63], 20

	faligndata %f10, %f12, %f8
	faligndata %f12, %f14, %f10
	faligndata %f14, %f16, %f12
	faligndata %f16, %f18, %f14
	faligndata %f18, %f20, %f16
	faligndata %f20, %f22, %f18
	faligndata %f22, %f24, %f20
	faligndata %f24, %f26, %f22

	.word	0x81b02840		! SHA256

	bne,pt	SIZE_T_CC, .Lhwunaligned_loop
	for	%f26, %f26, %f10	! %f10=%f26

	ba	.Lhwfinish
	nop
___
# Software path prologue: allocate frame (+X[16] stack area for SHA-512),
# split $inp into aligned base + bit offset, turn $len into an end pointer,
# and locate the K table PC-relatively.
$code.=<<___;
.align	16
.Lsoftware:
	save	%sp,-STACK_FRAME-$locals,%sp
	and	$inp,`$align-1`,$tmp31
	sllx	$len,`log(16*$SZ)/log(2)`,$len
	andn	$inp,`$align-1`,$inp
	sll	$tmp31,3,$tmp31
	add	$inp,$len,$len
___
$code.=<<___ if ($SZ==8); 		# SHA512
	mov	32,$tmp32
	sub	$tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic:	call	.+8
	add	%o7,K${label}-.Lpic,$Ktbl

	$LD	[$ctx+`0*$SZ`],$A
	$LD	[$ctx+`1*$SZ`],$B
	$LD	[$ctx+`2*$SZ`],$C
	$LD	[$ctx+`3*$SZ`],$D
	$LD	[$ctx+`4*$SZ`],$E
	$LD	[$ctx+`5*$SZ`],$F
	$LD	[$ctx+`6*$SZ`],$G
	$LD	[$ctx+`7*$SZ`],$H

.Lloop:
___
# Unroll 16 plain rounds, then a 16-round block repeated until the last K
# constant (detected via its low 12 bits == $lastK) has been consumed.
for ($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++)	{ &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	and	$tmp2,0xfff,$tmp2
	cmp	$tmp2,$lastK
	bne	.L16_xx
	add	$Ktbl,`16*$SZ`,$Ktbl	! Ktbl+=16

___
# Feed-forward: add working variables back into the context and store.
$code.=<<___ if ($SZ==4); 		# SHA256
	$LD	[$ctx+`0*$SZ`],@X[0]
	$LD	[$ctx+`1*$SZ`],@X[1]
	$LD	[$ctx+`2*$SZ`],@X[2]
	$LD	[$ctx+`3*$SZ`],@X[3]
	$LD	[$ctx+`4*$SZ`],@X[4]
	$LD	[$ctx+`5*$SZ`],@X[5]
	$LD	[$ctx+`6*$SZ`],@X[6]
	$LD	[$ctx+`7*$SZ`],@X[7]

	add	$A,@X[0],$A
	$ST	$A,[$ctx+`0*$SZ`]
	add	$B,@X[1],$B
	$ST	$B,[$ctx+`1*$SZ`]
	add	$C,@X[2],$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$D,@X[3],$D
	$ST	$D,[$ctx+`3*$SZ`]
	add	$E,@X[4],$E
	$ST	$E,[$ctx+`4*$SZ`]
	add	$F,@X[5],$F
	$ST	$F,[$ctx+`5*$SZ`]
	add	$G,@X[6],$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$H,@X[7],$H
	$ST	$H,[$ctx+`7*$SZ`]
___
# SHA-512 feed-forward reads the context as 32-bit halves (ABI duality)
# and reassembles 64-bit values before adding and storing.
$code.=<<___ if ($SZ==8); 		# SHA512
	ld	[$ctx+`0*$SZ+0`],%l0
	ld	[$ctx+`0*$SZ+4`],%l1
	ld	[$ctx+`1*$SZ+0`],%l2
	ld	[$ctx+`1*$SZ+4`],%l3
	ld	[$ctx+`2*$SZ+0`],%l4
	ld	[$ctx+`2*$SZ+4`],%l5
	ld	[$ctx+`3*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`3*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$A,$A
	add	$tmp1,$B,$B
	$ST	$A,[$ctx+`0*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$B,[$ctx+`1*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$C,$C
	$ST	$C,[$ctx+`2*$SZ`]
	add	$T1,$D,$D
	$ST	$D,[$ctx+`3*$SZ`]

	ld	[$ctx+`4*$SZ+0`],%l0
	ld	[$ctx+`4*$SZ+4`],%l1
	ld	[$ctx+`5*$SZ+0`],%l2
	ld	[$ctx+`5*$SZ+4`],%l3
	ld	[$ctx+`6*$SZ+0`],%l4
	ld	[$ctx+`6*$SZ+4`],%l5
	ld	[$ctx+`7*$SZ+0`],%l6

	sllx	%l0,32,$tmp0
	ld	[$ctx+`7*$SZ+4`],%l7
	sllx	%l2,32,$tmp1
	or	%l1,$tmp0,$tmp0
	or	%l3,$tmp1,$tmp1
	add	$tmp0,$E,$E
	add	$tmp1,$F,$F
	$ST	$E,[$ctx+`4*$SZ`]
	sllx	%l4,32,$tmp2
	$ST	$F,[$ctx+`5*$SZ`]
	sllx	%l6,32,$T1
	or	%l5,$tmp2,$tmp2
	or	%l7,$T1,$T1
	add	$tmp2,$G,$G
	$ST	$G,[$ctx+`6*$SZ`]
	add	$T1,$H,$H
	$ST	$H,[$ctx+`7*$SZ`]
___
$code.=<<___;
	add	$inp,`16*$SZ`,$inp		! advance inp
	cmp	$inp,$len
	bne	SIZE_T_CC,.Lloop
	sub	$Ktbl,`($rounds-16)*$SZ`,$Ktbl	! rewind Ktbl

	ret
	restore
.type	sha${label}_block_data_order,#function
.size	sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz	"SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
#
# unvis(mnemonic, rs1, rs2, rd): rewrite a textual VIS instruction
# (faligndata/for with %f registers) into a raw .word encoding; any other
# input is returned unchanged.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# NOTE(review): "my $ref,$opf;" declares only $ref lexically -- $opf stays a
# package global.  Works here (no strict), kept byte-identical.
my $ref,$opf;
my %visopf = (	"faligndata"	=> 0x048,
		"for"		=> 0x07c	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

# unalignaddr(mnemonic, rs1, rs2, rd): likewise encode `alignaddr` with
# integer registers as a raw .word; non-matching input returned unchanged.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";

    foreach ($rs1,$rs2,$rd) {
	if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
	else			{ return $ref; }
    }
    return  sprintf ".word\t0x%08x !%s",
		    0x81b00300|$rd<<25|$rs1<<14|$rs2,
		    $ref;
}

# Post-process the accumulated assembly: evaluate `...` Perl fragments,
# substitute VIS instructions with explicit encodings, and print.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /ge;
	s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unalignaddr($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";