#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
# Graphic Unit would make it possible to achieve higher instruction-
# level parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the keyword, and it means that this
# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
# not really novel, Sun had VIS-powered implementation for a while.
# Unlike Sun's implementation this one can process multiple unaligned
# input blocks, and as such works as drop-in replacement for OpenSSL
# sha1_block_data_order. Performance improvement was measured to be
# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
# UltraSPARC-III. See below for discussion...
#
# The module does not present direct interest for OpenSSL, because
# it doesn't provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*)	"Pipe-lined" means that even if it takes several cycles to
#	complete, next instruction using same functional unit [but not
#	depending on the result of the current instruction] can start
#	execution without having to wait for the unit. "Pairable"
#	means that two [or more] independent instructions can be
#	issued at the very same time.

# Select the ABI from the command line: -m64 or -xarch=v9 anywhere among
# the arguments switches to the 64-bit stack bias and frame size.
$bits=32;
for (@ARGV)	{ $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)	{ $bias=2047; $frame=192; }
else		{ $bias=0;    $frame=112; }

# The last command-line argument, if there is one, names the output
# file. Use three-argument open so the name is never parsed as a mode,
# and fail loudly instead of silently writing to the original STDOUT.
$output=pop;
if ($output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}

# Integer registers: function arguments and scratch.
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
$tmp3="%g5";

$base="%g1";
$align="%g4";
$Xfer="%o5";
$nXfer=$tmp3;	# shares a register with $tmp3
$Xi="%o7";

# Working variables of the round function; @V is rotated by the callers.
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);

# Copies of the hash state loaded from and stored back to $ctx.
$Actx="%o0";
$Bctx="%o1";
$Cctx="%o2";
$Dctx="%o3";
$Ectx="%o4";

# Floating-point/VIS registers: multiplier constant, the four round
# constants, and the 16(+1) word message schedule.
$fmul="%f32";
$VK_00_19="%f34";
$VK_20_39="%f36";
$VK_40_59="%f38";
$VK_60_79="%f40";
@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
    "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");

# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
# covers even K_NN_MM addition...
#
# Appends to $code the VIS sequence that updates the schedule pair
# X[$j],X[$j+1] for round $i and stores X+K at [$Xfer+4*$j]. No call to
# this sub is visible in this file; the BODY_* subs carry interleaved
# copies of the same instructions.
sub Xupdate {
my ($i)=@_;
# Round constant for the words being produced (16 rounds ahead of $i).
my $K=$VK[int(($i+16)/20)];
my $j=($i+16)%16;

# [ provided that GSR.alignaddr_offset is 5, $fmul contains
#   0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
#   chosen registers... ]
$code.=<<___;
	fxors		@X[($j+13)%16],@X[$j],@X[$j]	!-1/-1/-1:X[0]^=X[13]
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	fxor		%f18,@X[$j],@X[$j]	! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	faligndata	@X[$j],@X[$j],%f18	! 3/ 7/ 5:Tmp=X[0,1]>>>24
	fpadd32		@X[$j],@X[$j],@X[$j]	! 4/ 8/ 6:X[0,1]<<=1
	fmul8ulx16	%f18,$fmul,%f18		! 5/10/ 7:Tmp>>=7, Tmp&=1
	![fxors		%f15,%f2,%f2]
	for		%f18,@X[$j],@X[$j]	! 8/14/10:X[0,1]|=Tmp
	![fxors		%f0,%f3,%f3]		!10/17/12:X[0] dependency
	fpadd32		$K,@X[$j],%f20
	std		%f20,[$Xfer+`4*$j`]
___
# The numbers delimited with slash are the earliest possible dispatch
# cycles for given instruction assuming 1 cycle latency for simple VIS
# instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
# on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
# round. As [long as] FPU/VIS instructions are perfectly pairable with
# IALU ones, the round timing is defined by the maximum between VIS
# and IALU timings. The latter varies from round to round and averages
# out at 6.25 ticks. This means that USI&II should operate at IALU
# rate, while USIII&IV - at VIS rate. This explains why performance
# improvement varies among processors. Well, given that pure IALU
# sha1-sparcv9.pl module exhibits virtually uniform performance of
# ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
# lower limits. Real-life performance was measured to be 6.6 cycles
# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
# half-round VIS timing, because there are 16 Xupdate-free rounds,
# which "push down" average theoretical timing to 8 cycles...

# (*)	SPARC64-V[II] was originally believed to have 2 cycles VIS
#	latency. Well, it might have, but it doesn't have dedicated
#	VIS-unit. Instead, VIS instructions are executed by other
#	functional units, ones used here - by IALU. This doesn't
#	improve effective ILP...
}

# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and kind of modulo-scheduled in respect to X[n]^=X[n+13]
# and K_NN_MM addition.
# It's "running" 15 rounds ahead, which leaves
# plenty of room to amortize for read-after-write hazard, as well as
# to fetch and align input for the next spin. The VIS instructions are
# scheduled for latency of 2 cycles, because there are not enough IALU
# instructions to schedule for latency of 3, while scheduling for 1
# would give no gain on USI&II anyway.

# Emit round $i of rounds 0..19 into $code.
#
# Integer part: e += (a<<5) + (a>>27) + ((b&c)|(d&~b)) + word loaded from
# [$Xfer+4*($i%16)]; b = (b<<30)|(b>>2).
# VIS part: even $i emits the first half of the schedule update for the
# pair X[$j],X[$j+1]; odd $i emits the second half, adds $K to the pair
# two rounds behind (X[$l]) and starts X[n]^=X[n+13] for the pair ahead
# (X[$k]).
#
# $i           - round number
# $a,$b,$c,$d,$e - registers currently holding the working variables
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;		# round number of the even member of the pair
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];	# fractional index is truncated by Perl

$j=($j+16)%16;

$code.=<<___ if (!($i&1));
	sll		$a,5,$tmp0		!! $i
	and		$c,$b,$tmp3
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	sll		$b,30,$tmp2
	add		$tmp1,$e,$e
	andn		$d,$b,$tmp1
	add		$Xi,$e,$e
	fxor		%f18,@X[$j],@X[$j]	! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	or		$tmp1,$tmp3,$tmp1
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	faligndata	@X[$j],@X[$j],%f18	! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1);
	sll		$a,5,$tmp0		!! $i
	and		$c,$b,$tmp3
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		@X[$j],@X[$j],@X[$j]	! 4/ 8/ 6:X[0,1]<<=1
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18		! 5/10/ 7:Tmp>>=7, Tmp&=1
	sll		$b,30,$tmp2
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20		!
	andn		$d,$b,$tmp1
	add		$Xi,$e,$e
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	or		$tmp1,$tmp3,$tmp1
	fxor		%f18,@X[$j],@X[$j]	! 8/14/10:X[0,1]|=Tmp
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
___
# The X+K store is skipped for round 1.
$code.=<<___ if ($i&1 && $i>=2);
	std		%f20,[$Xfer+`4*$l`]	!
___
}

# Emit round $i using the b^c^d function into $code. Called below for
# rounds 20..39 and 60..69, and for 70..79 on the .Ltail path.
#
# $i<64 interleaves the VIS schedule update the same way BODY_00_19 does;
# $i==64 only adds $K to X[$l] and stores it; $i>64 emits the integer
# round alone.
#
# $i           - round number
# $a,$b,$c,$d,$e - registers currently holding the working variables
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;		# round number of the even member of the pair
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];	# fractional index is truncated by Perl

$j=($j+16)%16;

$code.=<<___ if (!($i&1) && $i<64);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	fxor		%f18,@X[$j],@X[$j]	! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
	faligndata	@X[$j],@X[$j],%f18	! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1 && $i<64);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		@X[$j],@X[$j],@X[$j]	! 4/ 8/ 6:X[0,1]<<=1
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18		! 5/10/ 7:Tmp>>=7, Tmp&=1
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20		!
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	fxor		%f18,@X[$j],@X[$j]	! 8/14/10:X[0,1]|=Tmp
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
	std		%f20,[$Xfer+`4*$l`]	!
___
$code.=<<___ if ($i==64);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fpadd32		$K,@X[$l],%f20
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	std		%f20,[$Xfer+`4*$l`]
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i>64);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
}

# Emit round $i of rounds 40..59 into $code, using (b&c)|((b|c)&d) as the
# round function. The VIS schedule update is interleaved as in the other
# BODY_* subs, except that the X[0,1]<<=1 step is emitted at the end of
# the even round rather than at the start of the odd one.
#
# $i           - round number
# $a,$b,$c,$d,$e - registers currently holding the working variables
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;		# round number of the even member of the pair
my $k=($j+16+2)%16;	# ahead reference
my $l=($j+16-2)%16;	# behind reference
my $K=@VK[($j+16-2)/20];	# fractional index is truncated by Perl

$j=($j+16)%16;

$code.=<<___ if (!($i&1));
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	fxors		@X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fxor		@X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
	and		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	or		$c,$b,$tmp1
	fxor		%f18,@X[$j],@X[$j]	! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
	srl		$b,2,$b
	and		$d,$tmp1,$tmp1
	add		$Xi,$e,$e
	or		$tmp1,$tmp0,$tmp1
	faligndata	@X[$j],@X[$j],%f18	! 3/ 7/ 5:Tmp=X[0,1]>>>24
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	fpadd32		@X[$j],@X[$j],@X[$j]	! 4/ 8/ 6:X[0,1]<<=1
___
$code.=<<___ if ($i&1);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	fmul8ulx16	%f18,$fmul,%f18		! 5/10/ 7:Tmp>>=7, Tmp&=1
	and		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$K,@X[$l],%f20		!
	sll		$b,30,$tmp2
	or		$c,$b,$tmp1
	fxors		@X[($k+13)%16],@X[$k],@X[$k]	!-1/-1/-1:X[0]^=X[13]
	srl		$b,2,$b
	and		$d,$tmp1,$tmp1
	fxor		%f18,@X[$j],@X[$j]	! 8/14/10:X[0,1]|=Tmp
	add		$Xi,$e,$e
	or		$tmp1,$tmp0,$tmp1
	or		$tmp2,$b,$b
	add		$tmp1,$e,$e
	std		%f20,[$Xfer+`4*$l`]	!
___
}

# If there is more data to process, then we pre-fetch the data for
# next iteration in last ten rounds...
#
# Emits round $i (70..79) with the same b^c^d integer round as
# BODY_20_39, plus the loads of the next input block into @X, its
# alignment with faligndata, the K_00_19 addition and the stores to the
# next transfer area at $nXfer. Round 70 also advances $inp by 64 and
# computes $nXfer.
#
# $i           - round number
# $a,$b,$c,$d,$e - registers currently holding the working variables
sub BODY_70_79 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;		# not referenced by the templates below
my $m=($i%8)*2;		# even index into @X: 0 for round 72 ... 14 for round 79

$j=($j+16)%16;

$code.=<<___ if ($i==70);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	ldd		[$inp+64],@X[0]
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e

	and		$inp,-64,$nXfer
	inc		64,$inp
	and		$nXfer,255,$nXfer
	alignaddr	%g0,$align,%g0
	add		$base,$nXfer,$nXfer
___
$code.=<<___ if ($i==71);
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
$code.=<<___ if ($i>=72);
	faligndata	@X[$m],@X[$m+2],@X[$m]
	sll		$a,5,$tmp0		!! $i
	ld		[$Xfer+`4*($i%16)`],$Xi
	srl		$a,27,$tmp1
	add		$tmp0,$e,$e
	xor		$c,$b,$tmp0
	add		$tmp1,$e,$e
	fpadd32		$VK_00_19,@X[$m],%f20
	sll		$b,30,$tmp2
	xor		$d,$tmp0,$tmp1
	srl		$b,2,$b
	add		$tmp1,$e,$e
	or		$tmp2,$b,$b
	add		$Xi,$e,$e
___
# Rounds 70..76 each load the next 8 bytes of the following block
# ($inp has already been advanced in round 70).
$code.=<<___ if ($i<77);
	ldd		[$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
___
$code.=<<___ if ($i==77);	# redundant if $inp was aligned
	add		$align,63,$tmp0
	and		$tmp0,-8,$tmp0
	ldd		[$inp+$tmp0],@X[16]
___
$code.=<<___ if ($i>=72);
	std		%f20,[$nXfer+`4*$m`]
___
}

# Constant table and function prologue. The template below loads the
# round constants and the fmul8ulx16 multiplier from vis_const, sets up
# $base as a 256-byte aligned address below %fp and moves %sp under it,
# loads the five state words from $ctx, loads and (if $inp is not 8-byte
# aligned) realigns the first input block into @X, and stores X+K_00_19
# for the first 16 rounds at $Xfer before entering .Loop.
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
vis_const:
.long	0x5a827999,0x5a827999	! K_00_19
.long	0x6ed9eba1,0x6ed9eba1	! K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc	! K_40_59
.long	0xca62c1d6,0xca62c1d6	! K_60_79
.long	0x00000100,0x00000100
.align	64
.type	vis_const,#object
.size	vis_const,(.-vis_const)

.globl	sha1_block_data_order
sha1_block_data_order:
	save	%sp,-$frame,%sp
	add	%fp,$bias-256,$base

1:	call	.+8
	add	%o7,vis_const-1b,$tmp0

	ldd	[$tmp0+0],$VK_00_19
	ldd	[$tmp0+8],$VK_20_39
	ldd	[$tmp0+16],$VK_40_59
	ldd	[$tmp0+24],$VK_60_79
	ldd	[$tmp0+32],$fmul

	ld	[$ctx+0],$Actx
	and	$base,-256,$base
	ld	[$ctx+4],$Bctx
	sub	$base,$bias+$frame,%sp
	ld	[$ctx+8],$Cctx
	and	$inp,7,$align
	ld	[$ctx+12],$Dctx
	and	$inp,-8,$inp
	ld	[$ctx+16],$Ectx

	! X[16] is maintained in FP register bank
	alignaddr	%g0,$align,%g0
	ldd		[$inp+0],@X[0]
	sub		$inp,-64,$Xfer
	ldd		[$inp+8],@X[2]
	and		$Xfer,-64,$Xfer
	ldd		[$inp+16],@X[4]
	and		$Xfer,255,$Xfer
	ldd		[$inp+24],@X[6]
	add		$base,$Xfer,$Xfer
	ldd		[$inp+32],@X[8]
	ldd		[$inp+40],@X[10]
	ldd		[$inp+48],@X[12]
	brz,pt		$align,.Laligned
	ldd		[$inp+56],@X[14]

	ldd		[$inp+64],@X[16]
	faligndata	@X[0],@X[2],@X[0]
	faligndata	@X[2],@X[4],@X[2]
	faligndata	@X[4],@X[6],@X[4]
	faligndata	@X[6],@X[8],@X[6]
	faligndata	@X[8],@X[10],@X[8]
	faligndata	@X[10],@X[12],@X[10]
	faligndata	@X[12],@X[14],@X[12]
	faligndata	@X[14],@X[16],@X[14]

.Laligned:
	mov		5,$tmp0
	dec		1,$len
	alignaddr	%g0,$tmp0,%g0
	fpadd32		$VK_00_19,@X[0],%f16
	fpadd32		$VK_00_19,@X[2],%f18
	fpadd32		$VK_00_19,@X[4],%f20
	fpadd32		$VK_00_19,@X[6],%f22
	fpadd32		$VK_00_19,@X[8],%f24
	fpadd32		$VK_00_19,@X[10],%f26
	fpadd32		$VK_00_19,@X[12],%f28
	fpadd32		$VK_00_19,@X[14],%f30
	std		%f16,[$Xfer+0]
	mov		$Actx,$A
	std		%f18,[$Xfer+8]
	mov		$Bctx,$B
	std		%f20,[$Xfer+16]
	mov		$Cctx,$C
	std		%f22,[$Xfer+24]
	mov		$Dctx,$D
	std		%f24,[$Xfer+32]
	mov		$Ectx,$E
	std		%f26,[$Xfer+40]
	fxors		@X[13],@X[0],@X[0]
	std		%f28,[$Xfer+48]
	ba		.Loop
	std		%f30,[$Xfer+56]
.align	32
.Loop:
___
# Unroll rounds 0..69. After every round @V is rotated by one position,
# so the next round receives the working-variable registers in shifted
# roles. $i deliberately carries over from one loop to the next.
for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<70;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Branch to .Ltail when $len has reached zero; otherwise fall through
# to the rounds 70..79 variant that pre-fetches the next block.
$code.=<<___;
	tst		$len
	bz,pn		`$bits==32?"%icc":"%xcc"`,.Ltail
	nop
___
for (;$i<80;$i++)	{ &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
# Fold the working variables into the running state, reload them, and
# loop again with the pre-computed transfer area ($nXfer becomes $Xfer).
$code.=<<___;
	add		$A,$Actx,$Actx
	add		$B,$Bctx,$Bctx
	add		$C,$Cctx,$Cctx
	add		$D,$Dctx,$Dctx
	add		$E,$Ectx,$Ectx
	mov		5,$tmp0
	fxors		@X[13],@X[0],@X[0]
	mov		$Actx,$A
	mov		$Bctx,$B
	mov		$Cctx,$C
	mov		$Dctx,$D
	mov		$Ectx,$E
	alignaddr	%g0,$tmp0,%g0
	dec		1,$len
	ba		.Loop
	mov		$nXfer,$Xfer

.align	32
.Ltail:
___
# Rounds 70..79 again, without any pre-fetch. BODY_20_39 emits the
# integer-only template for $i>64.
for($i=70;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Final state update, store back to $ctx and return.
$code.=<<___;
	add		$A,$Actx,$Actx
	add		$B,$Bctx,$Bctx
	add		$C,$Cctx,$Cctx
	add		$D,$Dctx,$Dctx
	add		$E,$Ectx,$Ectx

	st		$Actx,[$ctx+0]
	st		$Bctx,[$ctx+4]
	st		$Cctx,[$ctx+8]
	st		$Dctx,[$ctx+12]
	st		$Ectx,[$ctx+16]

	ret
	restore
.type	sha1_block_data_order,#function
.size	sha1_block_data_order,(.-sha1_block_data_order)
.asciz	"SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# Encode a three-operand VIS instruction as a raw ".word" directive,
# keeping the original text as a trailing assembler comment.
#
# $mnemonic      - instruction name, looked up in %visopf
# $rs1,$rs2,$rd  - operands, expected to look like %fN
#
# Returns the ".word" line, or the unmodified "mnemonic\toperands" text
# when the mnemonic is not in the table, an operand is not an %f
# register, or an operand is an odd-numbered register at %f32 or above.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $ref = "$mnemonic\t$rs1,$rs2,$rd";
my %visopf = (	"fmul8ulx16"	=> 0x037,
		"faligndata"	=> 0x048,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fxors"		=> 0x06d	);

    my $opf = $visopf{$mnemonic};
    return $ref unless ($opf);

    my @field;
    for my $operand ($rs1,$rs2,$rd) {
	return $ref unless ($operand =~ /%f([0-9]{1,2})/);
	my $num = $1;
	if ($num>=32) {
	    return $ref if ($num&1);
	    # re-encode for upper double register addressing
	    $num = ($num|$num>>5)&31;
	}
	push @field, $num;
    }
    my ($src1,$src2,$dst) = @field;

    return sprintf ".word\t0x%08x !%s",
		   0x81b00000|$dst<<25|$src1<<14|$opf<<5|$src2,
		   $ref;
}

# Encode "alignaddr" with integer register operands as a raw ".word"
# directive, keeping the original text as a trailing assembler comment.
#
# $mnemonic      - instruction name, only used for the comment text
# $rs1,$rs2,$rd  - operands, expected to look like %g0-7, %o0-7, %l0-7
#                  or %i0-7
#
# Returns the ".word" line, or the unmodified text if any operand does
# not match that register pattern.
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $ref = "$mnemonic\t$rs1,$rs2,$rd";
# First register number of each window group.
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    my @field;
    for my $operand ($rs1,$rs2,$rd) {
	return $ref unless ($operand =~ /%([goli])([0-7])/);
	push @field, $bias{$1}+$2;
    }
    my ($src1,$src2,$dst) = @field;

    return sprintf ".word\t0x%08x !%s",
		   0x81b00300|$dst<<25|$src1<<14|$src2,
		   $ref;
}

# Post-process the generated text: evaluate the `...` arithmetic, then
# replace VIS and alignaddr instructions with their .word encodings,
# and finally write the result out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/unvis($1,$2,$3,$4)/gem;
$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/unalignaddr($1,$2,$3,$4)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";