#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because the
# setup is always virtualized, with a possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is the degree of computational resources' utilization. POWER8
# is a "massively multi-threaded chip" and the difference between
# single- and maximum multi-process benchmark results tells that
# utilization is a whopping 94%. For sha512-ppc.pl we get [not
# unimpressive] 84% and for sha1-ppc.pl - 73%. 100% means that the
# multi-process result equals the single-process one, given that all
# threads end up on the same physical core.
#
######################################################################
# Believed-to-be-accurate results in cycles per processed byte [on
# little-endian system]. Numbers in square brackets are for 64-bit
# build of sha512-ppc.pl, presented for reference.
#
#		POWER8		POWER9
# SHA256	9.7 [15.8]	11.2 [12.5]
# SHA512	6.1 [10.3]	7.0 [7.9]

# Usage: sha512p8-ppc.pl <flavour> <output-file>
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Pointer-size dependent parameters: word size, the offset at which the
# link register is saved relative to the caller's frame, and the
# store-with-update / load / store mnemonics for general registers.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Any flavour containing "le" is treated as little-endian.
$LENDIAN=($flavour=~/le/);

# Locate ppc-xlate.pl, either next to this script or in ../../perlasm.
# Take the directory part of $0 only when the match actually succeeded,
# instead of reading whatever $1 happens to hold after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed below is piped through the translator.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# Select the algorithm from the output *file name* only.  Matching the
# whole path would let a "512" in some directory component silently turn
# a SHA-256 build into SHA-512.
($outname = defined($output) ? $output : "") =~ s|.*[/\\]||s;

if ($outname =~ /512/) {
	$bits=512;
	$SZ=8;		# bytes per word
	$sz="d";	# doubleword instruction suffix
	$rounds=80;
} else {
	$bits=256;
	$SZ=4;		# bytes per word
	$sz="w";	# word instruction suffix
	$rounds=64;
}

# Stack frame layout as used by the code below (offsets from $sp):
#   8*$SIZE_T+15 ...		8 vectors of offloaded $A-$H
#   $LOCALS+15 ...		saved v24-v31
#   $FRAME-6*$SIZE_T-4		saved vrsave
#   $FRAME-6*$SIZE_T ...	saved r26-r31
$func="sha${bits}_block_p8";
$LOCALS=8*$SIZE_T+8*16;
$FRAME=$LOCALS+9*16+6*$SIZE_T;

$sp   ="r1";
$toc  ="r2";
$ctx  ="r3";		# hash state, loaded at entry and stored at exit
$inp  ="r4";		# input pointer, advanced by 16 per vector load
$num  ="r5";		# block counter, decremented once per outer loop
$Tbl  ="r6";		# address of the K table, set by LPICmeup
$idx  ="r7";		# running pointer into the K table
$lrsave="r8";
$offload="r11";
$vrsave="r12";
# Index registers holding the constants 0x00..0x70 ($x00 is literal 0).
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));	# working variables
@X=map("v$_",(8..19,24..27));			# 16-entry message schedule
($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));

# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for one round to $code.
#
#   $i       round number; called with 0..31 below.  Rounds 0..15 consume
#            the input block, rounds 15 and up also emit the message
#            schedule update for the next round's word.
#   $a..$h   names of the vector registers currently playing the roles
#            of the eight working variables; the caller rotates @V after
#            every call instead of moving data between registers.
#
# Returns nothing useful; the only effect is on $code.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;	# schedule slot that is updated for the next round
my $k=($i+2)%8;		# selects the offset register for the K prefetch

# Last word of the current input vector reached: fetch the next 16 bytes.
$code.=<<___	if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
	addi		$inp,$inp,16
___
# Not the first word of an input vector: derive it by rotating the
# previous schedule register by one word.
$code.=<<___	if ($i<16 && ($i%(16/$SZ)));
	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
___
# First word of a freshly loaded vector on little-endian: permute it
# with the mask prepared at function entry.
$code.=<<___	if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
	vperm		@X[$i],@X[$i],@X[$i],$lemask
___
# Message schedule update: X[j] += f(X[j+1]) + g(X[j+14]) + X[j+9], where
# f and g are the two vshasigma variants selected by the 0,0 and 0,15
# operands (presumably sigma0 and sigma1 of FIPS 180-4 -- confirm against
# the Power ISA description of vshasigma).
$code.=<<___	if ($i>=15);
	vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
___
# The round proper.  K[i] was already added into $h by the previous round
# ("future h+=K[i]" on what is then $g); the same is done here for the
# next round, and the K vector for the round after that is prefetched.
$code.=<<___;
	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
	vxor		$Func,$a,$b
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$d,$d,$h		; d+=h
	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
	lvx		$Ki,@I[$k],$idx		; load next K[i]
___
# After the 0x70 offset has been used, move $idx on by eight table rows.
$code.=<<___	if ($k == 7);
	addi		$idx,$idx,0x80
___
}

# Prologue: allocate the frame, save v24-v31 (r10/r11 walk the save area
# alternately), vrsave, r26-r31 and the link register, load the constant
# offsets 0x10..0x70 into $x10..$x70, and fetch the table address.
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU		$sp,-$FRAME($sp)
	mflr		$lrsave
	li		r10,`$LOCALS+15`
	li		r11,`$LOCALS+31`
	stvx		v24,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-4096+255		# 0xfffff0ff
	stw		$vrsave,`$FRAME-6*$SIZE_T-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,`8*$SIZE_T+15`
___
# Little-endian only: build the permute mask that ROUND applies to every
# freshly loaded input vector.
$code.=<<___		if ($LENDIAN);
	li		$idx,8
	lvsl		$lemask,0,$idx
	vspltisb	$Ki,0x0f
	vxor		$lemask,$lemask,$Ki
___
# Load the hash state and spread it over $A-$H by rotating the loaded
# vectors: four words per vector for SHA-256 ...
$code.=<<___		if ($SZ==4);
	lvx_4w		$A,$x00,$ctx
	lvx_4w		$E,$x10,$ctx
	vsldoi		$B,$A,$A,4		# unpack
	vsldoi		$C,$A,$A,8
	vsldoi		$D,$A,$A,12
	vsldoi		$F,$E,$E,4
	vsldoi		$G,$E,$E,8
	vsldoi		$H,$E,$E,12
___
# ... and two doublewords per vector for SHA-512.
$code.=<<___		if ($SZ==8);
	lvx_u		$A,$x00,$ctx
	lvx_u		$C,$x10,$ctx
	lvx_u		$E,$x20,$ctx
	vsldoi		$B,$A,$A,8		# unpack
	lvx_u		$G,$x30,$ctx
	vsldoi		$D,$C,$C,8
	vsldoi		$F,$E,$E,8
	vsldoi		$H,$G,$G,8
___
# Outer loop, once per input block: prefetch K[0], K[1] and the first
# input vector, reset $idx to the start of the table, save $A-$H to the
# stack, and pre-add K[0] into $H as ROUND expects.
# r0 holds the number of passes through the 16-round L16_xx body.
$code.=<<___;
	li		r0,`($rounds-16)/16`	# inner loop counter
	b		Loop
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	mr		$idx,$Tbl		# copy $Tbl
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
	stvx		$D,$x30,$offload
	stvx		$E,$x40,$offload
	stvx		$F,$x50,$offload
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$x10,$Tbl
___
# Rounds 0..15, fully unrolled.  Rotating @V after each round renames the
# working variables instead of shuffling registers.
for ($i=0;$i<16;$i++)	{ ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mtctr		r0
	b		L16_xx
.align	5
L16_xx:
___
# Remaining rounds: a 16-round body (emitted with $i = 16..31) executed
# ($rounds-16)/16 times under control of CTR.
for (;$i<32;$i++)	{ ROUND($i,@V); unshift(@V,pop(@V)); }
# Add the offloaded state back into $A-$H and continue with the next
# block while $num is non-zero.
$code.=<<___;
	bdnz		L16_xx

	lvx		@X[2],$x00,$offload
	subic.		$num,$num,1
	lvx		@X[3],$x10,$offload
	vaddu${sz}m	$A,$A,@X[2]
	lvx		@X[4],$x20,$offload
	vaddu${sz}m	$B,$B,@X[3]
	lvx		@X[5],$x30,$offload
	vaddu${sz}m	$C,$C,@X[4]
	lvx		@X[6],$x40,$offload
	vaddu${sz}m	$D,$D,@X[5]
	lvx		@X[7],$x50,$offload
	vaddu${sz}m	$E,$E,@X[6]
	lvx		@X[8],$x60,$offload
	vaddu${sz}m	$F,$F,@X[7]
	lvx		@X[9],$x70,$offload
	vaddu${sz}m	$G,$G,@X[8]
	vaddu${sz}m	$H,$H,@X[9]
	bne		Loop
___
# Pack $A-$H back into whole vectors and store the state.  The permute
# masks are the rows emitted after the K table: the last round's K
# prefetch lands on the first mask row, so $Ki already holds it; SHA-256
# needs two more, fetched here relative to $idx.
$code.=<<___		if ($SZ==4);
	lvx		@X[0],$x20,$idx
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$x30,$idx
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
	vperm		$A,$A,$D,@X[1]
	vperm		$E,$E,$H,@X[1]
	stvx_4w		$A,$x00,$ctx
	stvx_4w		$E,$x10,$ctx
___
$code.=<<___		if ($SZ==8);
	vperm		$A,$A,$B,$Ki		# pack the answer
	vperm		$C,$C,$D,$Ki
	vperm		$E,$E,$F,$Ki
	vperm		$G,$G,$H,$Ki
	stvx_u		$A,$x00,$ctx
	stvx_u		$C,$x10,$ctx
	stvx_u		$E,$x20,$ctx
	stvx_u		$G,$x30,$ctx
___
# Epilogue: restore the link register, vrsave, v24-v31 and r26-r31, then
# release the frame.
$code.=<<___;
	addi		$offload,$sp,`$LOCALS+15`
	mtlr		$lrsave
	mtspr		256,$vrsave
	lvx		v24,$x00,$offload	# ABI says so
	lvx		v25,$x10,$offload
	lvx		v26,$x20,$offload
	lvx		v27,$x30,$offload
	lvx		v28,$x40,$offload
	lvx		v29,$x50,$offload
	lvx		v30,$x60,$offload
	lvx		v31,$x70,$offload
	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0
	.long		0
.size	$func,.-$func
___

# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup returns the address of the constant table in $Tbl: it takes
# the address left in LR by the bcl and adds the distance to the first
# table entry.  The .space directive pads the routine so that this
# distance is the constant used in the addi.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___

# Constant tables.  Every K value is replicated across a full 16-byte
# row so that one lvx yields a vector with K in every lane.  The K rows
# are followed by an all-zero row and then by the permute mask(s) used
# to pack the answer; the masks differ between big- and little-endian.
if ($SZ==8) {
    local *table = sub {
	foreach(@_) { $code.=".quad $_,$_\n"; }
    };
    table(
	"0x428a2f98d728ae22","0x7137449123ef65cd",
	"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
	"0x3956c25bf348b538","0x59f111f1b605d019",
	"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
	"0xd807aa98a3030242","0x12835b0145706fbe",
	"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
	"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
	"0x9bdc06a725c71235","0xc19bf174cf692694",
	"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
	"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
	"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
	"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
	"0x983e5152ee66dfab","0xa831c66d2db43210",
	"0xb00327c898fb213f","0xbf597fc7beef0ee4",
	"0xc6e00bf33da88fc2","0xd5a79147930aa725",
	"0x06ca6351e003826f","0x142929670a0e6e70",
	"0x27b70a8546d22ffc","0x2e1b21385c26c926",
	"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
	"0x650a73548baf63de","0x766a0abb3c77b2a8",
	"0x81c2c92e47edaee6","0x92722c851482353b",
	"0xa2bfe8a14cf10364","0xa81a664bbc423001",
	"0xc24b8b70d0f89791","0xc76c51a30654be30",
	"0xd192e819d6ef5218","0xd69906245565a910",
	"0xf40e35855771202a","0x106aa07032bbd1b8",
	"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
	"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
	"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
	"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
	"0x748f82ee5defb2fc","0x78a5636f43172f60",
	"0x84c87814a1f0ab72","0x8cc702081a6439ec",
	"0x90befffa23631e28","0xa4506cebde82bde9",
	"0xbef9a3f7b2c67915","0xc67178f2e372532b",
	"0xca273eceea26619c","0xd186b8c721c0c207",
	"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
	"0x06f067aa72176fba","0x0a637dc5a2c898a6",
	"0x113f9804bef90dae","0x1b710b35131c471b",
	"0x28db77f523047d84","0x32caab7b40c72493",
	"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
	"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
	"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
$code.=<<___	if (!$LENDIAN);
.quad	0x0001020304050607,0x1011121314151617
___
$code.=<<___	if ($LENDIAN);	# quad-swapped
.quad	0x1011121314151617,0x0001020304050607
___
} else {
    local *table = sub {
	foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
    };
    table(
	"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
	"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
	"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
	"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
	"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
	"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
	"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
	"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
	"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
	"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
	"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
	"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
	"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
	"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
	"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
	"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
$code.=<<___	if (!$LENDIAN);
.long	0x00010203,0x10111213,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___	if ($LENDIAN);	# word-swapped
.long	0x10111213,0x10111213,0x10111213,0x00010203
.long	0x10111213,0x10111213,0x04050607,0x00010203
.long	0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
$code.=<<___;
.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate the back-quoted arithmetic embedded in the assembly, then emit.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";