1#!/usr/bin/env perl 2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# ==================================================================== 10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 11# project. The module is, however, dual licensed under OpenSSL and 12# CRYPTOGAMS licenses depending on where you obtain it. For further 13# details see http://www.openssl.org/~appro/cryptogams/. 14# ==================================================================== 15# 16# Keccak-1600 for s390x. 17# 18# June 2017. 19# 20# Below code is [lane complementing] KECCAK_2X implementation (see 21# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though 22# instead of actually unrolling the loop pair-wise I simply flip 23# pointers to T[][] and A[][] at the end of round. Since number of 24# rounds is even, last round writes to A[][] and everything works out. 25# In the nutshell it's transliteration of x86_64 module, because both 26# architectures have similar capabilities/limitations. Performance 27# measurement is problematic as I don't have access to an idle system. 28# It looks like z13 processes one byte [out of long message] in ~14 29# cycles. At least the result is consistent with estimate based on 30# amount of instruction and assumed instruction issue rate. It's ~2.5x 31# faster than compiler-generated code. 32 33# $output is the last argument if it looks like a file (it has an extension) 34# $flavour is the first argument if it doesn't look like a file 35$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 36$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 37 38if ($flavour =~ /3[12]/) { 39 $SIZE_T=4; 40 $g=""; 41} else { 42 $SIZE_T=8; 43 $g="g"; 44} 45 46$output and open STDOUT,">$output"; 47 48my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); 49 50my @C = map("%r$_",(0,1,5..7)); 51my @D = map("%r$_",(8..12)); 52my @T = map("%r$_",(13..14)); 53my ($src,$dst,$iotas) = map("%r$_",(2..4)); 54my $sp = "%r15"; 55 56$stdframe=16*$SIZE_T+4*8; 57$frame=$stdframe+25*8; 58 59my @rhotates = ([ 0, 1, 62, 28, 27 ], 60 [ 36, 44, 6, 55, 20 ], 61 [ 3, 10, 43, 25, 39 ], 62 [ 41, 45, 15, 21, 8 ], 63 [ 18, 2, 61, 56, 14 ]); 64 65{ my @C = @C; # copy, because we mess them up... 66 my @D = @D; 67 68$code.=<<___; 69.text 70 71.type __KeccakF1600,\@function 72.align 32 73__KeccakF1600: 74 st${g} %r14,$SIZE_T*14($sp) 75 lg @C[0],$A[4][0]($src) 76 lg @C[1],$A[4][1]($src) 77 lg @C[2],$A[4][2]($src) 78 lg @C[3],$A[4][3]($src) 79 lg @C[4],$A[4][4]($src) 80 larl $iotas,iotas 81 j .Loop 82 83.align 16 84.Loop: 85 lg @D[0],$A[0][0]($src) 86 lg @D[1],$A[1][1]($src) 87 lg @D[2],$A[2][2]($src) 88 lg @D[3],$A[3][3]($src) 89 90 xgr @C[0],@D[0] 91 xg @C[1],$A[0][1]($src) 92 xg @C[2],$A[0][2]($src) 93 xg @C[3],$A[0][3]($src) 94 lgr @D[4],@C[4] 95 xg @C[4],$A[0][4]($src) 96 97 xg @C[0],$A[1][0]($src) 98 xgr @C[1],@D[1] 99 xg @C[2],$A[1][2]($src) 100 xg @C[3],$A[1][3]($src) 101 xg @C[4],$A[1][4]($src) 102 103 xg @C[0],$A[2][0]($src) 104 xg @C[1],$A[2][1]($src) 105 xgr @C[2],@D[2] 106 xg @C[3],$A[2][3]($src) 107 xg @C[4],$A[2][4]($src) 108 109 xg @C[0],$A[3][0]($src) 110 xg @C[1],$A[3][1]($src) 111 xg @C[2],$A[3][2]($src) 112 xgr @C[3],@D[3] 113 xg @C[4],$A[3][4]($src) 114 115 lgr @T[0],@C[2] 116 rllg @C[2],@C[2],1 117 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0] 118 119 rllg @C[0],@C[0],1 120 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3] 121 122 rllg @C[3],@C[3],1 123 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1] 124 125 rllg @C[1],@C[1],1 126 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4] 127 128 rllg @C[4],@C[4],1 129 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2] 130___ 131 (@D[0..4], @C) = (@C[1..4,0], @D); 132$code.=<<___; 133 xgr @C[1],@D[1] 134 xgr @C[2],@D[2] 135 xgr @C[3],@D[3] 136 rllg @C[1],@C[1],$rhotates[1][1] 137 xgr @C[4],@D[4] 138 rllg @C[2],@C[2],$rhotates[2][2] 139 xgr @C[0],@D[0] 140 141 lgr @T[0],@C[1] 142 ogr @C[1],@C[2] 143 rllg @C[3],@C[3],$rhotates[3][3] 144 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2]) 145 rllg @C[4],@C[4],$rhotates[4][4] 146 xg @C[1],0($iotas) 147 la $iotas,8($iotas) 148 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i] 149 150 lgr @T[1],@C[4] 151 ngr @C[4],@C[3] 152 lghi @C[1],-1 # no 'not' instruction :-( 153 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3]) 154 xgr @C[2],@C[1] # not @C[2] 155 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3]) 156 ogr @C[2],@C[3] 157 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3]) 158 159 ngr @T[0],@C[0] 160 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3]) 161 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0]) 162 ogr @T[1],@C[0] 163 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0]) 164 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0]) 165 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0]) 166 167 168 lg @C[0],$A[0][3]($src) 169 lg @C[4],$A[4][2]($src) 170 lg @C[3],$A[3][1]($src) 171 lg @C[1],$A[1][4]($src) 172 lg @C[2],$A[2][0]($src) 173 174 xgr @C[0],@D[3] 175 xgr @C[4],@D[2] 176 rllg @C[0],@C[0],$rhotates[0][3] 177 xgr @C[3],@D[1] 178 rllg @C[4],@C[4],$rhotates[4][2] 179 xgr @C[1],@D[4] 180 rllg @C[3],@C[3],$rhotates[3][1] 181 xgr @C[2],@D[0] 182 183 lgr @T[0],@C[0] 184 ogr @C[0],@C[4] 185 rllg @C[1],@C[1],$rhotates[1][4] 186 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4]) 187 rllg @C[2],@C[2],$rhotates[2][0] 188 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4]) 189 190 lgr @T[1],@C[1] 191 ngr @C[1],@T[0] 192 lghi @C[0],-1 # no 'not' instruction :-( 193 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0]) 194 xgr @C[4],@C[0] # not @C[4] 195 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0]) 196 197 ogr @C[4],@C[3] 198 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3]) 199 200 ngr @C[3],@C[2] 201 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3]) 202 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2]) 203 ogr @T[1],@C[2] 204 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2]) 205 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2]) 206 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2]) 207 208 209 lg @C[2],$A[2][3]($src) 210 lg @C[3],$A[3][4]($src) 211 lg @C[1],$A[1][2]($src) 212 lg @C[4],$A[4][0]($src) 213 lg @C[0],$A[0][1]($src) 214 215 xgr @C[2],@D[3] 216 xgr @C[3],@D[4] 217 rllg @C[2],@C[2],$rhotates[2][3] 218 xgr @C[1],@D[2] 219 rllg @C[3],@C[3],$rhotates[3][4] 220 xgr @C[4],@D[0] 221 rllg @C[1],@C[1],$rhotates[1][2] 222 xgr @C[0],@D[1] 223 224 lgr @T[0],@C[2] 225 ngr @C[2],@C[3] 226 rllg @C[4],@C[4],$rhotates[4][0] 227 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3]) 228 lghi @T[1],-1 # no 'not' instruction :-( 229 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3]) 230 231 xgr @C[3],@T[1] # not @C[3] 232 lgr @T[1],@C[4] 233 ngr @C[4],@C[3] 234 rllg @C[0],@C[0],$rhotates[0][1] 235 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3]) 236 ogr @T[0],@C[1] 237 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3]) 238 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1]) 239 240 ngr @C[1],@C[0] 241 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1]) 242 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0]) 243 ogr @C[0],@T[1] 244 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0]) 245 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4]) 246 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4]) 247 248 249 lg @C[2],$A[2][1]($src) 250 lg @C[3],$A[3][2]($src) 251 lg @C[1],$A[1][0]($src) 252 lg @C[4],$A[4][3]($src) 253 lg @C[0],$A[0][4]($src) 254 255 xgr @C[2],@D[1] 256 xgr @C[3],@D[2] 257 rllg @C[2],@C[2],$rhotates[2][1] 258 xgr @C[1],@D[0] 259 rllg @C[3],@C[3],$rhotates[3][2] 260 xgr @C[4],@D[3] 261 rllg @C[1],@C[1],$rhotates[1][0] 262 xgr @C[0],@D[4] 263 rllg @C[4],@C[4],$rhotates[4][3] 264 265 lgr @T[0],@C[2] 266 ogr @C[2],@C[3] 267 lghi @T[1],-1 # no 'not' instruction :-( 268 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3]) 269 xgr @C[3],@T[1] # not @C[3] 270 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3]) 271 272 lgr @T[1],@C[4] 273 ogr @C[4],@C[3] 274 rllg @C[0],@C[0],$rhotates[0][4] 275 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3]) 276 ngr @T[0],@C[1] 277 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3]) 278 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1]) 279 280 ogr @C[1],@C[0] 281 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1]) 282 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0]) 283 ngr @C[0],@T[1] 284 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0]) 285 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4]) 286 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4]) 287 288 289 xg @D[2],$A[0][2]($src) 290 xg @D[3],$A[1][3]($src) 291 xg @D[1],$A[4][1]($src) 292 xg @D[4],$A[2][4]($src) 293 xgr $dst,$src # xchg $dst,$src 294 rllg @D[2],@D[2],$rhotates[0][2] 295 xg @D[0],$A[3][0]($src) 296 rllg @D[3],@D[3],$rhotates[1][3] 297 xgr $src,$dst 298 rllg @D[1],@D[1],$rhotates[4][1] 299 xgr $dst,$src 300 rllg @D[4],@D[4],$rhotates[2][4] 301___ 302 @C = @D[2..4,0,1]; 303$code.=<<___; 304 lgr @T[0],@C[0] 305 ngr @C[0],@C[1] 306 lghi @T[1],-1 # no 'not' instruction :-( 307 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1]) 308 xgr @C[1],@T[1] # not @C[1] 309 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1]) 310 311 lgr @T[1],@C[2] 312 ngr @C[2],@C[1] 313 rllg @D[0],@D[0],$rhotates[3][0] 314 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1]) 315 ogr @T[0],@C[4] 316 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1]) 317 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4]) 318 319 ngr @C[4],@C[3] 320 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4]) 321 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3]) 322 ogr @C[3],@T[1] 323 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3]) 324 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3]) 325 326 lgr @C[1],@C[0] # harmonize with the loop top 327 lgr @C[0],@T[0] 328 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3]) 329 330 tmll $iotas,255 331 jnz .Loop 332 333 l${g} %r14,$SIZE_T*14($sp) 334 br %r14 335.size __KeccakF1600,.-__KeccakF1600 336___ 337} 338{ 339$code.=<<___; 340.type KeccakF1600,\@function 341.align 32 342KeccakF1600: 343.LKeccakF1600: 344 lghi %r1,-$frame 345 stm${g} %r6,%r15,$SIZE_T*6($sp) 346 lgr %r0,$sp 347 la $sp,0(%r1,$sp) 348 st${g} %r0,0($sp) 349 350 lghi @D[0],-1 # no 'not' instruction :-( 351 lghi @D[1],-1 352 lghi @D[2],-1 353 lghi @D[3],-1 354 lghi @D[4],-1 355 lghi @T[0],-1 356 xg @D[0],$A[0][1]($src) 357 xg @D[1],$A[0][2]($src) 358 xg @D[2],$A[1][3]($src) 359 xg @D[3],$A[2][2]($src) 360 xg @D[4],$A[3][2]($src) 361 xg @T[0],$A[4][0]($src) 362 stmg @D[0],@D[1],$A[0][1]($src) 363 stg @D[2],$A[1][3]($src) 364 stg @D[3],$A[2][2]($src) 365 stg @D[4],$A[3][2]($src) 366 stg @T[0],$A[4][0]($src) 367 368 la $dst,$stdframe($sp) 369 370 bras %r14,__KeccakF1600 371 372 lghi @D[0],-1 # no 'not' instruction :-( 373 lghi @D[1],-1 374 lghi @D[2],-1 375 lghi @D[3],-1 376 lghi @D[4],-1 377 lghi @T[0],-1 378 xg @D[0],$A[0][1]($src) 379 xg @D[1],$A[0][2]($src) 380 xg @D[2],$A[1][3]($src) 381 xg @D[3],$A[2][2]($src) 382 xg @D[4],$A[3][2]($src) 383 xg @T[0],$A[4][0]($src) 384 stmg @D[0],@D[1],$A[0][1]($src) 385 stg @D[2],$A[1][3]($src) 386 stg @D[3],$A[2][2]($src) 387 stg @D[4],$A[3][2]($src) 388 stg @T[0],$A[4][0]($src) 389 390 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) 391 br %r14 392.size KeccakF1600,.-KeccakF1600 393___ 394} 395{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5)); 396 397$code.=<<___; 398.globl SHA3_absorb 399.type SHA3_absorb,\@function 400.align 32 401SHA3_absorb: 402 lghi %r1,-$frame 403 stm${g} %r5,%r15,$SIZE_T*5($sp) 404 lgr %r0,$sp 405 la $sp,0(%r1,$sp) 406 st${g} %r0,0($sp) 407 408 lghi @D[0],-1 # no 'not' instruction :-( 409 lghi @D[1],-1 410 lghi @D[2],-1 411 lghi @D[3],-1 412 lghi @D[4],-1 413 lghi @T[0],-1 414 xg @D[0],$A[0][1]($src) 415 xg @D[1],$A[0][2]($src) 416 xg @D[2],$A[1][3]($src) 417 xg @D[3],$A[2][2]($src) 418 xg @D[4],$A[3][2]($src) 419 xg @T[0],$A[4][0]($src) 420 stmg @D[0],@D[1],$A[0][1]($src) 421 stg @D[2],$A[1][3]($src) 422 stg @D[3],$A[2][2]($src) 423 stg @D[4],$A[3][2]($src) 424 stg @T[0],$A[4][0]($src) 425 426.Loop_absorb: 427 cl${g}r $len,$bsz 428 jl .Ldone_absorb 429 430 srl${g} $bsz,3 431 la %r1,0($A_flat) 432 433.Lblock_absorb: 434 lrvg %r0,0($inp) 435 la $inp,8($inp) 436 xg %r0,0(%r1) 437 a${g}hi $len,-8 438 stg %r0,0(%r1) 439 la %r1,8(%r1) 440 brct $bsz,.Lblock_absorb 441 442 stm${g} $inp,$len,$frame+3*$SIZE_T($sp) 443 la $dst,$stdframe($sp) 444 bras %r14,__KeccakF1600 445 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp) 446 j .Loop_absorb 447 448.align 16 449.Ldone_absorb: 450 lghi @D[0],-1 # no 'not' instruction :-( 451 lghi @D[1],-1 452 lghi @D[2],-1 453 lghi @D[3],-1 454 lghi @D[4],-1 455 lghi @T[0],-1 456 xg @D[0],$A[0][1]($src) 457 xg @D[1],$A[0][2]($src) 458 xg @D[2],$A[1][3]($src) 459 xg @D[3],$A[2][2]($src) 460 xg @D[4],$A[3][2]($src) 461 xg @T[0],$A[4][0]($src) 462 stmg @D[0],@D[1],$A[0][1]($src) 463 stg @D[2],$A[1][3]($src) 464 stg @D[3],$A[2][2]($src) 465 stg @D[4],$A[3][2]($src) 466 stg @T[0],$A[4][0]($src) 467 468 lgr %r2,$len # return value 469 470 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) 471 br %r14 472.size SHA3_absorb,.-SHA3_absorb 473___ 474} 475{ my ($A_flat,$out,$len,$bsz,$next) = map("%r$_",(2..6)); 476 477$code.=<<___; 478.globl SHA3_squeeze 479.type SHA3_squeeze,\@function 480.align 32 481SHA3_squeeze: 482 srl${g} $bsz,3 483 st${g} %r14,2*$SIZE_T($sp) 484 lghi %r14,8 485 st${g} $bsz,5*$SIZE_T($sp) 486 la %r1,0($A_flat) 487 cijne $next,0,.Lnext_block 488 489 j .Loop_squeeze 490 491.align 16 492.Loop_squeeze: 493 cl${g}r $len,%r14 494 jl .Ltail_squeeze 495 496 lrvg %r0,0(%r1) 497 la %r1,8(%r1) 498 stg %r0,0($out) 499 la $out,8($out) 500 a${g}hi $len,-8 # len -= 8 501 jz .Ldone_squeeze 502 503 brct $bsz,.Loop_squeeze # bsz-- 504 505.Lnext_block: 506 stm${g} $out,$len,3*$SIZE_T($sp) 507 bras %r14,.LKeccakF1600 508 lm${g} $out,$bsz,3*$SIZE_T($sp) 509 lghi %r14,8 510 la %r1,0($A_flat) 511 j .Loop_squeeze 512 513.Ltail_squeeze: 514 lg %r0,0(%r1) 515.Loop_tail_squeeze: 516 stc %r0,0($out) 517 la $out,1($out) 518 srlg %r0,8 519 brct $len,.Loop_tail_squeeze 520 521.Ldone_squeeze: 522 l${g} %r14,2*$SIZE_T($sp) 523 br %r14 524.size SHA3_squeeze,.-SHA3_squeeze 525___ 526} 527$code.=<<___; 528.align 256 529 .quad 0,0,0,0,0,0,0,0 530.type iotas,\@object 531iotas: 532 .quad 0x0000000000000001 533 .quad 0x0000000000008082 534 .quad 0x800000000000808a 535 .quad 0x8000000080008000 536 .quad 0x000000000000808b 537 .quad 0x0000000080000001 538 .quad 0x8000000080008081 539 .quad 0x8000000000008009 540 .quad 0x000000000000008a 541 .quad 0x0000000000000088 542 .quad 0x0000000080008009 543 .quad 0x000000008000000a 544 .quad 0x000000008000808b 545 .quad 0x800000000000008b 546 .quad 0x8000000000008089 547 .quad 0x8000000000008003 548 .quad 0x8000000000008002 549 .quad 0x8000000000000080 550 .quad 0x000000000000800a 551 .quad 0x800000008000000a 552 .quad 0x8000000080008081 553 .quad 0x8000000000008080 554 .quad 0x0000000080000001 555 .quad 0x8000000080008008 556.size iotas,.-iotas 557.asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>" 558___ 559 560# unlike 32-bit shift 64-bit one takes three arguments 561$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm; 562 563print $code; 564close STDOUT or die "error closing STDOUT: $!"; 565