#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PPC64.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation that works on
# *any* PPC64. Then PowerISA 2.07 adds 2x64-bit vector rotate, and
# it's possible to achieve performance better than below, but that is
# naturally option only for POWER8 and successors...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# PPC970/G5	14.0/+130%
# POWER7	9.7/+110%
# POWER8	10.6/+100%
# POWER9	8.2/+66%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
#	much better (but watch out for them generating code specific
#	to processor they execute on).

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Only flavours whose name contains "64" are accepted.
die "nonsense $flavour" unless $flavour =~ /64/;

$SIZE_T	= 8;
$LRSAVE	= 2*$SIZE_T;
$UCMP	= "cmpld";
$STU	= "stdu";
$POP	= "ld";
$PUSH	= "std";

$LITTLE_ENDIAN = ($flavour =~ /le$/) ? 1 : 0;

# How SHA3_absorb fetches one little-endian 64-bit lane into r0 while
# advancing r3: a single ldu on little-endian flavours, a call to
# dword_le_load otherwise.  $LE_LOAD_SIZE is the amount the input pointer is
# pre-decremented by, to suit the update-form load that is used.
($DWORD_LE_LOAD, $LE_LOAD_SIZE) = $LITTLE_ENDIAN
	? ("ldu r0,8(r3)",     "8")
	: ("bl dword_le_load", "1");

# Look for ppc-xlate.pl next to this script first, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}ppc-xlate.pl";
unless (-f $xlate) {
	$xlate = "${dir}../../perlasm/ppc-xlate.pl";
	die "can't locate ppc-xlate.pl" unless -f $xlate;
}

# Everything printed from here on is filtered through ppc-xlate.pl.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# Stack frame layout shared by KeccakF1600 and SHA3_absorb:
#   $LOCALS .. : six pointer-sized slots (A, inp, len, bsz, iotas pointer)
#   $TEMP   .. : spill area KeccakF1600_int uses for A[0..3][4]
#   top of frame: save area for r14..r31
$LOCALS	= 6*$SIZE_T;
$TEMP	= $LOCALS+6*$SIZE_T;
$FRAME	= 24*$SIZE_T+6*$SIZE_T+32;

my $sp = "r1";

# The 5x5 state lives entirely in registers: row i occupies r(7+5*i)..r(11+5*i),
# except that r13 is reserved and is replaced by r6.
my @A = map { my $base = $_; [ map { "r".($base+$_) } (0..4) ] }
	    (7, 12, 17, 22, 27);
$A[1][1] = "r6";

# Scratch registers.
my @C = qw(r0 r3 r4 r5);

# Rho rotation amounts, indexed the same way as @A.
my @rhotates = ([  0,  1, 62, 28, 27 ],
		[ 36, 44,  6, 55, 20 ],
		[  3, 10, 43, 25, 39 ],
		[ 41, 45, 15, 21,  8 ],
		[ 18,  2, 61, 56, 14 ]);

# KeccakF1600_int: 24 rounds over the state held in @A.  It has no frame of
# its own: it spills to $TEMP($sp) and reads/advances the round-constant
# pointer kept at $LOCALS+4*$SIZE_T($sp) in the caller's frame.
$code.=<<___;
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	b	.Loop
.align	4
.Loop:
	xor	$C[0],$A[0][0],$A[1][0]		; Theta
	std	$A[0][4],`$TEMP+0`($sp)
	xor	$C[1],$A[0][1],$A[1][1]
	std	$A[1][4],`$TEMP+8`($sp)
	xor	$C[2],$A[0][2],$A[1][2]
	std	$A[2][4],`$TEMP+16`($sp)
	xor	$C[3],$A[0][3],$A[1][3]
	std	$A[3][4],`$TEMP+24`($sp)
___
# A[0..3][4] were just spilled, so their registers double as four more
# scratch registers until they are reloaded below.
@C[4..7] = map { $A[$_][4] } (0..3);
$code.=<<___;
	xor	$C[4],$A[0][4],$A[1][4]
	xor	$C[0],$C[0],$A[2][0]
	xor	$C[1],$C[1],$A[2][1]
	xor	$C[2],$C[2],$A[2][2]
	xor	$C[3],$C[3],$A[2][3]
	xor	$C[4],$C[4],$A[2][4]
	xor	$C[0],$C[0],$A[3][0]
	xor	$C[1],$C[1],$A[3][1]
	xor	$C[2],$C[2],$A[3][2]
	xor	$C[3],$C[3],$A[3][3]
	xor	$C[4],$C[4],$A[3][4]
	xor	$C[0],$C[0],$A[4][0]
	xor	$C[2],$C[2],$A[4][2]
	xor	$C[1],$C[1],$A[4][1]
	xor	$C[3],$C[3],$A[4][3]
	rotldi	$C[5],$C[2],1
	xor	$C[4],$C[4],$A[4][4]
	rotldi	$C[6],$C[3],1
	xor	$C[5],$C[5],$C[0]
	rotldi	$C[7],$C[4],1

	xor	$A[0][1],$A[0][1],$C[5]
	xor	$A[1][1],$A[1][1],$C[5]
	xor	$A[2][1],$A[2][1],$C[5]
	xor	$A[3][1],$A[3][1],$C[5]
	xor	$A[4][1],$A[4][1],$C[5]

	rotldi	$C[5],$C[0],1
	xor	$C[6],$C[6],$C[1]
	xor	$C[2],$C[2],$C[7]
	rotldi	$C[7],$C[1],1
	xor	$C[3],$C[3],$C[5]
	xor	$C[4],$C[4],$C[7]

	xor	$C[1],   $A[0][2],$C[6]			;mr	$C[1],$A[0][2]
	xor	$A[1][2],$A[1][2],$C[6]
	xor	$A[2][2],$A[2][2],$C[6]
	xor	$A[3][2],$A[3][2],$C[6]
	xor	$A[4][2],$A[4][2],$C[6]

	xor	$A[0][0],$A[0][0],$C[4]
	xor	$A[1][0],$A[1][0],$C[4]
	xor	$A[2][0],$A[2][0],$C[4]
	xor	$A[3][0],$A[3][0],$C[4]
	xor	$A[4][0],$A[4][0],$C[4]
___
# The borrowed registers go back to holding A[0..3][4].
@C[4..7] = (undef) x 4;
$code.=<<___;
	ld	$A[0][4],`$TEMP+0`($sp)
	xor	$C[0],   $A[0][3],$C[2]			;mr	$C[0],$A[0][3]
	ld	$A[1][4],`$TEMP+8`($sp)
	xor	$A[1][3],$A[1][3],$C[2]
	ld	$A[2][4],`$TEMP+16`($sp)
	xor	$A[2][3],$A[2][3],$C[2]
	ld	$A[3][4],`$TEMP+24`($sp)
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[4][3],$A[4][3],$C[2]

	xor	$C[2],   $A[0][4],$C[3]			;mr	$C[2],$A[0][4]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[4][4],$A[4][4],$C[3]

	mr	$C[3],$A[0][1]			; Rho+Pi
	rotldi	$A[0][1],$A[1][1],$rhotates[1][1]
	;mr	$C[1],$A[0][2]
	rotldi	$A[0][2],$A[2][2],$rhotates[2][2]
	;mr	$C[0],$A[0][3]
	rotldi	$A[0][3],$A[3][3],$rhotates[3][3]
	;mr	$C[2],$A[0][4]
	rotldi	$A[0][4],$A[4][4],$rhotates[4][4]

	rotldi	$A[1][1],$A[1][4],$rhotates[1][4]
	rotldi	$A[2][2],$A[2][3],$rhotates[2][3]
	rotldi	$A[3][3],$A[3][2],$rhotates[3][2]
	rotldi	$A[4][4],$A[4][1],$rhotates[4][1]

	rotldi	$A[1][4],$A[4][2],$rhotates[4][2]
	rotldi	$A[2][3],$A[3][4],$rhotates[3][4]
	rotldi	$A[3][2],$A[2][1],$rhotates[2][1]
	rotldi	$A[4][1],$A[1][3],$rhotates[1][3]

	rotldi	$A[4][2],$A[2][4],$rhotates[2][4]
	rotldi	$A[3][4],$A[4][3],$rhotates[4][3]
	rotldi	$A[2][1],$A[1][2],$rhotates[1][2]
	rotldi	$A[1][3],$A[3][1],$rhotates[3][1]

	rotldi	$A[2][4],$A[4][0],$rhotates[4][0]
	rotldi	$A[4][3],$A[3][0],$rhotates[3][0]
	rotldi	$A[1][2],$A[2][0],$rhotates[2][0]
	rotldi	$A[3][1],$A[1][0],$rhotates[1][0]

	rotldi	$A[1][0],$C[0],$rhotates[0][3]
	rotldi	$A[2][0],$C[3],$rhotates[0][1]
	rotldi	$A[3][0],$C[2],$rhotates[0][4]
	rotldi	$A[4][0],$C[1],$rhotates[0][2]

	andc	$C[0],$A[0][2],$A[0][1]		; Chi+Iota
	andc	$C[1],$A[0][3],$A[0][2]
	andc	$C[2],$A[0][0],$A[0][4]
	andc	$C[3],$A[0][1],$A[0][0]
	xor	$A[0][0],$A[0][0],$C[0]
	andc	$C[0],$A[0][4],$A[0][3]
	xor	$A[0][1],$A[0][1],$C[1]
	 ld	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	xor	$A[0][3],$A[0][3],$C[2]
	xor	$A[0][4],$A[0][4],$C[3]
	xor	$A[0][2],$A[0][2],$C[0]
	 ldu	$C[3],8($C[1])			; Iota[i++]

	andc	$C[0],$A[1][2],$A[1][1]
	 std	$C[1],`$LOCALS+4*$SIZE_T`($sp)
	andc	$C[1],$A[1][3],$A[1][2]
	andc	$C[2],$A[1][0],$A[1][4]
	 xor	$A[0][0],$A[0][0],$C[3]		; A[0][0] ^= Iota
	andc	$C[3],$A[1][1],$A[1][0]
	xor	$A[1][0],$A[1][0],$C[0]
	andc	$C[0],$A[1][4],$A[1][3]
	xor	$A[1][1],$A[1][1],$C[1]
	xor	$A[1][3],$A[1][3],$C[2]
	xor	$A[1][4],$A[1][4],$C[3]
	xor	$A[1][2],$A[1][2],$C[0]

	andc	$C[0],$A[2][2],$A[2][1]
	andc	$C[1],$A[2][3],$A[2][2]
	andc	$C[2],$A[2][0],$A[2][4]
	andc	$C[3],$A[2][1],$A[2][0]
	xor	$A[2][0],$A[2][0],$C[0]
	andc	$C[0],$A[2][4],$A[2][3]
	xor	$A[2][1],$A[2][1],$C[1]
	xor	$A[2][3],$A[2][3],$C[2]
	xor	$A[2][4],$A[2][4],$C[3]
	xor	$A[2][2],$A[2][2],$C[0]

	andc	$C[0],$A[3][2],$A[3][1]
	andc	$C[1],$A[3][3],$A[3][2]
	andc	$C[2],$A[3][0],$A[3][4]
	andc	$C[3],$A[3][1],$A[3][0]
	xor	$A[3][0],$A[3][0],$C[0]
	andc	$C[0],$A[3][4],$A[3][3]
	xor	$A[3][1],$A[3][1],$C[1]
	xor	$A[3][3],$A[3][3],$C[2]
	xor	$A[3][4],$A[3][4],$C[3]
	xor	$A[3][2],$A[3][2],$C[0]

	andc	$C[0],$A[4][2],$A[4][1]
	andc	$C[1],$A[4][3],$A[4][2]
	andc	$C[2],$A[4][0],$A[4][4]
	andc	$C[3],$A[4][1],$A[4][0]
	xor	$A[4][0],$A[4][0],$C[0]
	andc	$C[0],$A[4][4],$A[4][3]
	xor	$A[4][1],$A[4][1],$C[1]
	xor	$A[4][3],$A[4][3],$C[2]
	xor	$A[4][4],$A[4][4],$C[3]
	xor	$A[4][2],$A[4][2],$C[0]

	bdnz	.Loop

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int
___

{
# KeccakF1600: r3 points at the 25-lane state.  Builds the frame, saves
# r14..r31 and the link register, points the iota slot at the constant table
# (less 8, because KeccakF1600_int fetches with ldu), loads the state into
# registers, runs KeccakF1600_int and writes the state back.
#
# The purely mechanical instruction runs are generated here rather than
# spelled out; they expand to one instruction per register/lane.
my @lane_regs    = map { @$_ } @A;	# the 25 state registers in memory order
my $gpr_slot     = sub { $FRAME - $SIZE_T*(32 - $_[0]) };	# r31 topmost
my $save_gprs    = join("\n", map { "\t$PUSH\tr$_,".$gpr_slot->($_)."($sp)" } (14..31));
my $restore_gprs = join("\n", map { "\t$POP\tr$_,".$gpr_slot->($_)."($sp)" } (14..31));
my $load_state   = join("\n", map { "\tld\t$lane_regs[$_],".(8*$_)."(r3)" } (0..24));
my $store_state  = join("\n", map { "\tstd\t$lane_regs[$_],".(8*$_)."(r3)" } (0..24));

$code.=<<___;
.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
$save_gprs
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)
	;$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)
	;$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)
	;$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

$load_state

	bl	KeccakF1600_int

	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
$store_state

	$POP	r0,`$FRAME+$LRSAVE`($sp)
$restore_gprs
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
}
# dword_le_load: only needed when the target is not little-endian.  Reads the
# eight bytes at r3+1 .. r3+8 and assembles them, least significant byte
# first, into r0; r3 is advanced by 8 (the last byte is fetched with lbzu).
# r4 and r5 are used as scratch.
if (!$LITTLE_ENDIAN) {
$code.=<<___;
.type	dword_le_load,\@function
.align	5
dword_le_load:
	lbz	r0,1(r3)
	lbz	r4,2(r3)
	lbz	r5,3(r3)
	insrdi	r0,r4,8,48
	lbz	r4,4(r3)
	insrdi	r0,r5,8,40
	lbz	r5,5(r3)
	insrdi	r0,r4,8,32
	lbz	r4,6(r3)
	insrdi	r0,r5,8,24
	lbz	r5,7(r3)
	insrdi	r0,r4,8,16
	lbzu	r4,8(r3)
	insrdi	r0,r5,8,8
	insrdi	r0,r4,8,0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	dword_le_load,.-dword_le_load
___
}

{
# SHA3_absorb: r3 = state, r4 = input, r5 = length, r6 = block size in bytes.
# While at least one whole block remains, bsz/8 input lanes are xor-ed into
# the state (CTR counts the lanes; bdz leaves the chain once the block is
# consumed) and the state is permuted.  The number of bytes left over is
# returned in r3.
#
# The lane fetch clobbers r4/r5 on big-endian targets and KeccakF1600_int
# uses r3-r5 as scratch, so inp, len and bsz are parked in the $LOCALS slots
# around each block.  KeccakF1600_int leaves the iota pointer advanced by
# 24 entries, hence the rewind after every call.
#
# The purely mechanical instruction runs are generated here rather than
# spelled out; they expand to one instruction per register/lane.
my @lane_regs    = map { @$_ } @A;	# the 25 state registers in memory order
my $gpr_slot     = sub { $FRAME - $SIZE_T*(32 - $_[0]) };	# r31 topmost
my $save_gprs    = join("\n", map { "\t$PUSH\tr$_,".$gpr_slot->($_)."($sp)" } (14..31));
my $restore_gprs = join("\n", map { "\t$POP\tr$_,".$gpr_slot->($_)."($sp)" } (14..31));
my $load_state   = join("\n", map { "\tld\t$lane_regs[$_],".(8*$_)."(r3)" } (0..24));
my $store_state  = join("\n", map { "\tstd\t$lane_regs[$_],".(8*$_)."(r3)" } (0..24));
# fetch/xor per lane, with an early exit between lanes but not after the 25th
my $absorb_block = join("\n\tbdz\t.Lprocess_block\n",
			map { "\t$DWORD_LE_LOAD\t\t\t; *inp++\n\txor\t$_,$_,r0" } @lane_regs);

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
$save_gprs
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	bl	PICmeup
	subi	r4,r4,$LE_LOAD_SIZE		; prepare for ldu or lbzu
	subi	r12,r12,8			; prepare for ldu

	$PUSH	r3,`$LOCALS+0*$SIZE_T`($sp)	; save A[][]
	$PUSH	r4,`$LOCALS+1*$SIZE_T`($sp)	; save inp
	$PUSH	r5,`$LOCALS+2*$SIZE_T`($sp)	; save len
	$PUSH	r6,`$LOCALS+3*$SIZE_T`($sp)	; save bsz
	mr	r0,r6
	$PUSH	r12,`$LOCALS+4*$SIZE_T`($sp)

$load_state

	mr	r3,r4
	mr	r4,r5
	mr	r5,r0

	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	r4,r5				; len < bsz?
	blt	.Labsorbed

	sub	r4,r4,r5			; len -= bsz
	srwi	r5,r5,3
	$PUSH	r4,`$LOCALS+2*$SIZE_T`($sp)	; save len
	mtctr	r5
$absorb_block

.Lprocess_block:
	$PUSH	r3,`$LOCALS+1*$SIZE_T`($sp)	; save inp

	bl	KeccakF1600_int

	$POP	r0,`$LOCALS+4*$SIZE_T`($sp)	; pull iotas[24]
	$POP	r5,`$LOCALS+3*$SIZE_T`($sp)	; restore bsz
	$POP	r4,`$LOCALS+2*$SIZE_T`($sp)	; restore len
	$POP	r3,`$LOCALS+1*$SIZE_T`($sp)	; restore inp
	addic	r0,r0,`-8*24`			; rewind iotas
	$PUSH	r0,`$LOCALS+4*$SIZE_T`($sp)

	b	.Loop_absorb

.align	4
.Labsorbed:
	$POP	r3,`$LOCALS+0*$SIZE_T`($sp)
$store_state

	mr	r3,r4				; return value
	$POP	r0,`$FRAME+$LRSAVE`($sp)
$restore_gprs
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}

{
# SHA3_squeeze: r3 = state, r4 = output, r5 = length, r6 = block size in
# bytes, r7 = 'next' flag.  Lanes are written out least significant byte
# first; each time a block's worth has been emitted and output is still
# wanted, the state is permuted with KeccakF1600.  A non-zero 'next'
# permutes once before anything is emitted.
#
# Two details differ from a naive transcription:
#  * .Lsqueeze_tail is reached with 0..7 bytes outstanding, and 0 happens
#    exactly when the caller asked for no output at all.  bdnz decrements
#    CTR before testing it, so entering the byte loop with CTR = 0 would
#    store 2^64 bytes; the zero case therefore leaves early.
#  * The traceback table declares five fixed parameters, matching the
#    registers actually consumed (r3..r7).
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	$STU	$sp,`-10*$SIZE_T`($sp)
	mflr	r0
	$PUSH	r28,`6*$SIZE_T`($sp)
	$PUSH	r29,`7*$SIZE_T`($sp)
	$PUSH	r30,`8*$SIZE_T`($sp)
	$PUSH	r31,`9*$SIZE_T`($sp)
	$PUSH	r0,`10*$SIZE_T+$LRSAVE`($sp)

	mr	$A_flat,r3
	subi	r3,r3,8			; prepare for ldu
	subi	$out,r4,1		; prepare for stbu
	mr	$len,r5
	mr	$bsz,r6
	cmplwi	r7,0			; r7 = 'next' argument
	bne	.Lnext_block
	b	.Loop_squeeze

.align	4
.Loop_squeeze:
	ldu	r0,8(r3)
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stb	r0,1($out)
	srdi	r0,r0,8
	stb	r0,2($out)
	srdi	r0,r0,8
	stb	r0,3($out)
	srdi	r0,r0,8
	stb	r0,4($out)
	srdi	r0,r0,8
	stb	r0,5($out)
	srdi	r0,r0,8
	stb	r0,6($out)
	srdi	r0,r0,8
	stb	r0,7($out)
	srdi	r0,r0,8
	stbu	r0,8($out)

	subic.	$len,$len,8
	beq	.Lsqueeze_done

	subic.	r6,r6,8
	bgt	.Loop_squeeze

.Lnext_block:
	mr	r3,$A_flat
	bl	KeccakF1600
	subi	r3,$A_flat,8		; prepare for ldu
	mr	r6,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	${UCMP}i $len,0			; nothing left to emit?
	beq	.Lsqueeze_done		; CTR of 0 would not stop the loop
	mtctr	$len
.Loop_tail:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail

.Lsqueeze_done:
	$POP	r0,`10*$SIZE_T+$LRSAVE`($sp)
	$POP	r28,`6*$SIZE_T`($sp)
	$POP	r29,`7*$SIZE_T`($sp)
	$POP	r30,`8*$SIZE_T`($sp)
	$POP	r31,`9*$SIZE_T`($sp)
	mtlr	r0
	addi	$sp,$sp,`10*$SIZE_T`
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,5,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
#
# PICmeup returns the address of the iotas table in r12 without relying on
# relocations: the bcl/mflr pair yields the address of the second mflr, and
# the .space padding places iotas exactly 64 bytes after the PICmeup label,
# i.e. 64-8 bytes past that address.  r0 is used to preserve the caller's
# link register.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Expand the `...` arithmetic left in the template, then emit the result
# (STDOUT is the pipe to the translator opened near the top of the file).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";