#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PowerISA 2.07.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT SIMD implementation, but with
# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
# POWER8 processor spends 9.8 cycles to process byte out of large
# buffer for r=1088, which matches SHA3-256. This is 17% better than
# scalar PPC64 code. It probably should be noted that if POWER8's
# successor can achieve higher scalar instruction issue rate, then
# this module will lose... And it does on POWER9 with 12.0 vs. 9.4.

use strict;
use warnings;

########################################################################
# Command line: <flavour> [<output>]
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop   : undef;
my $flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift : undef;

# A missing flavour is rejected below with the same "nonsense " message
# as before; normalising it here only keeps "use warnings" quiet.
$flavour = "" unless defined $flavour;

# ABI-dependent mnemonics and sizes used by the templates below.
my ($SIZE_T, $LRSAVE, $UCMP, $STU, $PUSH);

if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $LRSAVE = 2*$SIZE_T;
    $UCMP   = "cmpld";
    $STU    = "stdu";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $LRSAVE = $SIZE_T;
    $STU    = "stwu";
    $PUSH   = "stw";
    $UCMP   = "cmplw";
} else { die "nonsense $flavour"; }

# Locate ppc-xlate.pl next to this script or in ../../perlasm/.
# $dir is empty when $0 carries no directory component.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $xlate;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# The interpreter and script paths are quoted so that directories
# containing spaces survive the shell; an absent output name is passed
# as an empty argument, exactly as before.
my $out_arg = defined $output ? $output : "";
open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$out_arg\""
    or die "can't call $xlate: $!";

my $FRAME = 6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp    = "r1";

my $iotas = "r12";

my $code  = "";

########################################################################
# Template helpers.  Each returns assembly text; none of them touches
# $code directly.  They reproduce, instruction for instruction, the
# sequences that were previously written out by hand several times.

# vr_spill($op): move v20-v31 to/from the stack save area.
#   $op - "stvx" to save, "lvx" to restore.
# Expects r10/r11 to hold the offsets of the first two slots; the two
# index registers leap-frog each other in 32-byte steps and are not
# advanced after their last use.
sub vr_spill {
    my ($op) = @_;
    my $asm = "";

    foreach my $i (0..11) {
	my $ptr = ($i & 1) ? "r11" : "r10";
	$asm .= "\t$op\tv".(20+$i).",$ptr,$sp\n";
	$asm .= "\taddi\t$ptr,$ptr,32\n"	if ($i < 10);
    }
    return $asm;
}

# frame_enter(): allocate the frame, save v20-v31, vrsave and LR.
# Leaves the caller's LR in r8 and the caller's vrsave in r7.
# The save-area offsets are 15+6*SIZE_T and 31+6*SIZE_T; presumably
# this relies on stvx/lvx ignoring the low four address bits to get a
# 16-byte aligned area (hence 13 rather than 12 slots in $FRAME).
sub frame_enter {
    my $asm = <<___;
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
___
    $asm .= vr_spill("stvx");
    $asm .= <<___;
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers
___
    return $asm;
}

# frame_leave(): undo frame_enter() and return to the caller.
# LR is restored from r8 and vrsave from r7, so both registers must
# still hold the values frame_enter() put there.
sub frame_leave {
    my $asm = <<___;
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
___
    $asm .= vr_spill("lvx");
    $asm .= <<___;
	addi	$sp,$sp,$FRAME
	blr
___
    return $asm;
}

# vec13_xfer($op, $tail_op, $first, $base, $note): transfer 13
# consecutive vector registers v$first..v($first+12) to/from 16-byte
# slots at $base+0, $base+16, ... $base+192.
#   $op      - mnemonic used for the first 12 registers
#   $tail_op - mnemonic used for the 13th register
#   $note    - comment attached to the first transfer
# Clobbers r10 and r11, which are used as alternating offsets.
sub vec13_xfer {
    my ($op, $tail_op, $first, $base, $note) = @_;
    my $asm = "\tli\tr11,16\n".
	      "\t$op\tv$first,0,$base\t\t; $note\n".
	      "\tli\tr10,32\n";

    foreach my $i (1..12) {
	my $ptr  = ($i & 1) ? "r11" : "r10";
	my $insn = ($i == 12) ? $tail_op : $op;
	$asm .= "\t$insn\tv".($first+$i).",$ptr,$base\n";
	$asm .= "\taddi\t$ptr,$ptr,32\n"	if ($i < 11);
    }
    return $asm;
}

# absorb_lanes($inp): xor up to 25 consecutive 8-byte input lanes into
# the state registers.  CTR must hold the number of lanes in a block;
# the chain leaves for .Lprocess_block as soon as CTR runs out.
# v27 must be zero; v30/v31 are the permutation masks that route the
# loaded lane to the first or to the second doubleword respectively.
sub absorb_lanes {
    my ($inp) = @_;
    # [destination, mask] in input order, following the register layout.
    my @lanes = (
	(map { [ "v$_", "v30" ] } (0..4)),	# A[0][0..4]
	(map { [ "v$_", "v31" ] } (0..4)),	# A[1][0..4]
	(map { [ "v$_", "v30" ] } (5..9)),	# A[2][0..4]
	(map { [ "v$_", "v31" ] } (5..9)),	# A[3][0..4]
	[ "v10", "v30" ], [ "v10", "v31" ],	# A[4][0..1]
	[ "v11", "v30" ], [ "v11", "v31" ],	# A[4][2..3]
	[ "v12", "v31" ],			# A[4][4]
    );
    my $asm = "";

    foreach my $n (0..$#lanes) {
	my ($dst, $mask) = @{$lanes[$n]};
	$asm .= <<___;
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,$mask
	vxor	$dst,$dst,v26
___
	# no early exit after the very last lane, it falls through
	$asm .= "\tbdz\t.Lprocess_block\n"	if ($n < $#lanes);
    }
    return $asm;
}

########################################################################
# Register layout:
#
# v0		A[0][0] A[1][0]
# v1		A[0][1] A[1][1]
# v2		A[0][2] A[1][2]
# v3		A[0][3] A[1][3]
# v4		A[0][4] A[1][4]
#
# v5		A[2][0] A[3][0]
# v6		A[2][1] A[3][1]
# v7		A[2][2] A[3][2]
# v8		A[2][3] A[3][3]
# v9		A[2][4] A[3][4]
#
# v10		A[4][0] A[4][1]
# v11		A[4][2] A[4][3]
# v12		A[4][4] A[4][4]
#
# v13..25	rhotates[][]
# v26..31	volatile
#
# KeccakF1600_int: 24 rounds over the state held in v0-v12.  Expects
# v13-v25 to hold the rotation counts and r12 to point at iotas; r0 is
# used as the byte offset of the current round constant.  The routine
# is hand-scheduled, do not reorder.
$code.=<<___;
.machine	"any"
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	li	r0,0
	b	.Loop

.align	4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor	v26,v0, v5		; A[0..1][0]^A[2..3][0]
	vxor	v27,v1, v6		; A[0..1][1]^A[2..3][1]
	vxor	v28,v2, v7		; A[0..1][2]^A[2..3][2]
	vxor	v29,v3, v8		; A[0..1][3]^A[2..3][3]
	vxor	v30,v4, v9		; A[0..1][4]^A[2..3][4]
	vpermdi	v31,v26,v27,0b00	; A[0][0..1]^A[2][0..1]
	vpermdi	v26,v26,v27,0b11	; A[1][0..1]^A[3][0..1]
	vpermdi	v27,v28,v29,0b00	; A[0][2..3]^A[2][2..3]
	vpermdi	v28,v28,v29,0b11	; A[1][2..3]^A[3][2..3]
	vpermdi	v29,v30,v30,0b10	; A[1..0][4]^A[3..2][4]
	vxor	v26,v26,v31		; C[0..1]
	vxor	v27,v27,v28		; C[2..3]
	vxor	v28,v29,v30		; C[4..4]
	vspltisb v31,1
	vxor	v26,v26,v10		; C[0..1] ^= A[4][0..1]
	vxor	v27,v27,v11		; C[2..3] ^= A[4][2..3]
	vxor	v28,v28,v12		; C[4..4] ^= A[4][4..4], low!

	vrld	v29,v26,v31		; ROL64(C[0..1],1)
	vrld	v30,v27,v31		; ROL64(C[2..3],1)
	vrld	v31,v28,v31		; ROL64(C[4..4],1)
	vpermdi	v31,v31,v29,0b10
	vxor	v26,v26,v30		; C[0..1] ^= ROL64(C[2..3],1)
	vxor	v27,v27,v31		; C[2..3] ^= ROL64(C[4..0],1)
	vxor	v28,v28,v29		; C[4..4] ^= ROL64(C[0..1],1), low!

	vpermdi	v29,v26,v26,0b00	; C[0..0]
	vpermdi	v30,v28,v26,0b10	; C[4..0]
	vpermdi	v31,v28,v28,0b11	; C[4..4]
	vxor	v1, v1, v29		; A[0..1][1] ^= C[0..0]
	vxor	v6, v6, v29		; A[2..3][1] ^= C[0..0]
	vxor	v10,v10,v30		; A[4][0..1] ^= C[4..0]
	vxor	v0, v0, v31		; A[0..1][0] ^= C[4..4]
	vxor	v5, v5, v31		; A[2..3][0] ^= C[4..4]

	vpermdi	v29,v27,v27,0b00	; C[2..2]
	vpermdi	v30,v26,v26,0b11	; C[1..1]
	vpermdi	v31,v26,v27,0b10	; C[1..2]
	vxor	v3, v3, v29		; A[0..1][3] ^= C[2..2]
	vxor	v8, v8, v29		; A[2..3][3] ^= C[2..2]
	vxor	v2, v2, v30		; A[0..1][2] ^= C[1..1]
	vxor	v7, v7, v30		; A[2..3][2] ^= C[1..1]
	vxor	v11,v11,v31		; A[4][2..3] ^= C[1..2]

	vpermdi	v29,v27,v27,0b11	; C[3..3]
	vxor	v4, v4, v29		; A[0..1][4] ^= C[3..3]
	vxor	v9, v9, v29		; A[2..3][4] ^= C[3..3]
	vxor	v12,v12,v29		; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	vrld	v26,v0, v13		; v0
	vrld	v1, v1, v14
	vrld	v27,v2, v15		; v2
	vrld	v28,v3, v16		; v3
	vrld	v4, v4, v17
	vrld	v5, v5, v18
	vrld	v6, v6, v19
	vrld	v29,v7, v20		; v7
	vrld	v8, v8, v21
	vrld	v9, v9, v22
	vrld	v10,v10,v23
	vrld	v30,v11,v24		; v11
	vrld	v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	vpermdi	v0, v26,v28,0b00	; [0][0] [1][0] < [0][0] [0][3]
	vpermdi	v2, v29,v5, 0b00	; [0][2] [1][2] < [2][2] [2][0]
	vpermdi	v11,v9, v5, 0b01	; [4][2] [4][3] < [2][4] [3][0]
	vpermdi	v5, v1, v4, 0b00	; [2][0] [3][0] < [0][1] [0][4]
	vpermdi	v1, v1, v4, 0b11	; [0][1] [1][1] < [1][1] [1][4]
	vpermdi	v3, v8, v6, 0b11	; [0][3] [1][3] < [3][3] [3][1]
	vpermdi	v4, v12,v30,0b10	; [0][4] [1][4] < [4][4] [4][2]
	vpermdi	v7, v8, v6, 0b00	; [2][2] [3][2] < [2][3] [2][1]
	vpermdi	v6, v27,v26,0b11	; [2][1] [3][1] < [1][2] [1][0]
	vpermdi	v8, v9, v29,0b11	; [2][3] [3][3] < [3][4] [3][2]
	vpermdi	v12,v10,v10,0b11	; [4][4] [4][4] < [4][1] [4][1]
	vpermdi	v9, v10,v30,0b01	; [2][4] [3][4] < [4][0] [4][3]
	vpermdi	v10,v27,v28,0b01	; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u	v31,$iotas,r0		; iotas[index]
	addic	r0,r0,16		; index++

	vandc	v26,v2, v1		; (~A[0..1][1] & A[0..1][2])
	vandc	v27,v3, v2		; (~A[0..1][2] & A[0..1][3])
	vandc	v28,v4, v3		; (~A[0..1][3] & A[0..1][4])
	vandc	v29,v0, v4		; (~A[0..1][4] & A[0..1][0])
	vandc	v30,v1, v0		; (~A[0..1][0] & A[0..1][1])
	vxor	v0, v0, v26		; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor	v1, v1, v27		; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor	v2, v2, v28		; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor	v3, v3, v29		; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor	v4, v4, v30		; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc	v26,v7, v6		; (~A[2..3][1] & A[2..3][2])
	vandc	v27,v8, v7		; (~A[2..3][2] & A[2..3][3])
	vandc	v28,v9, v8		; (~A[2..3][3] & A[2..3][4])
	vandc	v29,v5, v9		; (~A[2..3][4] & A[2..3][0])
	vandc	v30,v6, v5		; (~A[2..3][0] & A[2..3][1])
	vxor	v5, v5, v26		; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor	v6, v6, v27		; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor	v7, v7, v28		; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor	v8, v8, v29		; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor	v9, v9, v30		; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor	v0, v0, v31		; A[0][0] ^= iotas[index++]

	vpermdi	v26,v10,v11,0b10	; A[4][1..2]
	vpermdi	v27,v12,v10,0b00	; A[4][4..0]
	vpermdi	v28,v11,v12,0b10	; A[4][3..4]
	vpermdi	v29,v10,v10,0b10	; A[4][1..0]
	vandc	v26,v11,v26		; (~A[4][1..2] & A[4][2..3])
	vandc	v27,v27,v28		; (~A[4][3..4] & A[4][4..0])
	vandc	v28,v10,v29		; (~A[4][1..0] & A[4][0..1])
	vxor	v10,v10,v26		; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor	v11,v11,v27		; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor	v12,v12,v28		; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz	.Loop

	vpermdi	v12,v12,v12,0b11	; broadcast A[4][4]
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int
___

########################################################################
# KeccakF1600: r3 points at the state.  Loads the state into v0-v12,
# loads the rotation counts, runs KeccakF1600_int and stores the state
# back.  Uses r0, r7, r8, r10, r11 and r12 as scratch.
$code.=<<___;

.type	KeccakF1600,\@function
.align	5
KeccakF1600:
___
$code.=frame_enter();
$code.="\n";
$code.=vec13_xfer("lvx_4w","lvx_splt",0,"r3","load A[5][5]");
$code.=<<___;

	bl	PICmeup

___
$code.=vec13_xfer("lvx_u","lvx_u",13,"r12","load rhotates");
$code.=<<___;
	addi	r12,r12,`16*16`		; points at iotas

	bl	KeccakF1600_int

___
$code.=vec13_xfer("stvx_4w","stvdx_u",0,"r3","return A[5][5]");
$code.="\n";
$code.=frame_leave();
$code.=<<___;
	.long	0
	.byte	0,12,0x04,1,0x80,0,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___

########################################################################
# SHA3_absorb: arguments in r3..r6 are the state, the input pointer,
# the input length and the block size.  Whole blocks are xor-ed into
# the state and permuted while len >= bsz; the number of bytes left
# over is returned in r3.
{
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
___
$code.=frame_enter();
$code.="\n";
$code.=vec13_xfer("lvx_4w","lvx_splt",0,$A_jagged,"load A[5][5]");
$code.=<<___;

	bl	PICmeup

___
$code.=vec13_xfer("lvx_u","lvx_u",13,"r12","load rhotates");
$code.=<<___;
	li	r10,-32
	li	r11,-16
	addi	r12,r12,`16*16`		; points at iotas
	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	$len,$bsz		; len < bsz?
	blt	.Labsorbed

	sub	$len,$len,$bsz		; len -= bsz
	srwi	r0,$bsz,3
	mtctr	r0

	lvx_u	v30,r10,r12		; permutation masks
	lvx_u	v31,r11,r12
	?vspltisb v27,7			; prepare masks for byte swap
	?vxor	v30,v30,v27		; on big-endian
	?vxor	v31,v31,v27

	vxor	v27,v27,v27		; zero
___
$code.=absorb_lanes($inp);
$code.=<<___;

.Lprocess_block:
	bl	KeccakF1600_int

	b	.Loop_absorb

.align	4
.Labsorbed:
___
$code.=vec13_xfer("stvx_4w","stvdx_u",0,$A_jagged,"return A[5][5]");
$code.=<<___;

	mr	r3,$len			; return value
___
$code.=frame_leave();
$code.=<<___;
	.long	0
	.byte	0,12,0x04,1,0x80,0,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}

########################################################################
# SHA3_squeeze: arguments in r3..r6 are the state, the output pointer,
# the output length and the block size.  Output is written a byte at a
# time; r11 walks the state in the order the vectors were stored and
# KeccakF1600 is called each time bsz bytes have been produced.
{
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	mflr	r9			; r9 is not touched by KeccakF1600
	subi	$out,$out,1		; prepare for stbu
	addi	r8,$A_jagged,4		; prepare volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze
.align	4
.Loop_squeeze:
	lwzx	r7,r11,r8		; lo
	lwzx	r0,r11,$A_jagged	; hi
	${UCMP}i $len,8
	blt	.Lsqueeze_tail

	stbu	r7,1($out)		; write lo
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	stbu	r0,1($out)		; write hi
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)

	subic.	$len,$len,8
	beqlr				; return if done

	subic.	r10,r10,8
	ble	.Loutput_expand

	addi	r11,r11,16		; calculate jagged index
	cmplwi	r11,`16*5`
	blt	.Loop_squeeze
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	cmplwi	r11,`16*5+8`
	subi	r11,r11,8
	beq	.Loop_squeeze
	addi	r11,r11,8
	cmplwi	r11,`16*10`
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	blt	.Loop_squeeze
	subi	r11,r11,8
	b	.Loop_squeeze

.align	4
.Loutput_expand:
	bl	KeccakF1600
	mtlr	r9

	addi	r8,$A_jagged,4		; restore volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
	subic.	$len,$len,4
	ble	.Loop_tail_lo
	li	r8,4
	mtctr	r8
.Loop_tail_lo:
	stbu	r7,1($out)
	srdi	r7,r7,8
	bdnz	.Loop_tail_lo
	ble	.Lsqueeze_done
	mtctr	$len
.Loop_tail_hi:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail_hi

.Lsqueeze_done:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}

########################################################################
# PICmeup returns the address of rhotates in r12, computed relative to
# its own location, so the padding and alignment below must stay in
# step with the `64-8` displacement.  The data that follows is laid
# out as: 13 rotation vectors, one zero vector, two permutation masks
# (addressed as iotas-32 and iotas-16 by SHA3_absorb), then the 24
# round constants, which is why callers add 16*16 to reach iotas.
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12   ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	rhotates,\@object
.align	6
rhotates:
	.quad	0,  36
	.quad	1,  44
	.quad	62,  6
	.quad	28, 55
	.quad	27, 20
	.quad	3,  41
	.quad	10, 45
	.quad	43, 15
	.quad	25, 21
	.quad	39,  8
	.quad	18,  2
	.quad	61, 56
	.quad	14, 14
.size	rhotates,.-rhotates
	.quad	0,0
	.quad	0x0001020304050607,0x1011121314151617
	.quad	0x1011121314151617,0x0001020304050607
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001,0
	.quad	0x0000000000008082,0
	.quad	0x800000000000808a,0
	.quad	0x8000000080008000,0
	.quad	0x000000000000808b,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008009,0
	.quad	0x000000000000008a,0
	.quad	0x0000000000000088,0
	.quad	0x0000000080008009,0
	.quad	0x000000008000000a,0
	.quad	0x000000008000808b,0
	.quad	0x800000000000008b,0
	.quad	0x8000000000008089,0
	.quad	0x8000000000008003,0
	.quad	0x8000000000008002,0
	.quad	0x8000000000000080,0
	.quad	0x000000000000800a,0
	.quad	0x800000008000000a,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008080,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008008,0
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___

########################################################################
# Post-process and emit.  Back-ticked expressions are evaluated as
# Perl arithmetic, and instructions prefixed with '?' are kept on
# big-endian flavours and commented out on little-endian ones.
my $little_endian = ($flavour =~ /le$/);

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    if ($little_endian) {	# little-endian
	s/\?([a-z]+)/;$1/;
    } else {			# big-endian
	s/\?([a-z]+)/$1/;
    }

    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";