#!/usr/bin/env perl
# Copyright 2017-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# It is pretty straightforward; the only "magic" is the data layout in
# registers. It's impossible to have one layout that is optimal for
# every step, hence it changes as the algorithm progresses. Data is
# saved in linear order, but the in-register order morphs between
# rounds: even rounds take input in linear layout, and odd rounds take
# it transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.

########################################################################
# The code below is a combination of two ideas. One is taken from the
# Keccak Code Package, hereafter KCP, and the other from the initial
# version of this module. Common to both is the observation that Pi's
# input and output are "mostly transposed", i.e. if the input is
# aligned by the x coordinate, then the output is [mostly] aligned by
# y. Both versions, KCP and this module's predecessor, tried to stick
# with one of the layouts from round to round, which required some kind
# of transposition in every round. This version still transposes data,
# but only every second round. Another essential factor is that the KCP
# transposition has to be performed with instructions that turn out to
# be rather expensive on Knights Landing, both latency- and
# throughput-wise, not to mention that some of them depend on each
# other. The initial version of this module, on the other hand, relied
# heavily on blend instructions. There were lots of them, resulting in
# a higher instruction count, yet it performed better on Knights
# Landing, because the processor can execute a pair of them each cycle
# and they have minimal latency. This module is an attempt to bring the
# best parts together:-)
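#
# The two vpternlogq immediates used throughout encode 3-input truth
# tables, with the table index formed per bit as (dst<<2)|(src1<<1)|src2:
#
#	0x96 = dst ^ src1 ^ src2	- three-way XOR, used for the
#					  Theta parity and lane absorption;
#	0xD2 = dst ^ (~src1 & src2)	- the Chi non-linearity.
#
# Or, as a sketch in the C terms of sha/keccak1600.c:
#
#	C[x]     = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];
#	A[y][x] ^= ~A[y][(x+1)%5] & A[y][(x+2)%5];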
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield a layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now, instead of performing a full transposition and feeding it to the
# next identical round, we perform a kind of diagonal transposition to
# the layout from the initial version of this module, making it
# suitable for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Intra-register permutations then yield the initial [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that the odd-round Chi is performed in a less suitable
# layout, with a number of additional permutations. But overall it
# turned out to be a win. The permutations are the fastest possible on
# Knights Landing and they are laid down to be independent of each
# other. In essence I traded 20 blend instructions for 3 permutations.
# The result is 13% faster than KCP on Skylake-X, and >40% on Knights
# Landing.
#
# As implied, data is loaded in straight linear order. The digits in
# the variables' names give the coordinates of the right-most element
# of the loaded data chunk:

my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
    map("%zmm$_",(0..4));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
		[1,0], [1,1], [1,2], [1,3], [1,4],
		[2,0], [2,1], [2,2], [2,3], [2,4],
		[3,0], [3,1], [3,2], [3,3], [3,4],
		[4,0], [4,1], [4,2], [4,3], [4,4]);
   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear
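
# Each 5-lane row thus occupies a 64-byte stride, one %zmm register's
# worth, with the top 3 quadwords of every row unused. A sketch of the
# offset rule above, in Perl:
#
#	sub jagged_offset { my ($y,$x) = @_; 8*(8*$y + $x) }
#
# e.g. lane [2][3] lands at byte offset 8*(8*2+3) = 152 in the on-stack
# transfer area, while in the caller's flat representation it sits at
# 40*2 + 8*3 = 104; the absorb loop below bridges the two.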

my @T        = map("%zmm$_",(5..12));
my @Theta    = map("%zmm$_",(33,13..16));	# invalid @Theta[0] is not a typo,
						# that entry is never loaded or applied
my @Pi0      = map("%zmm$_",(17..21));
my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));

my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40
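
	# The two vpermq above lined up, for every lane x, C[x-1]
	# (first permutation) and ROL64(C[x+1],1) (second permutation),
	# so the five 0x96 ternlogs just computed, in C terms (a sketch):
	#
	#	D[x]     = C[(x+4)%5] ^ ROL64(C[(x+1)%5], 1);
	#	A[y][x] ^= D[x];	/* once per row, five rows */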

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	$T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	vpermq		$A00,@Theta[4],@T[5]
	vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___

my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-320(%rsp),%rsp
	and	\$-64,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	lea	theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000
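
	# Mask construction sketch: kxnorw of a register with itself
	# yields all ones, so with 16-bit k-registers the above gives
	#
	#	k00001 = 0xffff >> 15 = 0b00001	- lane 0 only (Iota)
	#	k11111 = 0xffff >> 11 = 0b11111	- one full 5-lane row
	#	k00010..k10000 = k00001 << 1..4	- single lanes 1 to 4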

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov	$bsz,%rax
	sub	$bsz,$len
	jc	.Ldone_absorb_avx512

	shr	\$3,%eax
___
for(my $i=0; $i<25; $i++) {
$code.=<<___;
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-128(%r9)
	dec	%eax
	jz	.Labsorbed_avx512
___
}
$code.=<<___;
.Labsorbed_avx512:
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

	jmp	.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb
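
# SHA3_absorb consumes as many whole bsz-byte blocks as the input
# holds, permuting after each, and returns the number of leftover
# bytes: at .Ldone_absorb_avx512 the last subtraction has driven the
# remaining length negative, so adding bsz back yields the residue the
# caller is expected to buffer. In C terms the entry point corresponds
# to the prototype used by sha/keccak1600.c (a sketch, modulo any
# arguments added in later OpenSSL versions):
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t r);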

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	lea	theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq	@Theta[4],@Theta[4],@Theta[3]
	#vpermq	@Theta[3],@Theta[4],@Theta[2]
	#vpermq	@Theta[3],@Theta[3],@Theta[1]

	call	__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov	$out,%rdi
	mov	%r9,%rsi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze_avx512:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,  62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	 0,  1, 62, 28, 27, 0, 0, 0
	.quad	36, 44,  6, 55, 20, 0, 0, 0
	.quad	 3, 10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21,  8, 0, 0, 0
	.quad	18,  2, 61, 56, 14, 0, 0, 0

pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7

iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";