#!/usr/bin/env perl
# Copyright 2017-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# It's pretty straightforward; the only "magic" is the data layout in
# registers. No single layout is optimal for every step, hence it changes
# as the algorithm progresses. Data is saved in linear order, but the
# in-register order morphs between rounds: even rounds take input in
# linear layout, odd rounds in transposed, or "vertically-shaped", form...
#
########################################################################
# Numbers are cycles per processed byte of a large message.
#
#			r=1088(*)
#
# Knights Landing	7.6
# Skylake-X		5.7
#
# (*)	Corresponds to SHA3-256.

########################################################################
# The code below is a combination of two ideas. One is taken from the
# Keccak Code Package, hereafter KCP, the other from the initial version
# of this module. Common to both is the observation that Pi's input and
# output are "mostly transposed", i.e. if the input is aligned by the x
# coordinate, then the output is [mostly] aligned by y. Both versions,
# KCP and this module's predecessor, tried to stick to one of the two
# layouts from round to round, which resulted in some kind of
# transposition in each round. This version still transposes the data,
# but only every second round. Another essential factor is that the KCP
# transposition has to be performed with instructions that turned out to
# be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. The initial version of this module, on the other hand,
# relied heavily on blend instructions. There were lots of them,
# resulting in a higher instruction count, yet it performed better on
# Knights Landing, because the processor can execute a pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring the best parts together :-)
#
# Coordinates below correspond to those in sha/keccak1600.c. The input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield a layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now, instead of performing a full transposition and feeding the result
# to the next, identical round, we perform a kind of diagonal
# transposition to the layout from the initial version of this module,
# and make it suitable for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Now intra-register permutations yield the initial, [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that the odd-round Chi is performed in a less suitable
# layout, with a number of additional permutations. But overall it
# turned out to be a win. The permutations are the fastest possible on
# Knights Landing and they are laid down to be independent of each
# other. In essence I traded 20 blend instructions for 3 permutations.
# The result is 13% faster than KCP on Skylake-X, and >40% on Knights
# Landing.
#
# As implied, data is loaded in straight linear order. The digits in the
# variables' names represent the coordinates of the right-most element
# of the loaded data chunk:

my ($A00,	# [0][4] [0][3] [0][2] [0][1] [0][0]
    $A10,	# [1][4] [1][3] [1][2] [1][1] [1][0]
    $A20,	# [2][4] [2][3] [2][2] [2][1] [2][0]
    $A30,	# [3][4] [3][3] [3][2] [3][1] [3][0]
    $A40) =	# [4][4] [4][3] [4][2] [4][1] [4][0]
    map("%zmm$_",(0..4));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
		[1,0], [1,1], [1,2], [1,3], [1,4],
		[2,0], [2,1], [2,2], [2,3], [2,4],
		[3,0], [3,1], [3,2], [3,3], [3,4],
		[4,0], [4,1], [4,2], [4,3], [4,4]);
   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear
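# For example, lane [2][3] lands at byte offset 8*(2*8+3) = 152: each
# 5-lane row is padded to a full 8-quad (64-byte) ZMM row in the stack
# transfer area, hence "jagged" relative to the 40-byte rows of the
# flat state representation.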

my @T        = map("%zmm$_",(5..12));
my @Theta    = map("%zmm$_",(33,13..16));	# invalid @Theta[0] is deliberate:
						# the identity permutation is never
						# loaded, so the register is unused
my @Pi0      = map("%zmm$_",(17..21));
my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));

my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$12,%eax
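	# %eax counts loop iterations; each pass through .Loop_avx512 below
	# covers two rounds (one even, one odd), so 12 iterations give the
	# full 24 rounds of Keccak-f[1600]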
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
	######################################### Theta, even round
	vmovdqa64	$A00,@T[0]		# put aside original A00
	vpternlogq	\$0x96,$A20,$A10,$A00	# and use it as "C00"
	vpternlogq	\$0x96,$A40,$A30,$A00
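	# 0x96 is the truth table of a three-way XOR, so each vpternlogq
	# above folds two XORs into one instruction; together they leave the
	# column parities C[x] = A[0][x]^..^A[4][x] in $A00's lanes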

	vprolq		\$1,$A00,$D00
	vpermq		$A00,@Theta[1],$A00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$A00,$D00,@T[0]	# T[0] is original A00
	vpternlogq	\$0x96,$A00,$D00,$A10
	vpternlogq	\$0x96,$A00,$D00,$A20
	vpternlogq	\$0x96,$A00,$D00,$A30
	vpternlogq	\$0x96,$A00,$D00,$A40

	######################################### Rho
	vprolvq		@Rhotate0[0],@T[0],$A00	# T[0] is original A00
	vprolvq		@Rhotate0[1],$A10,$A10
	vprolvq		@Rhotate0[2],$A20,$A20
	vprolvq		@Rhotate0[3],$A30,$A30
	vprolvq		@Rhotate0[4],$A40,$A40

	######################################### Pi
	vpermq		$A00,@Pi0[0],$A00
	vpermq		$A10,@Pi0[1],$A10
	vpermq		$A20,@Pi0[2],$A20
	vpermq		$A30,@Pi0[3],$A30
	vpermq		$A40,@Pi0[4],$A40

	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vmovdqa64	$A10,@T[1]
	vpternlogq	\$0xD2,$A20,$A10,$A00
	vpternlogq	\$0xD2,$A30,$A20,$A10
	vpternlogq	\$0xD2,$A40,$A30,$A20
	vpternlogq	\$0xD2,@T[0],$A40,$A30
	vpternlogq	\$0xD2,@T[1],@T[0],$A40
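	# 0xD2 is the truth table of dst ^ (~src1 & src2), i.e. a complete
	# Chi step per register in a single vpternlogq; $A00 and $A10 are
	# stashed in @T[0] and @T[1] first because the last two steps still
	# need their original values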

	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
	lea		16(%r10),%r10
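	# courtesy of the $k00001 merge mask only lane 0, element [0][0],
	# absorbs the round constant; %r10 advances by 16 per iteration:
	# the even round picks the constant at (%r10), the odd round the
	# following one at -8(%r10)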

	######################################### Harmonize rounds
	vpblendmq	$A20,$A10,@{T[1]}{$k00010}
	vpblendmq	$A30,$A20,@{T[2]}{$k00010}
	vpblendmq	$A40,$A30,@{T[3]}{$k00010}
	 vpblendmq	$A10,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A40,@{T[4]}{$k00010}

	vpblendmq	$A30,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A40,@T[2],@{T[2]}{$k00100}
	 vpblendmq	$A20,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A00,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A10,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A40,@T[1],@{T[1]}{$k01000}
	 vpblendmq	$A30,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A00,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A10,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A20,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A40,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A00,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A10,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A20,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A30,@T[4],@{T[4]}{$k10000}
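	# each blend pass above grafts one lane from a neighbouring register
	# under its k-mask, assembling the diagonals shown in the header
	# comment; blends are cheap on Knights Landing (a pair per cycle,
	# minimal latency), which is what makes this arrangement pay off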

	#vpermq		@T[0],@Theta[0],$A00	# doesn't actually change order
	vpermq		@T[1],@Theta[1],$A10
	vpermq		@T[2],@Theta[2],$A20
	vpermq		@T[3],@Theta[3],$A30
	vpermq		@T[4],@Theta[4],$A40

	######################################### Theta, odd round
	vmovdqa64	@T[0],$A00		# real A00
	vpternlogq	\$0x96,$A20,$A10,$C00	# C00 is @T[0]'s alias
	vpternlogq	\$0x96,$A40,$A30,$C00

	vprolq		\$1,$C00,$D00
	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A30
	vpternlogq	\$0x96,$C00,$D00,$A10
	vpternlogq	\$0x96,$C00,$D00,$A40
	vpternlogq	\$0x96,$C00,$D00,$A20

	######################################### Rho
	vprolvq		@Rhotate1[0],$A00,$A00
	vprolvq		@Rhotate1[3],$A30,@T[1]
	vprolvq		@Rhotate1[1],$A10,@T[2]
	vprolvq		@Rhotate1[4],$A40,@T[3]
	vprolvq		@Rhotate1[2],$A20,@T[4]

	 vpermq		$A00,@Theta[4],@T[5]
	 vpermq		$A00,@Theta[3],@T[6]

	######################################### Iota
	vpxorq		-8(%r10),$A00,${A00}{$k00001}

	######################################### Pi
	vpermq		@T[1],@Theta[2],$A10
	vpermq		@T[2],@Theta[4],$A20
	vpermq		@T[3],@Theta[1],$A30
	vpermq		@T[4],@Theta[3],$A40

	######################################### Chi
	vpternlogq	\$0xD2,@T[6],@T[5],$A00

	vpermq		@T[1],@Theta[1],@T[7]
	#vpermq		@T[1],@Theta[0],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[7],$A10

	vpermq		@T[2],@Theta[3],@T[0]
	vpermq		@T[2],@Theta[2],@T[2]
	vpternlogq	\$0xD2,@T[2],@T[0],$A20

	#vpermq		@T[3],@Theta[0],@T[3]
	vpermq		@T[3],@Theta[4],@T[1]
	vpternlogq	\$0xD2,@T[1],@T[3],$A30

	vpermq		@T[4],@Theta[2],@T[0]
	vpermq		@T[4],@Theta[1],@T[4]
	vpternlogq	\$0xD2,@T[4],@T[0],$A40

	dec		%eax
	jnz		.Loop_avx512

	ret
.size	__KeccakF1600,.-__KeccakF1600
___

my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-320(%rsp),%rsp
	and	\$-64,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	128(%rsp),%r9

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000
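	# kxnorw sets all 16 mask bits; the shifts then carve out the values
	# the names suggest: $k00001 = 0x0001 selects lane 0, $k00010 through
	# $k10000 select lanes 1-4, and $k11111 = 0x001f covers the five live
	# lanes of a row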

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
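	# pre-zeroing matters: .Labsorbed_avx512 XORs full 64-byte rows into
	# the state, so every lane the unrolled copy below never writes must
	# contribute zero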
	jmp		.Loop_absorb_avx512

.align	32
.Loop_absorb_avx512:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512

	shr		\$3,%eax
___
for(my $i=0; $i<25; $i++) {
$code.=<<___
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-128(%r9)
	dec	%eax
	jz	.Labsorbed_avx512
___
}
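# The loop above emits a fully unrolled copy of up to 25 quad-words from
# the input into the jagged transfer area, bailing out to
# .Labsorbed_avx512 as soon as %eax, the block size in quad-words, is
# exhausted.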
$code.=<<___;
.Labsorbed_avx512:
	lea	($inp,$bsz),$inp

	vpxorq	64*0-128(%r9),$A00,$A00
	vpxorq	64*1-128(%r9),$A10,$A10
	vpxorq	64*2-128(%r9),$A20,$A20
	vpxorq	64*3-128(%r9),$A30,$A30
	vpxorq	64*4-128(%r9),$A40,$A40

	call	__KeccakF1600

	jmp	.Loop_absorb_avx512

.align	32
.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	cmp	$bsz,$len
	jbe	.Lno_output_extension_avx512

	lea		theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate1[0]
	vmovdqa64	64*6(%r8),@Rhotate1[1]
	vmovdqa64	64*7(%r8),@Rhotate1[2]
	vmovdqa64	64*8(%r8),@Rhotate1[3]
	vmovdqa64	64*9(%r8),@Rhotate1[4]

	vmovdqa64	64*10(%r8),@Rhotate0[0]
	vmovdqa64	64*11(%r8),@Rhotate0[1]
	vmovdqa64	64*12(%r8),@Rhotate0[2]
	vmovdqa64	64*13(%r8),@Rhotate0[3]
	vmovdqa64	64*14(%r8),@Rhotate0[4]

	vmovdqa64	64*15(%r8),@Pi0[0]
	vmovdqa64	64*16(%r8),@Pi0[1]
	vmovdqa64	64*17(%r8),@Pi0[2]
	vmovdqa64	64*18(%r8),@Pi0[3]
	vmovdqa64	64*19(%r8),@Pi0[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A10}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A20}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A30}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A40}{$k11111}{z}

.Lno_output_extension_avx512:
	shr	\$3,$bsz
	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.align	32
.Loop_squeeze_avx512:
	cmp	\$8,$len
	jb	.Ltail_squeeze_avx512

	mov	(%r9),%r8
	lea	8(%r9),%r9
	mov	%r8,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze_avx512

	sub	\$1,%rax		# bsz--
	jnz	.Loop_squeeze_avx512

	#vpermq		@Theta[4],@Theta[4],@Theta[3]
	#vpermq		@Theta[3],@Theta[4],@Theta[2]
	#vpermq		@Theta[3],@Theta[3],@Theta[1]

	call		__KeccakF1600

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A10,40*1-96($A_flat){$k11111}
	vmovdqu64	$A20,40*2-96($A_flat){$k11111}
	vmovdqu64	$A30,40*3-96($A_flat){$k11111}
	vmovdqu64	$A40,40*4-96($A_flat){$k11111}

	lea	-96($A_flat),%r9
	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:
	mov	$out,%rdi
	mov	%r9,%rsi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb
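	# the rep movsb above copies the final, sub-quad-word tail byte by
	# byte; %rdi/%rsi/%rcx are loaded per the string-move convention,
	# and since $out aliases %rsi it has to be moved to %rdi first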

.Ldone_squeeze_avx512:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7
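	# each theta_perm row rotates lanes 0-4 by one position more than
	# the row above it; lanes 5-7 map to themselves, so the unused upper
	# quads of the state registers stay put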

rhotates1:
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,  62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

rhotates0:
	.quad	 0,  1, 62, 28, 27, 0, 0, 0
	.quad	36, 44,  6, 55, 20, 0, 0, 0
	.quad	 3, 10, 43, 25, 39, 0, 0, 0
	.quad	41, 45, 15, 21,  8, 0, 0, 0
	.quad	18,  2, 61, 56, 14, 0, 0, 0
pi0_perm:
	.quad	0, 3, 1, 4, 2, 5, 6, 7
	.quad	1, 4, 2, 0, 3, 5, 6, 7
	.quad	2, 0, 3, 1, 4, 5, 6, 7
	.quad	3, 1, 4, 2, 0, 5, 6, 7
	.quad	4, 2, 0, 3, 1, 5, 6, 7

iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";