#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane-complementing] KECCAK_2X implementation
# (see sha/keccak1600.c) with C[5] and D[5] held in the register bank.
# However, instead of actually unrolling the loop pair-wise, it simply
# flips the pointers to T[][] and A[][] at the end of each round. Since
# the number of rounds is even, the last round writes to A[][] and
# everything works out. How does it compare to the x86_64 assembly
# module in the Keccak Code Package? Depending on the processor it is
# either as fast, or faster by up to 15%...
#
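# For reference, the chi step of the straightforward formulation
# computes, per plane (a minimal sketch in C-like pseudo-code, cf.
# sha/keccak1600.c):
#
#	for (x = 0; x < 5; x++)
#		R[x] = C[x] ^ (~C[(x+1)%5] & C[(x+2)%5]);
#
# Keeping selected lanes complemented lets most of the five NOTs per
# plane cancel into and/or variants, so the round body below gets away
# with a single 'not' instruction per output plane.
#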
########################################################################
# Numbers are cycles per processed byte on a large message.
#
#			r=1088(*)
#
# P4			25.8
# Core 2		12.9
# Westmere		13.7
# Sandy Bridge		12.9(**)
# Haswell		9.6
# Skylake		9.4
# Silvermont		22.8
# Goldmont		15.8
# VIA Nano		17.3
# Sledgehammer		13.3
# Bulldozer		16.5
# Ryzen			8.8
#
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated
#	code varies a lot; the most common coefficient is 15% in
#	comparison to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can
#	be improved by 14% by replacing rotates with a double-precision
#	shift using the same register as source and destination.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
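
# The rho-step rotation constants of Keccak-f[1600], laid out to match
# the A[][] indexing above.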

$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
.cfi_startproc
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
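	# Theta: accumulate the column parities
	#	C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]
	# (the A[4][x] row is already in the C registers, loaded by the
	# preamble or left there by the tail of the previous round).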
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],         @C[0]
	xor	$A[0][1](%rdi),@C[1]
	 xor	$A[1][2](%rdi),@C[2]
	 xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],         @C[2]
	xor	$A[2][0](%rdi),@C[0]
	 xor	$A[1][3](%rdi),@C[3]
	 xor	@D[1],         @C[1]
	 xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	 xor	$A[2][3](%rdi),@C[3]
	 xor	$A[2][1](%rdi),@C[1]
	 xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	 xor	@D[3],         @C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	 xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	 xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
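	# Relabel registers rather than move data: the theta values
	# D[0..4] just computed sit in the old @C bank (rotated by one),
	# and the old @D bank, still holding the diagonal lanes A[x][x]
	# loaded at the top of the loop, becomes the new @C bank for the
	# chi steps of output plane R[0][*].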
	(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	 mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	 or	@C[2],@C[1]
	 xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	 xor	($iotas),@C[1]
	 lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	 mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	  mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	  mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	  mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	  mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])


	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	 mov	@C[0],@T[0]
	 or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	  mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])

	or	@C[3],@C[4]
	  mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	  mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])

	or	@C[2],@T[1]
	  mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
	  mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	 mov	@C[2],@T[0]
	 and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	  mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	  mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	  mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
	  mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	 mov	@C[2],@T[0]
	 or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
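	# Swap the state pointers: this round's output area becomes the
	# next round's input.  The number of rounds is even, so the final
	# round writes back into the caller's A[][].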
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
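	# The five theta'd and rho-rotated lanes of the last plane now
	# live in the D registers; relabel them as @C in the order the
	# R[4][*] chi steps below consume them.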
	@C = @D[2..4,0,1];
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

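	# iotas is 256-byte aligned and starts 64 bytes into its block
	# (see the zero padding before the table), so after 24 rounds,
	# i.e. 24*8 = 192 bytes, the low byte of the pointer reaches zero.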
	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.cfi_endproc
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

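	# Switch A[][] into the lane-complemented representation: these
	# six lanes stay bitwise inverted while __KeccakF1600 runs and
	# are restored below, which is what removes most chi-step NOTs.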
	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
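# The C-level contract (cf. the reference sha/keccak1600.c) is roughly
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t bsz);
#
# where bsz is the rate in bytes; the return value is the number of
# trailing input bytes (< bsz) left for the caller to buffer.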
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

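	# While at least one full bsz-byte block remains, XOR bsz/8 input
	# lanes into the state in place, then run the permutation.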
.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
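# Counterpart of SHA3_absorb (cf. sha/keccak1600.c), roughly
#
#	void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#	                  size_t len, size_t bsz);
#
# which copies out len bytes of keystream, permuting the state each
# time a full bsz-byte block has been emitted and more is needed.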
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

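	# Emit 8 bytes per iteration, falling back to rep movsb for a
	# tail shorter than 8 bytes; the state is re-permuted once bsz
	# bytes have been consumed and output is still outstanding.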
.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8, %rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep	movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
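# The 64 zero bytes below pad iotas to offset 64 within its 256-byte
# aligned block, which is what makes the low-byte loop-exit test in
# __KeccakF1600 fire exactly after 24 rounds.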
.align	256
	.quad	0,0,0,0,0,0,0,0
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	# The substitution below results in 11.2 on Sandy Bridge and 9.4
	# on Haswell, but it hurts other processors by up to 2-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# The substitution below results in 9.3 on Haswell, as well as on
	# Ryzen, i.e. it *hurts* Ryzen...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT or die "error closing STDOUT: $!";