#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.

#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3x faster than scalar code.
# But to amortize setup overheads on short inputs, a transliteration
# of the VSX code path from the chacha-ppc module, which is also
# 4x"vertical", handles inputs not longer than 256 bytes.

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
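
# Illustrative invocations (the output file name is an example; any
# flavour string not matching 31/32 selects the 64-bit zSeries ABI):
#
#	perl chacha-s390x.pl 64 chacha-s390x.S	# 64-bit
#	perl chacha-s390x.pl 31 chacha-s390x.S	# 31-bit S/390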

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;
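# (the ABI's standard stack frame: 16 pointer-sized slots plus 4 FPR
#  slots, i.e. 96 bytes on 31-bit S/390 and 160 bytes on 64-bit zSeries)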

sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);
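	# The expression ($_&~3)+(($_+1)&3) rotates an index within its
	# aligned group of four, e.g. (0,4,8,12) -> (1,5,9,13) for the
	# even round and (0,5,10,15) -> (1,6,11,12) for the odd one,
	# matching the table below.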

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' values are permanently allocated in
	# registers, @x[0..7,12..15], while the 'c' values are maintained
	# in memory. If you observe the 'c' column, you'll notice that a
	# pair of 'c's is invariant between rounds. This means that we
	# have to reload them once per round, in the middle. This is why
	# you'll see 'c' stores and loads in the middle, but none at the
	# beginning or end.
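
	# For reference, each column (Q1-Q4 below) performs one ChaCha
	# quarter round (RFC 7539) on (a,b,c,d):
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<=  8;
	#	c += d; b ^= c; b <<<=  7;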

	alr	(@x[$a0],@x[$b0]);	# Q1
	 alr	(@x[$a1],@x[$b1]);	# Q2
	xr	(@x[$d0],@x[$a0]);
	 xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],16);
	 rll	(@x[$d1],@x[$d1],16);

	alr	($xc,@x[$d0]);
	 alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	 xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],12);
	 rll	(@x[$b1],@x[$b1],12);

	alr	(@x[$a0],@x[$b0]);
	 alr	(@x[$a1],@x[$b1]);
	xr	(@x[$d0],@x[$a0]);
	 xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],8);
	 rll	(@x[$d1],@x[$d1],8);

	alr	($xc,@x[$d0]);
	 alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	 xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],7);
	 rll	(@x[$b1],@x[$b1],7);

	stm	($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");	# offload current 'c' pair
	lm	($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");	# load next 'c' pair

	alr	(@x[$a2],@x[$b2]);	# Q3
	 alr	(@x[$a3],@x[$b3]);	# Q4
	xr	(@x[$d2],@x[$a2]);
	 xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],16);
	 rll	(@x[$d3],@x[$d3],16);

	alr	($xc,@x[$d2]);
	 alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	 xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],12);
	 rll	(@x[$b3],@x[$b3],12);

	alr	(@x[$a2],@x[$b2]);
	 alr	(@x[$a3],@x[$b3]);
	xr	(@x[$d2],@x[$a2]);
	 xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],8);
	 rll	(@x[$d3],@x[$d3],8);

	alr	($xc,@x[$d2]);
	 alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	 xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],7);
	 rll	(@x[$b3],@x[$b3],7);
}

sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));
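	# Same index rotation as in ROUND above, but here each of
	# %v0-%v15 holds one of the 16 state words across the four
	# blocks processed in parallel (4x"vertical").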

	vaf	(@x[$a0],@x[$a0],@x[$b0]);	# Q1
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],16);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);	# Q2
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],16);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);	# Q3
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],16);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);	# Q4
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],16);

	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],12);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],12);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],12);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],12);

	vaf	(@x[$a0],@x[$a0],@x[$b0]);
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],8);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],8);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],8);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],8);

	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],7);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],7);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],7);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],7);
}

sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=$_[24];

	vaf		(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx		(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf		(@d[$_],@d[$_],16) for (0..5);

	vaf		(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx		(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf		(@b[$_],@b[$_],12) for (0..5);

	vaf		(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx		(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf		(@d[$_],@d[$_],8) for (0..5);

	vaf		(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx		(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf		(@b[$_],@b[$_],7) for (0..5);

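	# Rotate the word lanes of 'b', 'c' and 'd' so that the next call
	# processes diagonals (after the even call) or restores column
	# order (after the odd call). vsldb with a doubled source operand
	# is a byte-wise rotate: 4, 8 and 12 rotate the four 32-bit lanes
	# by one, two and three positions respectively.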
	vsldb		(@c[$_],@c[$_],@c[$_],8) for (0..5);
	vsldb		(@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
	vsldb		(@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}

PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
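#
# As used here, counter[0] provides the 32-bit block counter (the only
# word that gets incremented, once per 64-byte block) and counter[1..3]
# the nonce.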
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL	("ChaCha20_ctr32");
TYPE	("ChaCha20_ctr32","\@function");
ALIGN	(32);
LABEL	("ChaCha20_ctr32");
	larl	("%r1","OPENSSL_s390xcap_P");

	lghi	("%r0",64);
&{$z?	\&ltgr:\&ltr}	($len,$len);		# len==0?
	bzr	("%r14");
	lg	("%r1","S390X_STFLE+16(%r1)");
&{$z?	\&clgr:\&clr}	($len,"%r0");
	jle	(".Lshort");

	tmhh	("%r1",0x4000);			# check for vector facility (STFLE bit 129)
	jnz	(".LChaCha20_ctr32_vx");

LABEL	(".Lshort");
&{$z?	\&aghi:\&ahi}	($len,-64);
&{$z?	\&lghi:\&lhi}	("%r1",-$frame);
&{$z?	\&stmg:\&stm}	("%r6","%r15","6*$SIZE_T($sp)");
&{$z?	\&slgr:\&slr}	($out,$inp);	# difference
	la	($len,"0($inp,$len)");	# end of input minus 64
	larl	("%r7",".Lsigma");
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");

	lmg	("%r8","%r11","0($key)");	# load key
	lmg	("%r12","%r13","0($counter)");	# load counter
	lmg	("%r6","%r7","0(%r7)");	# load sigma constant

	la	("%r14","0($inp)");
&{$z?	\&stg:\&st}	($out,"$frame+3*$SIZE_T($sp)");
&{$z?	\&stg:\&st}	($len,"$frame+4*$SIZE_T($sp)");
	stmg	("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
	srlg	(@x[12],"%r12",32);	# 32-bit counter value
	j	(".Loop_outer");

ALIGN	(16);
LABEL	(".Loop_outer");
	lm	(@x[0],@x[7],"$stdframe+4*0($sp)");	# load x[0]-x[7]
	lm	(@t[0],@t[1],"$stdframe+4*10($sp)");	# load x[10]-x[11]
	lm	(@x[13],@x[15],"$stdframe+4*13($sp)");	# load x[13]-x[15]
	stm	(@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
	lm	(@t[0],@t[1],"$stdframe+4*8($sp)");	# load x[8]-x[9]
	st	(@x[12],"$stdframe+4*12($sp)");	# save counter
&{$z?	\&stg:\&st}	("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
	lhi	("%r14",10);
	j	(".Loop");

ALIGN	(4);
LABEL	(".Loop");
	ROUND	(0, 4, 8,12);
	ROUND	(0, 5,10,15);
	brct	("%r14",".Loop");

&{$z?	\&lg:\&l}	("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
	stm	(@t[0],@t[1],"$stdframe+4*8+4*8($sp)");	# offload x[8]-x[9]
&{$z?	\&lmg:\&lm}	(@t[0],@t[1],"$frame+3*$SIZE_T($sp)");

	al	(@x[0],"$stdframe+4*0($sp)");	# accumulate key schedule
	al	(@x[1],"$stdframe+4*1($sp)");
	al	(@x[2],"$stdframe+4*2($sp)");
	al	(@x[3],"$stdframe+4*3($sp)");
	al	(@x[4],"$stdframe+4*4($sp)");
	al	(@x[5],"$stdframe+4*5($sp)");
	al	(@x[6],"$stdframe+4*6($sp)");
	al	(@x[7],"$stdframe+4*7($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	lrvr	(@x[4],@x[4]);
	lrvr	(@x[5],@x[5]);
	lrvr	(@x[6],@x[6]);
	lrvr	(@x[7],@x[7]);
	al	(@x[12],"$stdframe+4*12($sp)");
	al	(@x[13],"$stdframe+4*13($sp)");
	al	(@x[14],"$stdframe+4*14($sp)");
	al	(@x[15],"$stdframe+4*15($sp)");
	lrvr	(@x[12],@x[12]);
	lrvr	(@x[13],@x[13]);
	lrvr	(@x[14],@x[14]);
	lrvr	(@x[15],@x[15]);

	la	(@t[0],"0(@t[0],%r14)");	# reconstruct output pointer
&{$z?	\&clgr:\&clr}	("%r14",@t[1]);
	jh	(".Ltail");

	x	(@x[0],"4*0(%r14)");	# xor with input
	x	(@x[1],"4*1(%r14)");
	st	(@x[0],"4*0(@t[0])");	# store output
	x	(@x[2],"4*2(%r14)");
	st	(@x[1],"4*1(@t[0])");
	x	(@x[3],"4*3(%r14)");
	st	(@x[2],"4*2(@t[0])");
	x	(@x[4],"4*4(%r14)");
	st	(@x[3],"4*3(@t[0])");
	 lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");	# load x[8]-x[11]
	x	(@x[5],"4*5(%r14)");
	st	(@x[4],"4*4(@t[0])");
	x	(@x[6],"4*6(%r14)");
	 al	(@x[0],"$stdframe+4*8($sp)");
	st	(@x[5],"4*5(@t[0])");
	x	(@x[7],"4*7(%r14)");
	 al	(@x[1],"$stdframe+4*9($sp)");
	st	(@x[6],"4*6(@t[0])");
	x	(@x[12],"4*12(%r14)");
	 al	(@x[2],"$stdframe+4*10($sp)");
	st	(@x[7],"4*7(@t[0])");
	x	(@x[13],"4*13(%r14)");
	 al	(@x[3],"$stdframe+4*11($sp)");
	st	(@x[12],"4*12(@t[0])");
	x	(@x[14],"4*14(%r14)");
	st	(@x[13],"4*13(@t[0])");
	x	(@x[15],"4*15(%r14)");
	st	(@x[14],"4*14(@t[0])");
	 lrvr	(@x[0],@x[0]);
	st	(@x[15],"4*15(@t[0])");
	 lrvr	(@x[1],@x[1]);
	 lrvr	(@x[2],@x[2]);
	 lrvr	(@x[3],@x[3]);
	lhi	(@x[12],1);
	 x	(@x[0],"4*8(%r14)");
	al	(@x[12],"$stdframe+4*12($sp)");	# increment counter
	 x	(@x[1],"4*9(%r14)");
	 st	(@x[0],"4*8(@t[0])");
	 x	(@x[2],"4*10(%r14)");
	 st	(@x[1],"4*9(@t[0])");
	 x	(@x[3],"4*11(%r14)");
	 st	(@x[2],"4*10(@t[0])");
	 st	(@x[3],"4*11(@t[0])");

&{$z?	\&clgr:\&clr}	("%r14",@t[1]);	# done yet?
	la	("%r14","64(%r14)");
	jl	(".Loop_outer");

LABEL	(".Ldone");
	xgr	("%r0","%r0");
	xgr	("%r1","%r1");
	xgr	("%r2","%r2");
	xgr	("%r3","%r3");
	stmg	("%r0","%r3","$stdframe+4*4($sp)");	# wipe key copy
	stmg	("%r0","%r3","$stdframe+4*12($sp)");

&{$z?	\&lmg:\&lm}	("%r6","%r15","$frame+6*$SIZE_T($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail");
	la	(@t[1],"64($t[1])");
	stm	(@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z?	\&slgr:\&slr}	(@t[1],"%r14");
	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z?	\&lghi:\&lhi}	(@x[6],0);
	stm	(@x[12],@x[15],"$stdframe+4*12($sp)");
	al	(@x[0],"$stdframe+4*8($sp)");
	al	(@x[1],"$stdframe+4*9($sp)");
	al	(@x[2],"$stdframe+4*10($sp)");
	al	(@x[3],"$stdframe+4*11($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	stm	(@x[0],@x[3],"$stdframe+4*8($sp)");

LABEL	(".Loop_tail");
	llgc	(@x[4],"0(@x[6],%r14)");
	llgc	(@x[5],"$stdframe(@x[6],$sp)");
	xr	(@x[5],@x[4]);
	stc	(@x[5],"0(@x[6],@t[0])");
	la	(@x[6],"1(@x[6])");
	brct	(@t[1],".Loop_tail");

	j	(".Ldone");
SIZE	("ChaCha20_ctr32",".-ChaCha20_ctr32");
}

########################################################################
# The 4x"vertical" layout minimizes the instruction count, but the
# pipeline runs underutilized [because of the vector instructions' high
# latency]. The 6x"horizontal" code further below, on the other hand,
# needs more data to fully utilize the pipeline, so short inputs would
# effectively be processed slower by it. Hence this code path, which
# targets lengths of at most 256 bytes.
#
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

ALIGN	(32);
LABEL	("ChaCha20_ctr32_4x");
LABEL	(".LChaCha20_ctr32_4x");
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	std	("%f8","$stdframe+8*0($sp)");
	std	("%f9","$stdframe+8*1($sp)");
	std	("%f10","$stdframe+8*2($sp)");
	std	("%f11","$stdframe+8*3($sp)");
	std	("%f12","$stdframe+8*4($sp)");
	std	("%f13","$stdframe+8*5($sp)");
	std	("%f14","$stdframe+8*6($sp)");
	std	("%f15","$stdframe+8*7($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);
	lhi	("%r1",0);

	vl	(@K[0],"0(%r7)");		# load sigma
	vl	(@K[1],"0($key)");		# load key
	vl	(@K[2],"16($key)");
	vl	(@K[3],"0($counter)");		# load counter

	vl	($beperm,"0x40(%r7)");
	vl	($xt1,"0x50(%r7)");
	vrepf	($CTR,@K[3],0);
	vlvgf	(@K[3],"%r1",0);		# clear @K[3].word[0]
	vaf	($CTR,$CTR,$xt1);
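	# $CTR now holds (n,n+1,n+2,n+3), the per-lane 32-bit block
	# counters for the four blocks processed in parallel.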

#LABEL	(".Loop_outer_4x");
	vlm	($xa0,$xa3,"0x60(%r7)");	# load [smashed] sigma

	vrepf	($xb0,@K[1],0);			# smash the key
	vrepf	($xb1,@K[1],1);
	vrepf	($xb2,@K[1],2);
	vrepf	($xb3,@K[1],3);

	vrepf	($xc0,@K[2],0);
	vrepf	($xc1,@K[2],1);
	vrepf	($xc2,@K[2],2);
	vrepf	($xc3,@K[2],3);

	vlr	($xd0,$CTR);
	vrepf	($xd1,@K[3],1);
	vrepf	($xd2,@K[3],2);
	vrepf	($xd3,@K[3],3);

LABEL	(".Loop_4x");
	VX_lane_ROUND(0, 4, 8,12);
	VX_lane_ROUND(0, 5,10,15);
	brct	("%r0",".Loop_4x");

	vaf	($xd0,$xd0,$CTR);

	vmrhf	($xt0,$xa0,$xa1);		# transpose data
	vmrhf	($xt1,$xa2,$xa3);
	vmrlf	($xt2,$xa0,$xa1);
	vmrlf	($xt3,$xa2,$xa3);
	vpdi	($xa0,$xt0,$xt1,0b0000);
	vpdi	($xa1,$xt0,$xt1,0b0101);
	vpdi	($xa2,$xt2,$xt3,0b0000);
	vpdi	($xa3,$xt2,$xt3,0b0101);
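	# (vmrhf/vmrlf interleave the high/low word pairs and vpdi picks
	#  doubleword halves; together they transpose each group of four
	#  registers from word-sliced back to per-block order)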

	vmrhf	($xt0,$xb0,$xb1);
	vmrhf	($xt1,$xb2,$xb3);
	vmrlf	($xt2,$xb0,$xb1);
	vmrlf	($xt3,$xb2,$xb3);
	vpdi	($xb0,$xt0,$xt1,0b0000);
	vpdi	($xb1,$xt0,$xt1,0b0101);
	vpdi	($xb2,$xt2,$xt3,0b0000);
	vpdi	($xb3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xc0,$xc1);
	vmrhf	($xt1,$xc2,$xc3);
	vmrlf	($xt2,$xc0,$xc1);
	vmrlf	($xt3,$xc2,$xc3);
	vpdi	($xc0,$xt0,$xt1,0b0000);
	vpdi	($xc1,$xt0,$xt1,0b0101);
	vpdi	($xc2,$xt2,$xt3,0b0000);
	vpdi	($xc3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xd0,$xd1);
	vmrhf	($xt1,$xd2,$xd3);
	vmrlf	($xt2,$xd0,$xd1);
	vmrlf	($xt3,$xd2,$xd3);
	vpdi	($xd0,$xt0,$xt1,0b0000);
	vpdi	($xd1,$xt0,$xt1,0b0101);
	vpdi	($xd2,$xt2,$xt3,0b0000);
	vpdi	($xd3,$xt2,$xt3,0b0101);

	#vrepif	($xt0,4);
	#vaf	($CTR,$CTR,$xt0);		# next counter value

	vaf	($xa0,$xa0,@K[0]);
	vaf	($xb0,$xb0,@K[1]);
	vaf	($xc0,$xc0,@K[2]);
	vaf	($xd0,$xd0,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	#&{$z?	\&clgfi:\&clfi} ($len,0x40);
	#jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	#je	(".Ldone_4x");

	vaf	($xa0,$xa1,@K[0]);
	vaf	($xb0,$xb1,@K[1]);
	vaf	($xc0,$xc1,@K[2]);
	vaf	($xd0,$xd1,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	vaf	($xa0,$xa2,@K[0]);
	vaf	($xb0,$xb2,@K[1]);
	vaf	($xc0,$xc2,@K[2]);
	vaf	($xd0,$xd2,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	vaf	($xa0,$xa3,@K[0]);
	vaf	($xb0,$xb3,@K[1]);
	vaf	($xc0,$xc3,@K[2]);
	vaf	($xd0,$xd3,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	#la	($inp,"0x40($inp)");
	#la	($out,"0x40($out)");
	#lhi	("%r0",10);
	#&{$z?	\&aghi:\&ahi}	($len,-0x40);
	#jne	(".Loop_outer_4x");

LABEL	(".Ldone_4x");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_4x");
if (!$z) {
	vlr	($xt0,$xb0);
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xt0,"$stdframe+0x10($sp)");
	vst	($xc0,"$stdframe+0x20($sp)");
	vst	($xd0,"$stdframe+0x30($sp)");
} else {
	vlr	($xt0,$xc0);
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	vlr	($xt1,$xd0);
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xb0,"$stdframe+0x10($sp)");
	vst	($xt0,"$stdframe+0x20($sp)");
	vst	($xt1,"$stdframe+0x30($sp)");
}
	lghi	("%r1",0);

LABEL	(".Loop_tail_4x");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_4x");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}

########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instructions'
# latency. The computational part of an 8x"vertical" layout would be
# faster, but it consumes all vector registers, and dealing with that
# would diminish the return...
#
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
my $FRAME=$stdframe + 4*16;

GLOBL	("ChaCha20_ctr32_vx");
ALIGN	(32);
LABEL	("ChaCha20_ctr32_vx");
LABEL	(".LChaCha20_ctr32_vx");
&{$z?	\&clgfi:\&clfi}	($len,256);
	jle	(".LChaCha20_ctr32_4x");
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	std	("%f8","$FRAME-8*8($sp)");
	std	("%f9","$FRAME-8*7($sp)");
	std	("%f10","$FRAME-8*6($sp)");
	std	("%f11","$FRAME-8*5($sp)");
	std	("%f12","$FRAME-8*4($sp)");
	std	("%f13","$FRAME-8*3($sp)");
	std	("%f14","$FRAME-8*2($sp)");
	std	("%f15","$FRAME-8*1($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);

	vlm	(@K[1],@K[2],"0($key)");	# load key
	vl	(@K[3],"0($counter)");		# load counter

	vlm	(@K[0],"$beperm","0(%r7)");	# load sigma, increments, ...

LABEL	(".Loop_outer_vx");
	vlr	($a0,@K[0]);
	vlr	($b0,@K[1]);
	vlr	($a1,@K[0]);
	vlr	($b1,@K[1]);
	vlr	($a2,@K[0]);
	vlr	($b2,@K[1]);
	vlr	($a3,@K[0]);
	vlr	($b3,@K[1]);
	vlr	($a4,@K[0]);
	vlr	($b4,@K[1]);
	vlr	($a5,@K[0]);
	vlr	($b5,@K[1]);

	vlr	($d0,@K[3]);
	vaf	($d1,@K[3],$t1);		# K[3]+1
	vaf	($d2,@K[3],$t2);		# K[3]+2
	vaf	($d3,@K[3],$t3);		# K[3]+3
	vaf	($d4,$d2,$t2);			# K[3]+4
	vaf	($d5,$d2,$t3);			# K[3]+5

	vlr	($c0,@K[2]);
	vlr	($c1,@K[2]);
	vlr	($c2,@K[2]);
	vlr	($c3,@K[2]);
	vlr	($c4,@K[2]);
	vlr	($c5,@K[2]);

	vlr	($t1,$d1);
	vlr	($t2,$d2);
	vlr	($t3,$d3);

ALIGN	(4);
LABEL	(".Loop_vx");

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 0);

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 1);

	brct	("%r0",".Loop_vx");

	vaf	($a0,$a0,@K[0]);
	vaf	($b0,$b0,@K[1]);
	vaf	($c0,$c0,@K[2]);
	vaf	($d0,$d0,@K[3]);
	vaf	($a1,$a1,@K[0]);
	vaf	($d1,$d1,$t1);			# +K[3]+1

	vperm	($a0,$a0,$a0,$beperm);
	vperm	($b0,$b0,$b0,$beperm);
	vperm	($c0,$c0,$c0,$beperm);
	vperm	($d0,$d0,$d0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vaf	($d2,$d2,$t2);			# +K[3]+2
	vaf	($d3,$d3,$t3);			# +K[3]+3
	vlm	($t0,$t3,"0($inp)");

	vx	($a0,$a0,$t0);
	vx	($b0,$b0,$t1);
	vx	($c0,$c0,$t2);
	vx	($d0,$d0,$t3);

	vlm	(@K[0],$t3,"0(%r7)");		# re-load sigma and increments

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($b1,$b1,@K[1]);
	vaf	($c1,$c1,@K[2]);

	vperm	($a0,$a1,$a1,$beperm);
	vperm	($b0,$b1,$b1,$beperm);
	vperm	($c0,$c1,$c1,$beperm);
	vperm	($d0,$d1,$d1,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a2,$a2,@K[0]);
	vaf	($b2,$b2,@K[1]);
	vaf	($c2,$c2,@K[2]);

	vperm	($a0,$a2,$a2,$beperm);
	vperm	($b0,$b2,$b2,$beperm);
	vperm	($c0,$c2,$c2,$beperm);
	vperm	($d0,$d2,$d2,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a3,$a3,@K[0]);
	vaf	($b3,$b3,@K[1]);
	vaf	($c3,$c3,@K[2]);
	vaf	($d2,@K[3],$t3);		# K[3]+3

	vperm	($a0,$a3,$a3,$beperm);
	vperm	($b0,$b3,$b3,$beperm);
	vperm	($c0,$c3,$c3,$beperm);
	vperm	($d0,$d3,$d3,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vaf	($d3,$d2,$t1);			# K[3]+4
	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a4,$a4,@K[0]);
	vaf	($b4,$b4,@K[1]);
	vaf	($c4,$c4,@K[2]);
	vaf	($d4,$d4,$d3);			# +K[3]+4
	vaf	($d3,$d3,$t1);			# K[3]+5
	vaf	(@K[3],$d2,$t3);		# K[3]+=6

	vperm	($a0,$a4,$a4,$beperm);
	vperm	($b0,$b4,$b4,$beperm);
	vperm	($c0,$c4,$c4,$beperm);
	vperm	($d0,$d4,$d4,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a5,$a5,@K[0]);
	vaf	($b5,$b5,@K[1]);
	vaf	($c5,$c5,@K[2]);
	vaf	($d5,$d5,$d3);			# +K[3]+5

	vperm	($a0,$a5,$a5,$beperm);
	vperm	($b0,$b5,$b5,$beperm);
	vperm	($c0,$c5,$c5,$beperm);
	vperm	($d0,$d5,$d5,$beperm);

&{$z?	\&clgfi:\&clfi} ($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	lhi	("%r0",10);
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	jne	(".Loop_outer_vx");

LABEL	(".Ldone_vx");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_vx");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
	vstm	($a0,$d0,"$stdframe($sp)");
	lghi	("%r1",0);

LABEL	(".Loop_tail_vx");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_vx");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################

ALIGN	(32);
LABEL	(".Lsigma");
LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma
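# (these four words spell "expand 32-byte k" when read as
#  little-endian 32-bit integers)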
LONG	(1,0,0,0);
LONG	(2,0,0,0);
LONG	(3,0,0,0);
LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap

LONG	(0,1,2,3);
LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma
LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN	(4);

PERLASM_END();