xref: /openssl/crypto/aes/asm/aes-s390x.pl (revision 33388b44)
1#! /usr/bin/env perl
2# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# AES for s390x.
18
19# April 2007.
20#
21# Software performance improvement over gcc-generated code is ~70% and
22# in absolute terms is ~73 cycles per byte processed with 128-bit key.
23# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
24# *strictly* in-order execution and issued instruction [in this case
25# load value from memory is critical] has to complete before execution
26# flow proceeds. S-boxes are compressed to 2KB[+256B].
27#
28# As for hardware acceleration support. It's basically a "teaser," as
29# it can and should be improved in several ways. Most notably support
30# for CBC is not utilized, nor multiple blocks are ever processed.
31# Then software key schedule can be postponed till hardware support
32# detection... Performance improvement over assembler is reportedly
33# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
34# support is implemented.
35
36# May 2007.
37#
38# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
39# for 128-bit keys, if hardware support is detected.
40
41# January 2009.
42#
43# Add support for hardware AES192/256 and reschedule instructions to
44# minimize/avoid Address Generation Interlock hazard and to favour
45# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
46# almost 50% on z9. The gain is smaller on z10, because being dual-
47# issue z10 makes it impossible to eliminate the interlock condition:
48# critical path is not long enough. Yet it spends ~24 cycles per byte
49# processed with 128-bit key.
50#
51# Unlike previous version hardware support detection takes place only
52# at the moment of key schedule setup, which is denoted in key->rounds.
53# This is done, because deferred key setup can't be made MT-safe, not
54# for keys longer than 128 bits.
55#
56# Add AES_cbc_encrypt, which gives incredible performance improvement,
57# it was measured to be ~6.6x. It's less than previously mentioned 8x,
58# because software implementation was optimized.
59
60# May 2010.
61#
62# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
63# performance improvement over "generic" counter mode routine relying
64# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
65# to the fact that exact throughput value depends on current stack
66# frame alignment within 4KB page. In worst case you get ~75% of the
67# maximum, but *on average* it would be as much as ~98%. Meaning that
# worst case is unlikely, it's like hitting ravine on plateau.
69
70# November 2010.
71#
72# Adapt for -m31 build. If kernel supports what's called "highgprs"
73# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
74# instructions and achieve "64-bit" performance even in 31-bit legacy
75# application context. The feature is not specific to any particular
76# processor, as long as it's "z-CPU". Latter implies that the code
77# remains z/Architecture specific. On z990 it was measured to perform
78# 2x better than code generated by gcc 4.3.
79
80# December 2010.
81#
82# Add support for z196 "cipher message with counter" instruction.
83# Note however that it's disengaged, because it was measured to
84# perform ~12% worse than vanilla km-based code...
85
86# February 2011.
87#
88# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
89# instructions, which deliver ~70% improvement at 8KB block size over
90# vanilla km-based code, 37% - at most like 512-bytes block size.
91
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# 31-bit flavours use 4-byte size_t and drop the "g" suffix from the
# 64-bit load/store mnemonics; everything else is 64-bit.
if (defined($flavour) && $flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

# Send the generated assembly to the requested file (three-argument
# open, with an error check: silently writing nowhere corrupts builds).
if (defined($output)) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
106
# $softonly=0 permits use of the CPACF km/kmc hardware paths emitted
# below; set to 1 to force the pure software implementation.
$softonly=0;	# allow hardware support

# Register map.  %r0..%r3 are scratch/argument registers, hence the
# aliased names; %r6..%r13 are the non-volatile working set.
($t0,$t1,$t2,$t3)=map {"%r$_"} (0..3);
$mask=$t0;				# shares %r0 with $t0
$inp=$t2;				# shares %r2 with $t2
$out=$t3;	$bits=$t3;		# both share %r3 with $t3
$key="%r4";
($i1,$i2,$i3)=map {"%r$_"} (5..7);
($s0,$s1,$s2,$s3)=map {"%r$_"} (8..11);	# AES state words
$tbl="%r12";				# table base (AES_Te/AES_Td)
$rounds="%r13";
$ra="%r14";				# return address
$sp="%r15";				# stack pointer

# Minimal stack frame: 16 pointer-sized save slots plus 4 eight-byte slots.
$stdframe=16*$SIZE_T+4*8;
127
# Append each argument to $code as a ".long w,w" pair, i.e. every
# 32-bit table word is emitted twice.  The software AES code indexes
# the tables with a byte value shifted left by 3 (8-byte stride), so
# duplicating each word lets one table serve several byte positions.
#
# Note: the original declared this as "sub _data_word()" with an empty
# prototype even though it is always called with a value list; callers
# only worked because "&_data_word(...)" bypasses prototype checking.
# The prototype is dropped here — "&"-style calls remain valid.
sub _data_word
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}
132
# Start of the generated assembly: the combined encryption table
# AES_Te, 256-byte aligned.  It is followed (next statements) by the
# doubled 32-bit T-table words, the plain byte S-box Te4 and rcon[].
$code=<<___;
#include "s390x_arch.h"

.text

.type	AES_Te,\@object
.align	256
AES_Te:
___
# 256 encryption T-table words (S-box output combined with the
# MixColumns coefficients); _data_word emits each word twice so a
# byte index scaled by <<3 (8-byte stride) addresses the table.
&_data_word(
	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Te4: the plain 256-byte S-box (used by the final encryption round
# and the key schedule), then the rcon[] round constants, closing the
# AES_Te object.  The AES_encrypt entry point follows.
$code.=<<___;
# Te4[256]
.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
# rcon[]
.long	0x01000000, 0x02000000, 0x04000000, 0x08000000
.long	0x10000000, 0x20000000, 0x40000000, 0x80000000
.long	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
.align	256
.size	AES_Te,.-AES_Te

# void AES_encrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_encrypt
.type	AES_encrypt,\@function
AES_encrypt:
___
# Hardware path: key->rounds (offset 240) holds a km function code
# (>=16) when key setup detected CPACF support, or the software round
# count (10/12/14, <16) otherwise.  KM takes the function code in %r0,
# the parameter block (key) address in %r1, length in %r3 and the
# out/in operand pair in %r4/%r2; brc 1 retries on partial completion.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lesoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Lesoft:
___
# Software AES_encrypt wrapper plus the inner round routine.  The
# wrapper loads the block as four big-endian words, calls
# _s390x_AES_encrypt with $tbl -> AES_Te, and stores the result.
# _s390x_AES_encrypt runs rounds-1 table rounds in .Lenc_loop (indices
# pre-scaled by <<3 via $mask = 0xff<<3 to match the 8-byte table
# stride; displacements 0..3 select the rotated word copies), then the
# final round fetches single S-box bytes with llgc from offset 2 of
# the 8-byte entries and recombines them with shifts and ors.
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_encrypt,.-AES_encrypt

.type   _s390x_AES_encrypt,\@function
.align	16
_s390x_AES_encrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Lenc_loop
.align	16
.Lenc_loop:
	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	ngr	$t1,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	ngr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Te0[s0>>24]
	l	$t1,1($t1,$tbl)	# Te3[s0>>0]
	l	$t2,2($t2,$tbl) # Te2[s0>>8]
	l	$t3,3($t3,$tbl)	# Te1[s0>>16]

	x	$s0,3($i1,$tbl)	# Te1[s1>>16]
	l	$s1,0($s1,$tbl)	# Te0[s1>>24]
	x	$t2,1($i2,$tbl)	# Te3[s1>>0]
	x	$t3,2($i3,$tbl)	# Te2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	nr	$s2,$mask
	ngr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	sllg	$t1,$s3,`0+3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	ngr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Te2[s2>>8]
	x	$s1,3($i2,$tbl)	# Te1[s2>>16]
	l	$s2,0($s2,$tbl)	# Te0[s2>>24]
	x	$t3,1($i3,$tbl)	# Te3[s2>>0]

	srlg	$i3,$s3,`16-3`	# i2
	xr	$s2,$t2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,1($t1,$tbl)	# Te3[s3>>0]
	x	$s1,2($ra,$tbl)	# Te2[s3>>8]
	x	$s2,3($i3,$tbl)	# Te1[s3>>16]
	l	$s3,0($s3,$tbl)	# Te0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Lenc_loop
	.align	16

	sllg	$t1,$s0,`0+3`
	srlg	$t2,$s0,`8-3`
	ngr	$t1,$mask
	srlg	$t3,$s0,`16-3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t2,$mask
	nr	$t3,$mask

	srlg	$i1,$s1,`16-3`	# i0
	sllg	$i2,$s1,`0+3`
	ngr	$i2,$mask
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	nr	$i1,$mask
	nr	$s1,$mask
	nr	$i3,$mask

	llgc	$s0,2($s0,$tbl)	# Te4[s0>>24]
	llgc	$t1,2($t1,$tbl)	# Te4[s0>>0]
	sll	$s0,24
	llgc	$t2,2($t2,$tbl)	# Te4[s0>>8]
	llgc	$t3,2($t3,$tbl)	# Te4[s0>>16]
	sll	$t2,8
	sll	$t3,16

	llgc	$i1,2($i1,$tbl)	# Te4[s1>>16]
	llgc	$s1,2($s1,$tbl)	# Te4[s1>>24]
	llgc	$i2,2($i2,$tbl)	# Te4[s1>>0]
	llgc	$i3,2($i3,$tbl)	# Te4[s1>>8]
	sll	$i1,16
	sll	$s1,24
	sll	$i3,8
	or	$s0,$i1
	or	$s1,$t1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,`8-3`	# i0
	srlg	$i2,$s2,`16-3`	# i1
	nr	$i1,$mask
	nr	$i2,$mask
	sllg	$i3,$s2,`0+3`
	srl	$s2,`24-3`
	ngr	$i3,$mask
	nr	$s2,$mask

	sllg	$t1,$s3,`0+3`	# i0
	srlg	$ra,$s3,`8-3`	# i1
	ngr	$t1,$mask

	llgc	$i1,2($i1,$tbl)	# Te4[s2>>8]
	llgc	$i2,2($i2,$tbl)	# Te4[s2>>16]
	sll	$i1,8
	llgc	$s2,2($s2,$tbl)	# Te4[s2>>24]
	llgc	$i3,2($i3,$tbl)	# Te4[s2>>0]
	sll	$i2,16
	nr	$ra,$mask
	sll	$s2,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$t3,$i3

	srlg	$i3,$s3,`16-3`	# i2
	srl	$s3,`24-3`
	nr	$i3,$mask
	nr	$s3,$mask

	l	$t0,16($key)
	l	$t2,20($key)

	llgc	$i1,2($t1,$tbl)	# Te4[s3>>0]
	llgc	$i2,2($ra,$tbl)	# Te4[s3>>8]
	llgc	$i3,2($i3,$tbl)	# Te4[s3>>16]
	llgc	$s3,2($s3,$tbl)	# Te4[s3>>24]
	sll	$i2,8
	sll	$i3,16
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$i3
	or	$s3,$t3

	l${g}	$ra,15*$SIZE_T($sp)
	xr	$s0,$t0
	xr	$s1,$t2
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_encrypt,.-_s390x_AES_encrypt
___
467
# Combined decryption table AES_Td, laid out like AES_Te: doubled
# 32-bit T-table words (next statement), then the byte S-box Td4.
$code.=<<___;
.type	AES_Td,\@object
.align	256
AES_Td:
___
# 256 decryption T-table words, each emitted twice by _data_word for
# the same 8-byte-stride indexing scheme as the encryption table.
&_data_word(
	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Td4: inverse S-box bytes (used by the final decryption round),
# closing the AES_Td object.  The AES_decrypt entry point follows.
$code.=<<___;
# Td4[256]
.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size	AES_Td,.-AES_Td

# void AES_decrypt(const unsigned char *inp, unsigned char *out,
# 		 const AES_KEY *key) {
.globl	AES_decrypt
.type	AES_decrypt,\@function
AES_decrypt:
___
# Hardware path, mirroring AES_encrypt: key->rounds < 16 means a
# software round count, so fall through to .Ldsoft; otherwise it is a
# km function code (with the decrypt bit set by AES_set_decrypt_key)
# passed to KM in %r0, key in %r1, length in %r3, out/in in %r4/%r2.
$code.=<<___ if (!$softonly);
	l	%r0,240($key)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Ldsoft

	la	%r1,0($key)
	#la	%r2,0($inp)
	la	%r4,0($out)
	lghi	%r3,16		# single block length
	.long	0xb92e0042	# km %r4,%r2
	brc	1,.-4		# can this happen?
	br	%r14
.align	64
.Ldsoft:
___
# Software AES_decrypt wrapper plus the inner round routine, the
# mirror image of the encryption pair: $tbl -> AES_Td, rounds-1 table
# rounds in .Ldec_loop with <<3-scaled indices, then a final round
# that switches $mask to plain 0xff and reads inverse S-box bytes
# from Td4 at offset 2048($tbl) (the four loads before it prefetch
# Td4 cache lines).
$code.=<<___;
	stm${g}	%r3,$ra,3*$SIZE_T($sp)

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)

	larl	$tbl,AES_Td
	bras	$ra,_s390x_AES_decrypt

	l${g}	$out,3*$SIZE_T($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_decrypt,.-AES_decrypt

.type   _s390x_AES_decrypt,\@function
.align	16
_s390x_AES_decrypt:
	st${g}	$ra,15*$SIZE_T($sp)
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$s3,12($key)
	l	$rounds,240($key)
	llill	$mask,`0xff<<3`
	aghi	$rounds,-1
	j	.Ldec_loop
.align	16
.Ldec_loop:
	srlg	$t1,$s0,`16-3`
	srlg	$t2,$s0,`8-3`
	sllg	$t3,$s0,`0+3`
	srl	$s0,`24-3`
	nr	$s0,$mask
	nr	$t1,$mask
	nr	$t2,$mask
	ngr	$t3,$mask

	sllg	$i1,$s1,`0+3`	# i0
	srlg	$i2,$s1,`16-3`
	srlg	$i3,$s1,`8-3`
	srl	$s1,`24-3`
	ngr	$i1,$mask
	nr	$s1,$mask
	nr	$i2,$mask
	nr	$i3,$mask

	l	$s0,0($s0,$tbl)	# Td0[s0>>24]
	l	$t1,3($t1,$tbl)	# Td1[s0>>16]
	l	$t2,2($t2,$tbl)	# Td2[s0>>8]
	l	$t3,1($t3,$tbl)	# Td3[s0>>0]

	x	$s0,1($i1,$tbl)	# Td3[s1>>0]
	l	$s1,0($s1,$tbl)	# Td0[s1>>24]
	x	$t2,3($i2,$tbl)	# Td1[s1>>16]
	x	$t3,2($i3,$tbl)	# Td2[s1>>8]

	srlg	$i1,$s2,`8-3`	# i0
	sllg	$i2,$s2,`0+3`	# i1
	srlg	$i3,$s2,`16-3`
	srl	$s2,`24-3`
	nr	$i1,$mask
	ngr	$i2,$mask
	nr	$s2,$mask
	nr	$i3,$mask

	xr	$s1,$t1
	srlg	$ra,$s3,`8-3`	# i1
	srlg	$t1,$s3,`16-3`	# i0
	nr	$ra,$mask
	la	$key,16($key)
	nr	$t1,$mask

	x	$s0,2($i1,$tbl)	# Td2[s2>>8]
	x	$s1,1($i2,$tbl)	# Td3[s2>>0]
	l	$s2,0($s2,$tbl)	# Td0[s2>>24]
	x	$t3,3($i3,$tbl)	# Td1[s2>>16]

	sllg	$i3,$s3,`0+3`	# i2
	srl	$s3,`24-3`
	ngr	$i3,$mask
	nr	$s3,$mask

	xr	$s2,$t2
	x	$s0,0($key)
	x	$s1,4($key)
	x	$s2,8($key)
	x	$t3,12($key)

	x	$s0,3($t1,$tbl)	# Td1[s3>>16]
	x	$s1,2($ra,$tbl)	# Td2[s3>>8]
	x	$s2,1($i3,$tbl)	# Td3[s3>>0]
	l	$s3,0($s3,$tbl)	# Td0[s3>>24]
	xr	$s3,$t3

	brct	$rounds,.Ldec_loop
	.align	16

	l	$t1,`2048+0`($tbl)	# prefetch Td4
	l	$t2,`2048+64`($tbl)
	l	$t3,`2048+128`($tbl)
	l	$i1,`2048+192`($tbl)
	llill	$mask,0xff

	srlg	$i3,$s0,24	# i0
	srlg	$t1,$s0,16
	srlg	$t2,$s0,8
	nr	$s0,$mask	# i3
	nr	$t1,$mask

	srlg	$i1,$s1,24
	nr	$t2,$mask
	srlg	$i2,$s1,16
	srlg	$ra,$s1,8
	nr	$s1,$mask	# i0
	nr	$i2,$mask
	nr	$ra,$mask

	llgc	$i3,2048($i3,$tbl)	# Td4[s0>>24]
	llgc	$t1,2048($t1,$tbl)	# Td4[s0>>16]
	llgc	$t2,2048($t2,$tbl)	# Td4[s0>>8]
	sll	$t1,16
	llgc	$t3,2048($s0,$tbl)	# Td4[s0>>0]
	sllg	$s0,$i3,24
	sll	$t2,8

	llgc	$s1,2048($s1,$tbl)	# Td4[s1>>0]
	llgc	$i1,2048($i1,$tbl)	# Td4[s1>>24]
	llgc	$i2,2048($i2,$tbl)	# Td4[s1>>16]
	sll	$i1,24
	llgc	$i3,2048($ra,$tbl)	# Td4[s1>>8]
	sll	$i2,16
	sll	$i3,8
	or	$s0,$s1
	or	$t1,$i1
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s2,8	# i0
	srlg	$i2,$s2,24
	srlg	$i3,$s2,16
	nr	$s2,$mask	# i1
	nr	$i1,$mask
	nr	$i3,$mask
	llgc	$i1,2048($i1,$tbl)	# Td4[s2>>8]
	llgc	$s1,2048($s2,$tbl)	# Td4[s2>>0]
	llgc	$i2,2048($i2,$tbl)	# Td4[s2>>24]
	llgc	$i3,2048($i3,$tbl)	# Td4[s2>>16]
	sll	$i1,8
	sll	$i2,24
	or	$s0,$i1
	sll	$i3,16
	or	$t2,$i2
	or	$t3,$i3

	srlg	$i1,$s3,16	# i0
	srlg	$i2,$s3,8	# i1
	srlg	$i3,$s3,24
	nr	$s3,$mask	# i2
	nr	$i1,$mask
	nr	$i2,$mask

	l${g}	$ra,15*$SIZE_T($sp)
	or	$s1,$t1
	l	$t0,16($key)
	l	$t1,20($key)

	llgc	$i1,2048($i1,$tbl)	# Td4[s3>>16]
	llgc	$i2,2048($i2,$tbl)	# Td4[s3>>8]
	sll	$i1,16
	llgc	$s2,2048($s3,$tbl)	# Td4[s3>>0]
	llgc	$s3,2048($i3,$tbl)	# Td4[s3>>24]
	sll	$i2,8
	sll	$s3,24
	or	$s0,$i1
	or	$s1,$i2
	or	$s2,$t2
	or	$s3,$t3

	xr	$s0,$t0
	xr	$s1,$t1
	x	$s2,24($key)
	x	$s3,28($key)

	br	$ra
.size	_s390x_AES_decrypt,.-_s390x_AES_decrypt
___
789
# AES_set_encrypt_key entry: argument validation only.  NULL inp/key
# returns -1 (.Lminus1); bits other than 128/192/256 return -2;
# otherwise fall through to .Lproceed.
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	AES_set_encrypt_key
.type	AES_set_encrypt_key,\@function
.align	16
AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
	lghi	$t0,0
	cl${g}r	$inp,$t0
	je	.Lminus1
	cl${g}r	$key,$t0
	je	.Lminus1

	lghi	$t0,128
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,192
	clr	$bits,$t0
	je	.Lproceed
	lghi	$t0,256
	clr	$bits,$t0
	je	.Lproceed
	lghi	%r2,-2
	br	%r14

.align	16
.Lproceed:
___
# Hardware detection: map bits to the km(c) AES function code
# (128/192/256 -> 18/19/20), then test the corresponding capability
# bit in both the KM and KMC facility words of OPENSSL_s390xcap_P.
# If either is missing, do the software schedule (.Lekey_internal);
# otherwise just copy the raw key and store the km(c) code in
# key->rounds (偏 240), which the encrypt/decrypt paths test for >=16.
$code.=<<___ if (!$softonly);
	# convert bits to km(c) code, [128,192,256]->[18,19,20]
	lhi	%r5,-128
	lhi	%r0,18
	ar	%r5,$bits
	srl	%r5,6
	ar	%r5,%r0

	larl	%r1,OPENSSL_s390xcap_P
	llihh	%r0,0x8000
	srlg	%r0,%r0,0(%r5)
	ng	%r0,S390X_KM(%r1)  # check availability of both km...
	ng	%r0,S390X_KMC(%r1) # ...and kmc support for given key length
	jz	.Lekey_internal

	lmg	%r0,%r1,0($inp)	# just copy 128 bits...
	stmg	%r0,%r1,0($key)
	lhi	%r0,192
	cr	$bits,%r0
	jl	1f
	lg	%r1,16($inp)
	stg	%r1,16($key)
	je	1f
	lg	%r1,24($inp)
	stg	%r1,24($key)
1:	st	$bits,236($key)	# save bits [for debugging purposes]
	lgr	$t0,%r5
	st	%r5,240($key)	# save km(c) code
	lghi	%r2,0
	br	%r14
___
# Software key expansion (.Lekey_internal) for 128/192/256-bit keys,
# using the byte S-box Te4 at AES_Te+2048 and rcon[] at +2048+256;
# key->rounds gets 10/12/14.  On success $t0 carries the round count
# back to the caller (used by AES_set_decrypt_key, which starts at the
# end of this fragment and tail-checks the return code in %r2).
$code.=<<___;
.align	16
.Lekey_internal:
	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key

	larl	$tbl,AES_Te+2048

	llgf	$s0,0($inp)
	llgf	$s1,4($inp)
	llgf	$s2,8($inp)
	llgf	$s3,12($inp)
	st	$s0,0($key)
	st	$s1,4($key)
	st	$s2,8($key)
	st	$s3,12($key)
	lghi	$t0,128
	cr	$bits,$t0
	jne	.Lnot128

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,10
	st	$rounds,240($key)

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t2,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L128_loop:
	la	$t2,0($t2,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t2,2,0($t2)		# Te4[rk[3]>>0]<<8
	icm	$t2,4,0($i1)		# Te4[rk[3]>>8]<<16
	icm	$t2,8,0($i2)		# Te4[rk[3]>>16]<<24
	icm	$t2,1,0($i3)		# Te4[rk[3]>>24]
	x	$t2,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t2			# rk[4]=rk[0]^...
	xr	$s1,$s0			# rk[5]=rk[1]^rk[4]
	xr	$s2,$s1			# rk[6]=rk[2]^rk[5]
	xr	$s3,$s2			# rk[7]=rk[3]^rk[6]

	llgfr	$t2,$s3			# temp=rk[3]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	nr	$t2,$mask
	nr	$i1,$mask
	srlg	$i3,$s3,24
	nr	$i2,$mask

	st	$s0,16($key)
	st	$s1,20($key)
	st	$s2,24($key)
	st	$s3,28($key)
	la	$key,16($key)		# key+=4
	la	$t3,4($t3)		# i++
	brct	$rounds,.L128_loop
	lghi	$t0,10
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.Lnot128:
	llgf	$t0,16($inp)
	llgf	$t1,20($inp)
	st	$t0,16($key)
	st	$t1,20($key)
	lghi	$t0,192
	cr	$bits,$t0
	jne	.Lnot192

	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,12
	st	$rounds,240($key)
	lghi	$rounds,8

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L192_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[5]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[5]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[5]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[5]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[6]=rk[0]^...
	xr	$s1,$s0			# rk[7]=rk[1]^rk[6]
	xr	$s2,$s1			# rk[8]=rk[2]^rk[7]
	xr	$s3,$s2			# rk[9]=rk[3]^rk[8]

	st	$s0,24($key)
	st	$s1,28($key)
	st	$s2,32($key)
	st	$s3,36($key)
	brct	$rounds,.L192_continue
	lghi	$t0,12
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L192_continue:
	lgr	$t1,$s3
	x	$t1,16($key)		# rk[10]=rk[4]^rk[9]
	st	$t1,40($key)
	x	$t1,20($key)		# rk[11]=rk[5]^rk[10]
	st	$t1,44($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,24($key)		# key+=6
	la	$t3,4($t3)		# i++
	j	.L192_loop

.align	16
.Lnot192:
	llgf	$t0,24($inp)
	llgf	$t1,28($inp)
	st	$t0,24($key)
	st	$t1,28($key)
	llill	$mask,0xff
	lghi	$t3,0			# i=0
	lghi	$rounds,14
	st	$rounds,240($key)
	lghi	$rounds,7

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

.align	16
.L256_loop:
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	icm	$t1,2,0($t1)		# Te4[rk[7]>>0]<<8
	icm	$t1,4,0($i1)		# Te4[rk[7]>>8]<<16
	icm	$t1,8,0($i2)		# Te4[rk[7]>>16]<<24
	icm	$t1,1,0($i3)		# Te4[rk[7]>>24]
	x	$t1,256($t3,$tbl)	# rcon[i]
	xr	$s0,$t1			# rk[8]=rk[0]^...
	xr	$s1,$s0			# rk[9]=rk[1]^rk[8]
	xr	$s2,$s1			# rk[10]=rk[2]^rk[9]
	xr	$s3,$s2			# rk[11]=rk[3]^rk[10]
	st	$s0,32($key)
	st	$s1,36($key)
	st	$s2,40($key)
	st	$s3,44($key)
	brct	$rounds,.L256_continue
	lghi	$t0,14
	lghi	%r2,0
	lm${g}	%r4,%r13,4*$SIZE_T($sp)
	br	$ra

.align	16
.L256_continue:
	lgr	$t1,$s3			# temp=rk[11]
	srlg	$i1,$s3,8
	srlg	$i2,$s3,16
	srlg	$i3,$s3,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask
	la	$t1,0($t1,$tbl)
	la	$i1,0($i1,$tbl)
	la	$i2,0($i2,$tbl)
	la	$i3,0($i3,$tbl)
	llgc	$t1,0($t1)		# Te4[rk[11]>>0]
	icm	$t1,2,0($i1)		# Te4[rk[11]>>8]<<8
	icm	$t1,4,0($i2)		# Te4[rk[11]>>16]<<16
	icm	$t1,8,0($i3)		# Te4[rk[11]>>24]<<24
	x	$t1,16($key)		# rk[12]=rk[4]^...
	st	$t1,48($key)
	x	$t1,20($key)		# rk[13]=rk[5]^rk[12]
	st	$t1,52($key)
	x	$t1,24($key)		# rk[14]=rk[6]^rk[13]
	st	$t1,56($key)
	x	$t1,28($key)		# rk[15]=rk[7]^rk[14]
	st	$t1,60($key)

	srlg	$i1,$t1,8
	srlg	$i2,$t1,16
	srlg	$i3,$t1,24
	nr	$t1,$mask
	nr	$i1,$mask
	nr	$i2,$mask

	la	$key,32($key)		# key+=8
	la	$t3,4($t3)		# i++
	j	.L256_loop

.Lminus1:
	lghi	%r2,-1
	br	$ra
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

# void AES_set_decrypt_key(const unsigned char *in, int bits,
# 		 AES_KEY *key) {
.globl	AES_set_decrypt_key
.type	AES_set_decrypt_key,\@function
.align	16
AES_set_decrypt_key:
	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
	bras	$ra,_s390x_AES_set_encrypt_key
	#l${g}	$key,4*$SIZE_T($sp)
	l${g}	$ra,14*$SIZE_T($sp)
	ltgr	%r2,%r2
	bnzr	$ra
___
# Hardware case: $t0 holds the value AES_set_encrypt_key left in
# key->rounds.  If it is a km(c) function code (>=16), just set the
# S390X_DECRYPT bit and store it back; software counts fall to .Lgo.
$code.=<<___ if (!$softonly);
	#l	$t0,240($key)
	lhi	$t1,16
	cr	$t0,$t1
	jl	.Lgo
	oill	$t0,S390X_DECRYPT	# set "decrypt" bit
	st	$t0,240($key)
	br	$ra
___
# Software decrypt key, step 1: reverse the order of the round keys.
# $i1 walks forward from the first 16-byte round key, $i2 backward
# from the last (key + rounds*16); .Linv swaps rounds/2 pairs.
$code.=<<___;
.align	16
.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
	la	$i1,0($key)
	sllg	$i2,$rounds,4
	la	$i2,0($i2,$key)
	srl	$rounds,1
	lghi	$t1,-16

.align	16
.Linv:	lmg	$s0,$s1,0($i1)
	lmg	$s2,$s3,0($i2)
	stmg	$s0,$s1,0($i2)
	stmg	$s2,$s3,0($i1)
	la	$i1,16($i1)
	la	$i2,0($t1,$i2)
	brct	$rounds,.Linv
___
# The index registers are free from here on, so reuse them for the three
# InvMixColumns bit-masks (0x80808080, 0x1b1b1b1b, 0xfefefefe, assembled
# by the llilh/oill pairs below).
$mask80=$i1;
$mask1b=$i2;
$maskfe=$i3;
# Software path, step 2: apply InvMixColumns to every round-key word except
# those of the first and last round -- (rounds-1)*4 words, one 32-bit word
# per .Lmix iteration starting at key+16.  tp2/tp4/tp8 are successive
# GF(2^8) doublings of tp1 (the and/shift/xor mask dance implements xtime
# on four packed bytes); the rll/xr tail combines their rotations per the
# inline ROTATE comments.  Finally restore %r6-%r13 (saved by
# AES_set_encrypt_key) and return 0.
$code.=<<___;
	llgf	$rounds,240($key)
	aghi	$rounds,-1
	sll	$rounds,2	# (rounds-1)*4
	llilh	$mask80,0x8080
	llilh	$mask1b,0x1b1b
	llilh	$maskfe,0xfefe
	oill	$mask80,0x8080
	oill	$mask1b,0x1b1b
	oill	$maskfe,0xfefe

.align	16
.Lmix:	l	$s0,16($key)	# tp1
	lr	$s1,$s0
	ngr	$s1,$mask80
	srlg	$t1,$s1,7
	slr	$s1,$t1
	nr	$s1,$mask1b
	sllg	$t1,$s0,1
	nr	$t1,$maskfe
	xr	$s1,$t1		# tp2

	lr	$s2,$s1
	ngr	$s2,$mask80
	srlg	$t1,$s2,7
	slr	$s2,$t1
	nr	$s2,$mask1b
	sllg	$t1,$s1,1
	nr	$t1,$maskfe
	xr	$s2,$t1		# tp4

	lr	$s3,$s2
	ngr	$s3,$mask80
	srlg	$t1,$s3,7
	slr	$s3,$t1
	nr	$s3,$mask1b
	sllg	$t1,$s2,1
	nr	$t1,$maskfe
	xr	$s3,$t1		# tp8

	xr	$s1,$s0		# tp2^tp1
	xr	$s2,$s0		# tp4^tp1
	rll	$s0,$s0,24	# = ROTATE(tp1,8)
	xr	$s2,$s3		# ^=tp8
	xr	$s0,$s1		# ^=tp2^tp1
	xr	$s1,$s3		# tp2^tp1^tp8
	xr	$s0,$s2		# ^=tp4^tp1^tp8
	rll	$s1,$s1,8
	rll	$s2,$s2,16
	xr	$s0,$s1		# ^= ROTATE(tp8^tp2^tp1,24)
	rll	$s3,$s3,24
	xr	$s0,$s2    	# ^= ROTATE(tp8^tp4^tp1,16)
	xr	$s0,$s3		# ^= ROTATE(tp8,8)

	st	$s0,16($key)
	la	$key,4($key)
	brct	$rounds,.Lmix

	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
	lghi	%r2,0
	br	$ra
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___
1179
1180########################################################################
1181# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1182#                     size_t length, const AES_KEY *key,
1183#                     unsigned char *ivec, const int enc)
1184{
1185my $inp="%r2";
1186my $out="%r4";	# length and out are swapped
1187my $len="%r3";
1188my $key="%r5";
1189my $ivp="%r6";
1190
1191$code.=<<___;
1192.globl	AES_cbc_encrypt
1193.type	AES_cbc_encrypt,\@function
1194.align	16
1195AES_cbc_encrypt:
1196	xgr	%r3,%r4		# flip %r3 and %r4, out and len
1197	xgr	%r4,%r3
1198	xgr	%r3,%r4
1199___
1200$code.=<<___ if (!$softonly);
1201	lhi	%r0,16
1202	cl	%r0,240($key)
1203	jh	.Lcbc_software
1204
1205	lg	%r0,0($ivp)	# copy ivec
1206	lg	%r1,8($ivp)
1207	stmg	%r0,%r1,16($sp)
1208	lmg	%r0,%r1,0($key)	# copy key, cover 256 bit
1209	stmg	%r0,%r1,32($sp)
1210	lmg	%r0,%r1,16($key)
1211	stmg	%r0,%r1,48($sp)
1212	l	%r0,240($key)	# load kmc code
1213	lghi	$key,15		# res=len%16, len-=res;
1214	ngr	$key,$len
1215	sl${g}r	$len,$key
1216	la	%r1,16($sp)	# parameter block - ivec || key
1217	jz	.Lkmc_truncated
1218	.long	0xb92f0042	# kmc %r4,%r2
1219	brc	1,.-4		# pay attention to "partial completion"
1220	ltr	$key,$key
1221	jnz	.Lkmc_truncated
1222.Lkmc_done:
1223	lmg	%r0,%r1,16($sp)	# copy ivec to caller
1224	stg	%r0,0($ivp)
1225	stg	%r1,8($ivp)
1226	br	$ra
1227.align	16
1228.Lkmc_truncated:
1229	ahi	$key,-1		# it's the way it's encoded in mvc
1230	tmll	%r0,S390X_DECRYPT
1231	jnz	.Lkmc_truncated_dec
1232	lghi	%r1,0
1233	stg	%r1,16*$SIZE_T($sp)
1234	stg	%r1,16*$SIZE_T+8($sp)
1235	bras	%r1,1f
1236	mvc	16*$SIZE_T(1,$sp),0($inp)
12371:	ex	$key,0(%r1)
1238	la	%r1,16($sp)	# restore parameter block
1239	la	$inp,16*$SIZE_T($sp)
1240	lghi	$len,16
1241	.long	0xb92f0042	# kmc %r4,%r2
1242	j	.Lkmc_done
1243.align	16
1244.Lkmc_truncated_dec:
1245	st${g}	$out,4*$SIZE_T($sp)
1246	la	$out,16*$SIZE_T($sp)
1247	lghi	$len,16
1248	.long	0xb92f0042	# kmc %r4,%r2
1249	l${g}	$out,4*$SIZE_T($sp)
1250	bras	%r1,2f
1251	mvc	0(1,$out),16*$SIZE_T($sp)
12522:	ex	$key,0(%r1)
1253	j	.Lkmc_done
1254.align	16
1255.Lcbc_software:
1256___
1257$code.=<<___;
1258	stm${g}	$key,$ra,5*$SIZE_T($sp)
1259	lhi	%r0,0
1260	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
1261	je	.Lcbc_decrypt
1262
1263	larl	$tbl,AES_Te
1264
1265	llgf	$s0,0($ivp)
1266	llgf	$s1,4($ivp)
1267	llgf	$s2,8($ivp)
1268	llgf	$s3,12($ivp)
1269
1270	lghi	$t0,16
1271	sl${g}r	$len,$t0
1272	brc	4,.Lcbc_enc_tail	# if borrow
1273.Lcbc_enc_loop:
1274	stm${g}	$inp,$out,2*$SIZE_T($sp)
1275	x	$s0,0($inp)
1276	x	$s1,4($inp)
1277	x	$s2,8($inp)
1278	x	$s3,12($inp)
1279	lgr	%r4,$key
1280
1281	bras	$ra,_s390x_AES_encrypt
1282
1283	lm${g}	$inp,$key,2*$SIZE_T($sp)
1284	st	$s0,0($out)
1285	st	$s1,4($out)
1286	st	$s2,8($out)
1287	st	$s3,12($out)
1288
1289	la	$inp,16($inp)
1290	la	$out,16($out)
1291	lghi	$t0,16
1292	lt${g}r	$len,$len
1293	jz	.Lcbc_enc_done
1294	sl${g}r	$len,$t0
1295	brc	4,.Lcbc_enc_tail	# if borrow
1296	j	.Lcbc_enc_loop
1297.align	16
1298.Lcbc_enc_done:
1299	l${g}	$ivp,6*$SIZE_T($sp)
1300	st	$s0,0($ivp)
1301	st	$s1,4($ivp)
1302	st	$s2,8($ivp)
1303	st	$s3,12($ivp)
1304
1305	lm${g}	%r7,$ra,7*$SIZE_T($sp)
1306	br	$ra
1307
1308.align	16
1309.Lcbc_enc_tail:
1310	aghi	$len,15
1311	lghi	$t0,0
1312	stg	$t0,16*$SIZE_T($sp)
1313	stg	$t0,16*$SIZE_T+8($sp)
1314	bras	$t1,3f
1315	mvc	16*$SIZE_T(1,$sp),0($inp)
13163:	ex	$len,0($t1)
1317	lghi	$len,0
1318	la	$inp,16*$SIZE_T($sp)
1319	j	.Lcbc_enc_loop
1320
1321.align	16
1322.Lcbc_decrypt:
1323	larl	$tbl,AES_Td
1324
1325	lg	$t0,0($ivp)
1326	lg	$t1,8($ivp)
1327	stmg	$t0,$t1,16*$SIZE_T($sp)
1328
1329.Lcbc_dec_loop:
1330	stm${g}	$inp,$out,2*$SIZE_T($sp)
1331	llgf	$s0,0($inp)
1332	llgf	$s1,4($inp)
1333	llgf	$s2,8($inp)
1334	llgf	$s3,12($inp)
1335	lgr	%r4,$key
1336
1337	bras	$ra,_s390x_AES_decrypt
1338
1339	lm${g}	$inp,$key,2*$SIZE_T($sp)
1340	sllg	$s0,$s0,32
1341	sllg	$s2,$s2,32
1342	lr	$s0,$s1
1343	lr	$s2,$s3
1344
1345	lg	$t0,0($inp)
1346	lg	$t1,8($inp)
1347	xg	$s0,16*$SIZE_T($sp)
1348	xg	$s2,16*$SIZE_T+8($sp)
1349	lghi	$s1,16
1350	sl${g}r	$len,$s1
1351	brc	4,.Lcbc_dec_tail	# if borrow
1352	brc	2,.Lcbc_dec_done	# if zero
1353	stg	$s0,0($out)
1354	stg	$s2,8($out)
1355	stmg	$t0,$t1,16*$SIZE_T($sp)
1356
1357	la	$inp,16($inp)
1358	la	$out,16($out)
1359	j	.Lcbc_dec_loop
1360
1361.Lcbc_dec_done:
1362	stg	$s0,0($out)
1363	stg	$s2,8($out)
1364.Lcbc_dec_exit:
1365	lm${g}	%r6,$ra,6*$SIZE_T($sp)
1366	stmg	$t0,$t1,0($ivp)
1367
1368	br	$ra
1369
1370.align	16
1371.Lcbc_dec_tail:
1372	aghi	$len,15
1373	stg	$s0,16*$SIZE_T($sp)
1374	stg	$s2,16*$SIZE_T+8($sp)
1375	bras	$s1,4f
1376	mvc	0(1,$out),16*$SIZE_T($sp)
13774:	ex	$len,0($s1)
1378	j	.Lcbc_dec_exit
1379.size	AES_cbc_encrypt,.-AES_cbc_encrypt
1380___
1381}
1382########################################################################
1383# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1384#                     size_t blocks, const AES_KEY *key,
1385#                     const unsigned char *ivec)
1386{
1387my $inp="%r2";
1388my $out="%r4";	# blocks and out are swapped
1389my $len="%r3";
1390my $key="%r5";	my $iv0="%r5";
1391my $ivp="%r6";
1392my $fp ="%r7";
1393
1394$code.=<<___;
1395.globl	AES_ctr32_encrypt
1396.type	AES_ctr32_encrypt,\@function
1397.align	16
1398AES_ctr32_encrypt:
1399	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
1400	xgr	%r4,%r3
1401	xgr	%r3,%r4
1402	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
1403___
1404$code.=<<___ if (!$softonly);
1405	l	%r0,240($key)
1406	lhi	%r1,16
1407	clr	%r0,%r1
1408	jl	.Lctr32_software
1409
1410	st${g}	$s2,10*$SIZE_T($sp)
1411	st${g}	$s3,11*$SIZE_T($sp)
1412
1413	clr	$len,%r1		# does work even in 64-bit mode
1414	jle	.Lctr32_nokma		# kma is slower for <= 16 blocks
1415
1416	larl	%r1,OPENSSL_s390xcap_P
1417	lr	$s2,%r0
1418	llihh	$s3,0x8000
1419	srlg	$s3,$s3,0($s2)
1420	ng	$s3,S390X_KMA(%r1)		# check kma capability vector
1421	jz	.Lctr32_nokma
1422
1423	l${g}hi	%r1,-$stdframe-112
1424	l${g}r	$s3,$sp
1425	la	$sp,0(%r1,$sp)			# prepare parameter block
1426
1427	lhi	%r1,0x0600
1428	sllg	$len,$len,4
1429	or	%r0,%r1				# set HS and LAAD flags
1430
1431	st${g}	$s3,0($sp)			# backchain
1432	la	%r1,$stdframe($sp)
1433
1434	lmg	$s2,$s3,0($key)			# copy key
1435	stg	$s2,$stdframe+80($sp)
1436	stg	$s3,$stdframe+88($sp)
1437	lmg	$s2,$s3,16($key)
1438	stg	$s2,$stdframe+96($sp)
1439	stg	$s3,$stdframe+104($sp)
1440
1441	lmg	$s2,$s3,0($ivp)			# copy iv
1442	stg	$s2,$stdframe+64($sp)
1443	ahi	$s3,-1				# kma requires counter-1
1444	stg	$s3,$stdframe+72($sp)
1445	st	$s3,$stdframe+12($sp)		# copy counter
1446
1447	lghi	$s2,0				# no AAD
1448	lghi	$s3,0
1449
1450	.long	0xb929a042	# kma $out,$s2,$inp
1451	brc	1,.-4		# pay attention to "partial completion"
1452
1453	stg	%r0,$stdframe+80($sp)		# wipe key
1454	stg	%r0,$stdframe+88($sp)
1455	stg	%r0,$stdframe+96($sp)
1456	stg	%r0,$stdframe+104($sp)
1457	la	$sp,$stdframe+112($sp)
1458
1459	lm${g}	$s2,$s3,10*$SIZE_T($sp)
1460	br	$ra
1461
1462.align	16
1463.Lctr32_nokma:
1464	stm${g}	%r6,$s1,6*$SIZE_T($sp)
1465
1466	slgr	$out,$inp
1467	la	%r1,0($key)	# %r1 is permanent copy of $key
1468	lg	$iv0,0($ivp)	# load ivec
1469	lg	$ivp,8($ivp)
1470
1471	# prepare and allocate stack frame at the top of 4K page
1472	# with 1K reserved for eventual signal handling
1473	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
1474	lghi	$s1,-4096
1475	algr	$s0,$sp
1476	lgr	$fp,$sp
1477	ngr	$s0,$s1		# align at page boundary
1478	slgr	$fp,$s0		# total buffer size
1479	lgr	$s2,$sp
1480	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
1481	slgr	$fp,$s1		# deduct reservation to get usable buffer size
1482	# buffer size is at lest 256 and at most 3072+256-16
1483
1484	la	$sp,1024($s0)	# alloca
1485	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
1486	st${g}	$s2,0($sp)	# back-chain
1487	st${g}	$fp,$SIZE_T($sp)
1488
1489	slgr	$len,$fp
1490	brc	1,.Lctr32_hw_switch	# not zero, no borrow
1491	algr	$fp,$len	# input is shorter than allocated buffer
1492	lghi	$len,0
1493	st${g}	$fp,$SIZE_T($sp)
1494
1495.Lctr32_hw_switch:
1496___
1497$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
1498	llgfr	$s0,%r0
1499	lgr	$s1,%r1
1500	larl	%r1,OPENSSL_s390xcap_P
1501	llihh	%r0,0x8000	# check if kmctr supports the function code
1502	srlg	%r0,%r0,0($s0)
1503	ng	%r0,S390X_KMCTR(%r1)	# check kmctr capability vector
1504	lgr	%r0,$s0
1505	lgr	%r1,$s1
1506	jz	.Lctr32_km_loop
1507
1508####### kmctr code
1509	algr	$out,$inp	# restore $out
1510	lgr	$s1,$len	# $s1 undertakes $len
1511	j	.Lctr32_kmctr_loop
1512.align	16
1513.Lctr32_kmctr_loop:
1514	la	$s2,16($sp)
1515	lgr	$s3,$fp
1516.Lctr32_kmctr_prepare:
1517	stg	$iv0,0($s2)
1518	stg	$ivp,8($s2)
1519	la	$s2,16($s2)
1520	ahi	$ivp,1		# 32-bit increment, preserves upper half
1521	brct	$s3,.Lctr32_kmctr_prepare
1522
1523	#la	$inp,0($inp)	# inp
1524	sllg	$len,$fp,4	# len
1525	#la	$out,0($out)	# out
1526	la	$s2,16($sp)	# iv
1527	.long	0xb92da042	# kmctr $out,$s2,$inp
1528	brc	1,.-4		# pay attention to "partial completion"
1529
1530	slgr	$s1,$fp
1531	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
1532	algr	$fp,$s1
1533	lghi	$s1,0
1534	brc	4+1,.Lctr32_kmctr_loop	# not zero
1535
1536	l${g}	$sp,0($sp)
1537	lm${g}	%r6,$s3,6*$SIZE_T($sp)
1538	br	$ra
1539.align	16
1540___
1541$code.=<<___ if (!$softonly);
1542.Lctr32_km_loop:
1543	la	$s2,16($sp)
1544	lgr	$s3,$fp
1545.Lctr32_km_prepare:
1546	stg	$iv0,0($s2)
1547	stg	$ivp,8($s2)
1548	la	$s2,16($s2)
1549	ahi	$ivp,1		# 32-bit increment, preserves upper half
1550	brct	$s3,.Lctr32_km_prepare
1551
1552	la	$s0,16($sp)	# inp
1553	sllg	$s1,$fp,4	# len
1554	la	$s2,16($sp)	# out
1555	.long	0xb92e00a8	# km %r10,%r8
1556	brc	1,.-4		# pay attention to "partial completion"
1557
1558	la	$s2,16($sp)
1559	lgr	$s3,$fp
1560	slgr	$s2,$inp
1561.Lctr32_km_xor:
1562	lg	$s0,0($inp)
1563	lg	$s1,8($inp)
1564	xg	$s0,0($s2,$inp)
1565	xg	$s1,8($s2,$inp)
1566	stg	$s0,0($out,$inp)
1567	stg	$s1,8($out,$inp)
1568	la	$inp,16($inp)
1569	brct	$s3,.Lctr32_km_xor
1570
1571	slgr	$len,$fp
1572	brc	1,.Lctr32_km_loop	# not zero, no borrow
1573	algr	$fp,$len
1574	lghi	$len,0
1575	brc	4+1,.Lctr32_km_loop	# not zero
1576
1577	l${g}	$s0,0($sp)
1578	l${g}	$s1,$SIZE_T($sp)
1579	la	$s2,16($sp)
1580.Lctr32_km_zap:
1581	stg	$s0,0($s2)
1582	stg	$s0,8($s2)
1583	la	$s2,16($s2)
1584	brct	$s1,.Lctr32_km_zap
1585
1586	la	$sp,0($s0)
1587	lm${g}	%r6,$s3,6*$SIZE_T($sp)
1588	br	$ra
1589.align	16
1590.Lctr32_software:
1591___
1592$code.=<<___;
1593	stm${g}	$key,$ra,5*$SIZE_T($sp)
1594	sl${g}r	$inp,$out
1595	larl	$tbl,AES_Te
1596	llgf	$t1,12($ivp)
1597
1598.Lctr32_loop:
1599	stm${g}	$inp,$out,2*$SIZE_T($sp)
1600	llgf	$s0,0($ivp)
1601	llgf	$s1,4($ivp)
1602	llgf	$s2,8($ivp)
1603	lgr	$s3,$t1
1604	st	$t1,16*$SIZE_T($sp)
1605	lgr	%r4,$key
1606
1607	bras	$ra,_s390x_AES_encrypt
1608
1609	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
1610	llgf	$t1,16*$SIZE_T($sp)
1611	x	$s0,0($inp,$out)
1612	x	$s1,4($inp,$out)
1613	x	$s2,8($inp,$out)
1614	x	$s3,12($inp,$out)
1615	stm	$s0,$s3,0($out)
1616
1617	la	$out,16($out)
1618	ahi	$t1,1		# 32-bit increment
1619	brct	$len,.Lctr32_loop
1620
1621	lm${g}	%r6,$ra,6*$SIZE_T($sp)
1622	br	$ra
1623.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
1624___
1625}
1626
1627########################################################################
1628# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
1629#	size_t len, const AES_KEY *key1, const AES_KEY *key2,
1630#	const unsigned char iv[16]);
1631#
1632{
1633my $inp="%r2";
1634my $out="%r4";	# len and out are swapped
1635my $len="%r3";
1636my $key1="%r5";	# $i1
1637my $key2="%r6";	# $i2
1638my $fp="%r7";	# $i3
1639my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...
1640
1641$code.=<<___;
1642.type	_s390x_xts_km,\@function
1643.align	16
1644_s390x_xts_km:
1645___
1646$code.=<<___ if(1);
1647	llgfr	$s0,%r0			# put aside the function code
1648	lghi	$s1,0x7f
1649	nr	$s1,%r0
1650	larl	%r1,OPENSSL_s390xcap_P
1651	llihh	%r0,0x8000
1652	srlg	%r0,%r0,32($s1)		# check for 32+function code
1653	ng	%r0,S390X_KM(%r1)	# check km capability vector
1654	lgr	%r0,$s0			# restore the function code
1655	la	%r1,0($key1)		# restore $key1
1656	jz	.Lxts_km_vanilla
1657
1658	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
1659	algr	$out,$inp
1660
1661	oill	%r0,32			# switch to xts function code
1662	aghi	$s1,-18			#
1663	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
1664	la	%r1,$tweak-16($sp)
1665	slgr	%r1,$s1			# parameter block position
1666	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
1667	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
1668					# yes, it contains junk and overlaps
1669					# with the tweak in 128-bit case.
1670					# it's done to avoid conditional
1671					# branch.
1672	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value
1673
1674	.long	0xb92e0042		# km %r4,%r2
1675	brc	1,.-4			# pay attention to "partial completion"
1676
1677	lrvg	$s0,$tweak+0($sp)	# load the last tweak
1678	lrvg	$s1,$tweak+8($sp)
1679	stmg	%r0,%r3,$tweak-32($sp)	# wipe copy of the key
1680
1681	nill	%r0,0xffdf		# switch back to original function code
1682	la	%r1,0($key1)		# restore pointer to $key1
1683	slgr	$out,$inp
1684
1685	llgc	$len,2*$SIZE_T-1($sp)
1686	nill	$len,0x0f		# $len%=16
1687	br	$ra
1688
1689.align	16
1690.Lxts_km_vanilla:
1691___
1692$code.=<<___;
1693	# prepare and allocate stack frame at the top of 4K page
1694	# with 1K reserved for eventual signal handling
1695	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
1696	lghi	$s1,-4096
1697	algr	$s0,$sp
1698	lgr	$fp,$sp
1699	ngr	$s0,$s1		# align at page boundary
1700	slgr	$fp,$s0		# total buffer size
1701	lgr	$s2,$sp
1702	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
1703	slgr	$fp,$s1		# deduct reservation to get usable buffer size
1704	# buffer size is at lest 256 and at most 3072+256-16
1705
1706	la	$sp,1024($s0)	# alloca
1707	nill	$fp,0xfff0	# round to 16*n
1708	st${g}	$s2,0($sp)	# back-chain
1709	nill	$len,0xfff0	# redundant
1710	st${g}	$fp,$SIZE_T($sp)
1711
1712	slgr	$len,$fp
1713	brc	1,.Lxts_km_go	# not zero, no borrow
1714	algr	$fp,$len	# input is shorter than allocated buffer
1715	lghi	$len,0
1716	st${g}	$fp,$SIZE_T($sp)
1717
1718.Lxts_km_go:
1719	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
1720	lrvg	$s1,$tweak+8($s2)
1721
1722	la	$s2,16($sp)		# vector of ascending tweak values
1723	slgr	$s2,$inp
1724	srlg	$s3,$fp,4
1725	j	.Lxts_km_start
1726
1727.Lxts_km_loop:
1728	la	$s2,16($sp)
1729	slgr	$s2,$inp
1730	srlg	$s3,$fp,4
1731.Lxts_km_prepare:
1732	lghi	$i1,0x87
1733	srag	$i2,$s1,63		# broadcast upper bit
1734	ngr	$i1,$i2			# rem
1735	algr	$s0,$s0
1736	alcgr	$s1,$s1
1737	xgr	$s0,$i1
1738.Lxts_km_start:
1739	lrvgr	$i1,$s0			# flip byte order
1740	lrvgr	$i2,$s1
1741	stg	$i1,0($s2,$inp)
1742	stg	$i2,8($s2,$inp)
1743	xg	$i1,0($inp)
1744	xg	$i2,8($inp)
1745	stg	$i1,0($out,$inp)
1746	stg	$i2,8($out,$inp)
1747	la	$inp,16($inp)
1748	brct	$s3,.Lxts_km_prepare
1749
1750	slgr	$inp,$fp		# rewind $inp
1751	la	$s2,0($out,$inp)
1752	lgr	$s3,$fp
1753	.long	0xb92e00aa		# km $s2,$s2
1754	brc	1,.-4			# pay attention to "partial completion"
1755
1756	la	$s2,16($sp)
1757	slgr	$s2,$inp
1758	srlg	$s3,$fp,4
1759.Lxts_km_xor:
1760	lg	$i1,0($out,$inp)
1761	lg	$i2,8($out,$inp)
1762	xg	$i1,0($s2,$inp)
1763	xg	$i2,8($s2,$inp)
1764	stg	$i1,0($out,$inp)
1765	stg	$i2,8($out,$inp)
1766	la	$inp,16($inp)
1767	brct	$s3,.Lxts_km_xor
1768
1769	slgr	$len,$fp
1770	brc	1,.Lxts_km_loop		# not zero, no borrow
1771	algr	$fp,$len
1772	lghi	$len,0
1773	brc	4+1,.Lxts_km_loop	# not zero
1774
1775	l${g}	$i1,0($sp)		# back-chain
1776	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
1777	la	$i2,16($sp)
1778	srlg	$fp,$fp,4
1779.Lxts_km_zap:
1780	stg	$i1,0($i2)
1781	stg	$i1,8($i2)
1782	la	$i2,16($i2)
1783	brct	$fp,.Lxts_km_zap
1784
1785	la	$sp,0($i1)
1786	llgc	$len,2*$SIZE_T-1($i1)
1787	nill	$len,0x0f		# $len%=16
1788	bzr	$ra
1789
1790	# generate one more tweak...
1791	lghi	$i1,0x87
1792	srag	$i2,$s1,63		# broadcast upper bit
1793	ngr	$i1,$i2			# rem
1794	algr	$s0,$s0
1795	alcgr	$s1,$s1
1796	xgr	$s0,$i1
1797
1798	ltr	$len,$len		# clear zero flag
1799	br	$ra
1800.size	_s390x_xts_km,.-_s390x_xts_km
1801
1802.globl	AES_xts_encrypt
1803.type	AES_xts_encrypt,\@function
1804.align	16
1805AES_xts_encrypt:
1806	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
1807	xgr	%r4,%r3
1808	xgr	%r3,%r4
1809___
# On 31-bit builds zero-extend $len before using 64-bit arithmetic on it.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Common prologue: stash the byte length, convert to a block count (the
# srag also sets the condition code), bail out for inputs shorter than 16.
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	srag	$len,$len,4		# formally wrong, because it expands
					# sign byte, but who can afford asking
					# to process more than 2^63-1 bytes?
					# I use it, because it sets condition
					# code...
	bcr	8,$ra			# abort if zero (i.e. less than 16)
___
# Hardware path: encrypt the iv with $key2 via km to get the initial tweak,
# then delegate the bulk to _s390x_xts_km.  If len%16 != 0, perform
# ciphertext stealing at .Lxts_enc_km_steal and encrypt the final mixed
# block with the one extra tweak the helper produced ($s0/$s1, byte-flipped
# via lrvgr before use).  The tweak slot is wiped before returning.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_enc_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	sllg	$len,$len,4		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed anymore
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore
	bras	$ra,_s390x_xts_km
	jz	.Lxts_enc_km_done

	aghi	$inp,-16		# take one step back
	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_enc_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_km_steal

	la	$s2,0($i3)
	lghi	$s3,16
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($s2)
	xg	$i2,8($s2)
	stg	$i1,0($s2)
	stg	$i2,8($s2)
	.long	0xb92e00aa		# km $s2,$s2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1
	xg	$i1,0($i3)
	xg	$i2,8($i3)
	stg	$i1,0($i3)
	stg	$i2,8($i3)

.Lxts_enc_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_enc_software:
___
# Software path: compute the tweak with _s390x_AES_encrypt under $key2,
# then per block advance it with the 0x87-feedback GF doubling, XOR it
# around the $key1 block encryption, and finally handle a partial last
# block by ciphertext stealing (.Lxts_enc_steal) plus one more tweaked
# encryption.  Tweak slot wiped at .Lxts_enc_done.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	j	.Lxts_enc_enter

.align	16
.Lxts_enc_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
	la	$inp,16($inp)		# $inp+=16
.Lxts_enc_enter:
	x	$s0,0($inp)		# ^=*($inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	brct${g}	$len,.Lxts_enc_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_enc_done

	la	$i3,0($inp,$out)	# put aside real $out
.Lxts_enc_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_enc_steal
	la	$out,0($i3)		# restore real $out

	# generate last tweak...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_encrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,`$tweak+0`($sp)	# ^=tweak
	x	$s1,`$tweak+4`($sp)
	x	$s2,`$tweak+8`($sp)
	x	$s3,`$tweak+12`($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)

.Lxts_enc_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_encrypt,.-AES_xts_encrypt
___
# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
#	size_t len, const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
$code.=<<___;
.globl	AES_xts_decrypt
.type	AES_xts_decrypt,\@function
.align	16
AES_xts_decrypt:
	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
	xgr	%r4,%r3
	xgr	%r3,%r4
___
# On 31-bit builds zero-extend $len before 64-bit arithmetic.
$code.=<<___ if ($SIZE_T==4);
	llgfr	$len,$len
___
# Common prologue: save $len and reserve the last (possibly partial-pair)
# 16 bytes -- decryption must process the final full block with the *second*
# tweak when stealing is needed, hence the -16/OK-restore dance around
# .Lxts_dec_proceed.
$code.=<<___;
	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
	aghi	$len,-16
	bcr	4,$ra			# abort if less than zero. formally
					# wrong, because $len is unsigned,
					# but who can afford asking to
					# process more than 2^63-1 bytes?
	tmll	$len,0x0f
	jnz	.Lxts_dec_proceed
	aghi	$len,16
.Lxts_dec_proceed:
___
# Hardware path: derive the tweak with km under $key2, run the bulk through
# _s390x_xts_km, then -- when len%16 != 0 -- decrypt the last full block
# with the advanced (2nd) tweak, steal ciphertext at .Lxts_dec_km_steal,
# and decrypt the reassembled block with the original tweak kept
# byte-reversed in $s2/$s3.  Tweak slot wiped at .Lxts_dec_km_done.
$code.=<<___ if (!$softonly);
	llgf	%r0,240($key2)
	lhi	%r1,16
	clr	%r0,%r1
	jl	.Lxts_dec_software

	st${g}	$ra,5*$SIZE_T($sp)
	stm${g}	%r6,$s3,6*$SIZE_T($sp)

	nill	$len,0xfff0		# $len&=~15
	slgr	$out,$inp

	# generate the tweak value
	l${g}	$s3,$stdframe($sp)	# pointer to iv
	la	$s2,$tweak($sp)
	lmg	$s0,$s1,0($s3)
	lghi	$s3,16
	stmg	$s0,$s1,0($s2)
	la	%r1,0($key2)		# $key2 is not needed past this point
	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
	brc	1,.-4			# can this happen?

	l	%r0,240($key1)
	la	%r1,0($key1)		# $key1 is not needed anymore

	ltgr	$len,$len
	jz	.Lxts_dec_km_short
	bras	$ra,_s390x_xts_km
	jz	.Lxts_dec_km_done

	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1
	j	.Lxts_dec_km_2ndtweak

.Lxts_dec_km_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%=16
	lrvg	$s0,$tweak+0($sp)	# load the tweak
	lrvg	$s1,$tweak+8($sp)
	lrvgr	$s2,$s0			# make copy in reverse byte order
	lrvgr	$s3,$s1

.Lxts_dec_km_2ndtweak:
	lghi	$i1,0x87
	srag	$i2,$s1,63		# broadcast upper bit
	ngr	$i1,$i2			# rem
	algr	$s0,$s0
	alcgr	$s1,$s1
	xgr	$s0,$i1
	lrvgr	$i1,$s0			# flip byte order
	lrvgr	$i2,$s1

	xg	$i1,0($inp)
	xg	$i2,8($inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)
	la	$i2,0($out,$inp)
	lghi	$i3,16
	.long	0xb92e0066		# km $i2,$i2
	brc	1,.-4			# can this happen?
	lrvgr	$i1,$s0
	lrvgr	$i2,$s1
	xg	$i1,0($out,$inp)
	xg	$i2,8($out,$inp)
	stg	$i1,0($out,$inp)
	stg	$i2,8($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_km_steal:
	llgc	$i1,16($inp)
	llgc	$i2,0($out,$inp)
	stc	$i1,0($out,$inp)
	stc	$i2,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_km_steal

	lgr	$s0,$s2
	lgr	$s1,$s3
	xg	$s0,0($i3)
	xg	$s1,8($i3)
	stg	$s0,0($i3)
	stg	$s1,8($i3)
	la	$s0,0($i3)
	lghi	$s1,16
	.long	0xb92e0088		# km $s0,$s0
	brc	1,.-4			# can this happen?
	xg	$s2,0($i3)
	xg	$s3,8($i3)
	stg	$s2,0($i3)
	stg	$s3,8($i3)
.Lxts_dec_km_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	l${g}	$ra,5*$SIZE_T($sp)
	lm${g}	%r6,$s3,6*$SIZE_T($sp)
	br	$ra
.align	16
.Lxts_dec_software:
___
# Software path: tweak generated with _s390x_AES_encrypt under $key2 (note
# AES_Te for the tweak, then AES_Td for decryption proper).  Full blocks go
# through .Lxts_dec_loop with the usual 0x87-feedback tweak doubling; a
# ragged tail first produces the 1st/2nd tweak pair, decrypts the last full
# block with the 2nd tweak (stored at $tweak-16), steals ciphertext at
# .Lxts_dec_steal, and decrypts the final block with the 1st tweak.  Both
# tweak slots are wiped on the way out.
$code.=<<___;
	stm${g}	%r6,$ra,6*$SIZE_T($sp)

	srlg	$len,$len,4
	slgr	$out,$inp

	l${g}	$s3,$stdframe($sp)	# ivp
	llgf	$s0,0($s3)		# load iv
	llgf	$s1,4($s3)
	llgf	$s2,8($s3)
	llgf	$s3,12($s3)
	stm${g}	%r2,%r5,2*$SIZE_T($sp)
	la	$key,0($key2)
	larl	$tbl,AES_Te
	bras	$ra,_s390x_AES_encrypt	# generate the tweak
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	larl	$tbl,AES_Td
	lt${g}r	$len,$len
	stm	$s0,$s3,$tweak($sp)	# save the tweak
	jz	.Lxts_dec_short
	j	.Lxts_dec_enter

.align	16
.Lxts_dec_loop:
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak+0($sp)	# save the tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak+8($sp)
	llgfr	$s3,$s3
.Lxts_dec_enter:
	x	$s0,0($inp)		# tweak^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)
	la	$inp,16($inp)
	brct${g}	$len,.Lxts_dec_loop

	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	jz	.Lxts_dec_done

	# generate pair of tweaks...
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$i2,$s1			# flip byte order
	lrvgr	$i3,$s3
	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
	j	.Lxts_dec_2ndtweak

.align	16
.Lxts_dec_short:
	llgc	$len,`2*$SIZE_T-1`($sp)
	nill	$len,0x0f		# $len%16
	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
	lrvg	$s3,$tweak+8($sp)
.Lxts_dec_2ndtweak:
	lghi	%r1,0x87
	srag	%r0,$s3,63		# broadcast upper bit
	ngr	%r1,%r0			# rem
	algr	$s1,$s1
	alcgr	$s3,$s3
	xgr	$s1,%r1
	lrvgr	$s1,$s1			# flip byte order
	lrvgr	$s3,$s3
	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
	llgfr	$s1,$s1
	srlg	$s2,$s3,32
	stg	$s3,$tweak-16+8($sp)
	llgfr	$s3,$s3

	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
	x	$s1,4($inp)
	x	$s2,8($inp)
	x	$s3,12($inp)
	stm${g}	%r2,%r3,2*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	lm${g}	%r2,%r5,2*$SIZE_T($sp)
	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
	x	$s1,$tweak-16+4($sp)
	x	$s2,$tweak-16+8($sp)
	x	$s3,$tweak-16+12($sp)
	st	$s0,0($out,$inp)
	st	$s1,4($out,$inp)
	st	$s2,8($out,$inp)
	st	$s3,12($out,$inp)

	la	$i3,0($out,$inp)	# put aside real $out
.Lxts_dec_steal:
	llgc	%r0,16($inp)
	llgc	%r1,0($out,$inp)
	stc	%r0,0($out,$inp)
	stc	%r1,16($out,$inp)
	la	$inp,1($inp)
	brct	$len,.Lxts_dec_steal
	la	$out,0($i3)		# restore real $out

	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
	x	$s1,4($out)
	x	$s2,8($out)
	x	$s3,12($out)
	st${g}	$out,4*$SIZE_T($sp)
	la	$key,0($key1)
	bras	$ra,_s390x_AES_decrypt
	l${g}	$out,4*$SIZE_T($sp)
	x	$s0,$tweak+0($sp)	# ^=tweak
	x	$s1,$tweak+4($sp)
	x	$s2,$tweak+8($sp)
	x	$s3,$tweak+12($sp)
	st	$s0,0($out)
	st	$s1,4($out)
	st	$s2,8($out)
	st	$s3,12($out)
	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
	stg	$sp,$tweak-16+8($sp)
.Lxts_dec_done:
	stg	$sp,$tweak+0($sp)	# wipe tweak
	stg	$sp,$tweak+8($sp)
	lm${g}	%r6,$ra,6*$SIZE_T($sp)
	br	$ra
.size	AES_xts_decrypt,.-AES_xts_decrypt
___
}
# Embed a human-readable attribution string in the generated object.
$code.=<<___;
.string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process and emit: evaluate every backtick-quoted Perl expression
# embedded in the assembly text (e.g. `2*$SIZE_T-1`) into its numeric
# value, then print the finished assembly to stdout.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";	# force flush
2285