xref: /openssl/crypto/modes/asm/aes-gcm-ppc.pl (revision fecb3aae)
1#! /usr/bin/env perl
2# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright 2021- IBM Inc. All rights reserved
4#
5# Licensed under the Apache License 2.0 (the "License").  You may not use
6# this file except in compliance with the License.  You can obtain a copy
7# in the file LICENSE in the source distribution or at
8# https://www.openssl.org/source/license.html
9#
10#===================================================================================
11# Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
12#
13# GHASH is based on the Karatsuba multiplication method.
14#
15#    Xi xor X1
16#
17#    X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
#      (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
19#      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
20#      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
21#      (X4.h * H.h + X4.l * H.l + X4 * H)
22#
23# Xi = v0
24# H Poly = v2
25# Hash keys = v3 - v14
26#     ( H.l, H, H.h)
27#     ( H^2.l, H^2, H^2.h)
28#     ( H^3.l, H^3, H^3.h)
29#     ( H^4.l, H^4, H^4.h)
30#
31# v30 is IV
32# v31 - counter 1
33#
34# AES used,
35#     vs0 - vs14 for round keys
36#     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
37#
38# This implementation uses stitched AES-GCM approach to improve overall performance.
39# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
40#
41# Current large block (16384 bytes) performance per second with 128 bit key --
42#
43#                        Encrypt  Decrypt
44# Power10[le] (3.5GHz)   5.32G    5.26G
45#
46# ===================================================================================
47#
48# $output is the last argument if it looks like a file (it has an extension)
49# $flavour is the first argument if it doesn't look like a file
# Argument parsing (perlasm convention):
# $output is the last argument if it looks like a file (it has an extension);
# $flavour (e.g. "linux64le") is the first argument if it doesn't look like a file.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select ABI-dependent sizes and mnemonics from the flavour string
# (64-bit vs 32-bit PowerPC).
if ($flavour =~ /64/) {
	$SIZE_T=8;		# bytes per pointer/size_t
	$LRSAVE=2*$SIZE_T;	# link-register save offset in caller frame
	$STU="stdu";		# store-with-update (stack frame push)
	$POP="ld";
	$PUSH="std";
	$UCMP="cmpld";		# unsigned compare
	$SHRI="srdi";		# shift-right immediate
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
	$UCMP="cmplw";
	$SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

# Locate the ppc-xlate.pl translator next to this script or in the shared
# perlasm directory, then pipe everything we print through it so the
# flavour-specific assembly dialect is emitted to $output.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
81
82$code=<<___;
83.machine        "any"
84.text
85
86# 4x loops
87# v15 - v18 - input states
88# vs1 - vs9 - round keys
89#
# Apply AES rounds 1-9 to the four states v15-v18.
# Round keys live in VSRs vs1-vs9; xxlor vd+32, n, n copies VSR n into
# vector register vd so vcipher can use it.
.macro Loop_aes_middle4x
	# round keys 1-4 -> v19-v22
	xxlor	19+32, 1, 1
	xxlor	20+32, 2, 2
	xxlor	21+32, 3, 3
	xxlor	22+32, 4, 4

	# rounds 1-4 on all four blocks
	vcipher	15, 15, 19
	vcipher	16, 16, 19
	vcipher	17, 17, 19
	vcipher	18, 18, 19

	vcipher	15, 15, 20
	vcipher	16, 16, 20
	vcipher	17, 17, 20
	vcipher	18, 18, 20

	vcipher	15, 15, 21
	vcipher	16, 16, 21
	vcipher	17, 17, 21
	vcipher	18, 18, 21

	vcipher	15, 15, 22
	vcipher	16, 16, 22
	vcipher	17, 17, 22
	vcipher	18, 18, 22

	# round keys 5-8 -> v19-v22
	xxlor	19+32, 5, 5
	xxlor	20+32, 6, 6
	xxlor	21+32, 7, 7
	xxlor	22+32, 8, 8

	# rounds 5-8
	vcipher	15, 15, 19
	vcipher	16, 16, 19
	vcipher	17, 17, 19
	vcipher	18, 18, 19

	vcipher	15, 15, 20
	vcipher	16, 16, 20
	vcipher	17, 17, 20
	vcipher	18, 18, 20

	vcipher	15, 15, 21
	vcipher	16, 16, 21
	vcipher	17, 17, 21
	vcipher	18, 18, 21

	vcipher	15, 15, 22
	vcipher	16, 16, 22
	vcipher	17, 17, 22
	vcipher	18, 18, 22

	# round 9; the final round key for vcipherlast is loaded by the caller
	xxlor	23+32, 9, 9
	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
.endm
147
148# 8x loops
149# v15 - v22 - input states
150# vs1 - vs9 - round keys
151#
# Apply AES rounds 1-9 to the eight states v15-v22.
# Round keys live in VSRs vs1-vs9; they are staged through v23-v26
# (xxlor vd+32, n, n copies VSR n into vector register vd).
.macro Loop_aes_middle8x
	# round keys 1-4 -> v23-v26
	xxlor	23+32, 1, 1
	xxlor	24+32, 2, 2
	xxlor	25+32, 3, 3
	xxlor	26+32, 4, 4

	# rounds 1-4 on all eight blocks
	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	vcipher	15, 15, 25
	vcipher	16, 16, 25
	vcipher	17, 17, 25
	vcipher	18, 18, 25
	vcipher	19, 19, 25
	vcipher	20, 20, 25
	vcipher	21, 21, 25
	vcipher	22, 22, 25

	vcipher	15, 15, 26
	vcipher	16, 16, 26
	vcipher	17, 17, 26
	vcipher	18, 18, 26
	vcipher	19, 19, 26
	vcipher	20, 20, 26
	vcipher	21, 21, 26
	vcipher	22, 22, 26

	# round keys 5-8 -> v23-v26
	xxlor	23+32, 5, 5
	xxlor	24+32, 6, 6
	xxlor	25+32, 7, 7
	xxlor	26+32, 8, 8

	# rounds 5-8
	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	vcipher	15, 15, 25
	vcipher	16, 16, 25
	vcipher	17, 17, 25
	vcipher	18, 18, 25
	vcipher	19, 19, 25
	vcipher	20, 20, 25
	vcipher	21, 21, 25
	vcipher	22, 22, 25

	vcipher	15, 15, 26
	vcipher	16, 16, 26
	vcipher	17, 17, 26
	vcipher	18, 18, 26
	vcipher	19, 19, 26
	vcipher	20, 20, 26
	vcipher	21, 21, 26
	vcipher	22, 22, 26

	# round 9; the final round key for vcipherlast is loaded by the caller
	xxlor	23+32, 9, 9
	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23
.endm
245
246#
247# Compute 4x hash values based on Karatsuba method.
248#
# GHASH over 4 blocks (Karatsuba):
#   in:  v0 = Xi, v2 = H poly, v3-v14 = (H..H^4).{l,m,h}, v15-v18 = blocks
#   out: v0 = updated Xi
# Clobbers v23-v29 (and vs29 via xxlxor).
ppc_aes_gcm_ghash:
	vxor		15, 15, 0		# fold Xi into first block

	xxlxor		29, 29, 29		# vs29 = 0 (zero vector for shifts)

	vpmsumd		23, 12, 15		# H4.L * X.L
	vpmsumd		24, 9, 16
	vpmsumd		25, 6, 17
	vpmsumd		26, 3, 18

	vxor		23, 23, 24
	vxor		23, 23, 25
	vxor		23, 23, 26		# L

	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
	vpmsumd		26, 7, 17
	vpmsumd		27, 4, 18

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27		# M

	# sum hash and reduction with H Poly
	vpmsumd		28, 23, 2		# reduction

	xxlor		29+32, 29, 29		# v29 = 0
	vsldoi		26, 24, 29, 8		# mL
	vsldoi		29, 29, 24, 8		# mH
	vxor		23, 23, 26		# mL + L

	vsldoi		23, 23, 23, 8		# swap
	vxor		23, 23, 28

	vpmsumd		24, 14, 15		# H4.H * X.H
	vpmsumd		25, 11, 16
	vpmsumd		26, 8, 17
	vpmsumd		27, 5, 18

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27

	vxor		24, 24, 29		# add middle-product high half

	# sum hash and reduction with H Poly
	vsldoi		27, 23, 23, 8		# swap
	vpmsumd		23, 23, 2
	vxor		27, 27, 24
	vxor		23, 23, 27

	xxlor		32, 23+32, 23+32		# update hash

	blr
303
304#
305# Combine two 4x ghash
306# v15 - v22 - input blocks
307#
# GHASH over 8 blocks as two chained 4x Karatsuba passes:
#   in:  v0 = Xi, v2 = H poly, v3-v14 = (H..H^4).{l,m,h}, v15-v22 = blocks
#   out: v0 = updated Xi
# The first pass folds Xi into v15..v18; its result (v27) seeds the
# second pass over v19..v22. Clobbers v23-v29 (and vs29).
.macro ppc_aes_gcm_ghash2_4x
	# first 4x hash
	vxor		15, 15, 0		# Xi + X

	xxlxor		29, 29, 29		# vs29 = 0

	vpmsumd		23, 12, 15		# H4.L * X.L
	vpmsumd		24, 9, 16
	vpmsumd		25, 6, 17
	vpmsumd		26, 3, 18

	vxor		23, 23, 24
	vxor		23, 23, 25
	vxor		23, 23, 26		# L

	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
	vpmsumd		26, 7, 17
	vpmsumd		27, 4, 18

	vxor		24, 24, 25
	vxor		24, 24, 26

	# sum hash and reduction with H Poly
	vpmsumd		28, 23, 2		# reduction

	xxlor		29+32, 29, 29		# v29 = 0

	vxor		24, 24, 27		# M
	vsldoi		26, 24, 29, 8		# mL
	vsldoi		29, 29, 24, 8		# mH
	vxor		23, 23, 26		# mL + L

	vsldoi		23, 23, 23, 8		# swap
	vxor		23, 23, 28

	vpmsumd		24, 14, 15		# H4.H * X.H
	vpmsumd		25, 11, 16
	vpmsumd		26, 8, 17
	vpmsumd		27, 5, 18

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27		# H

	vxor		24, 24, 29		# H + mH

	# sum hash and reduction with H Poly
	vsldoi		27, 23, 23, 8		# swap
	vpmsumd		23, 23, 2
	vxor		27, 27, 24
	vxor		27, 23, 27		# 1st Xi

	# 2nd 4x hash (seeded with 1st-pass Xi in v27)
	vpmsumd		24, 9, 20
	vpmsumd		25, 6, 21
	vpmsumd		26, 3, 22
	vxor		19, 19, 27		# Xi + X
	vpmsumd		23, 12, 19		# H4.L * X.L

	vxor		23, 23, 24
	vxor		23, 23, 25
	vxor		23, 23, 26		# L

	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L
	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L
	vpmsumd		26, 7, 21
	vpmsumd		27, 4, 22

	vxor		24, 24, 25
	vxor		24, 24, 26

	# sum hash and reduction with H Poly
	vpmsumd		28, 23, 2		# reduction

	xxlor		29+32, 29, 29		# v29 = 0

	vxor		24, 24, 27		# M
	vsldoi		26, 24, 29, 8		# mL
	vsldoi		29, 29, 24, 8		# mH
	vxor		23, 23, 26		# mL + L

	vsldoi		23, 23, 23, 8		# swap
	vxor		23, 23, 28

	vpmsumd		24, 14, 19		# H4.H * X.H
	vpmsumd		25, 11, 20
	vpmsumd		26, 8, 21
	vpmsumd		27, 5, 22

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27		# H

	vxor		24, 24, 29		# H + mH

	# sum hash and reduction with H Poly
	vsldoi		27, 23, 23, 8		# swap
	vpmsumd		23, 23, 2
	vxor		27, 27, 24
	vxor		23, 23, 27

	xxlor		32, 23+32, 23+32		# update hash

.endm
413
414#
415# Compute update single hash
416#
# GHASH update for a single block:
#   in:  v0 = Xi, v28 = input block, v2 = H poly, v3-v5 = H.{l,m,h}
#   out: v0 = updated Xi
# Clobbers v19, v20, v22-v27.
.macro ppc_update_hash_1x
	vxor		28, 28, 0		# fold Xi into the block

	vxor		19, 19, 19		# v19 = 0

	vpmsumd		22, 3, 28		# L
	vpmsumd		23, 4, 28		# M
	vpmsumd		24, 5, 28		# H

	vpmsumd		27, 22, 2		# reduction

	vsldoi		25, 23, 19, 8		# mL
	vsldoi		26, 19, 23, 8		# mH
	vxor		22, 22, 25		# LL + LL
	vxor		24, 24, 26		# HH + HH

	vsldoi		22, 22, 22, 8		# swap
	vxor		22, 22, 27

	vsldoi		20, 22, 22, 8		# swap
	vpmsumd		22, 22, 2		# reduction
	vxor		20, 20, 24
	vxor		22, 22, 20

	vmr		0, 22			# update hash

.endm
444
445#
446# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
447#               const AES_KEY *key, unsigned char iv[16],
448#               void *Xip);
449#
450#    r3 - inp
451#    r4 - out
452#    r5 - len
453#    r6 - AES round keys
454#    r7 - iv
455#    r8 - Xi, HPoli, hash keys
456#
457.global ppc_aes_gcm_encrypt
458.align 5
459ppc_aes_gcm_encrypt:
460_ppc_aes_gcm_encrypt:
461
462	stdu 1,-512(1)
463	mflr 0
464
465	std	14,112(1)
466	std	15,120(1)
467	std	16,128(1)
468	std	17,136(1)
469	std	18,144(1)
470	std	19,152(1)
471	std	20,160(1)
472	std	21,168(1)
473	li	9, 256
474	stvx	20, 9, 1
475	addi	9, 9, 16
476	stvx	21, 9, 1
477	addi	9, 9, 16
478	stvx	22, 9, 1
479	addi	9, 9, 16
480	stvx	23, 9, 1
481	addi	9, 9, 16
482	stvx	24, 9, 1
483	addi	9, 9, 16
484	stvx	25, 9, 1
485	addi	9, 9, 16
486	stvx	26, 9, 1
487	addi	9, 9, 16
488	stvx	27, 9, 1
489	addi	9, 9, 16
490	stvx	28, 9, 1
491	addi	9, 9, 16
492	stvx	29, 9, 1
493	addi	9, 9, 16
494	stvx	30, 9, 1
495	addi	9, 9, 16
496	stvx	31, 9, 1
497	std	0, 528(1)
498
499	# Load Xi
500	lxvb16x	32, 0, 8	# load Xi
501
502	# load Hash - h^4, h^3, h^2, h
503	li	10, 32
504	lxvd2x	2+32, 10, 8	# H Poli
505	li	10, 48
506	lxvd2x	3+32, 10, 8	# Hl
507	li	10, 64
508	lxvd2x	4+32, 10, 8	# H
509	li	10, 80
510	lxvd2x	5+32, 10, 8	# Hh
511
512	li	10, 96
513	lxvd2x	6+32, 10, 8	# H^2l
514	li	10, 112
515	lxvd2x	7+32, 10, 8	# H^2
516	li	10, 128
517	lxvd2x	8+32, 10, 8	# H^2h
518
519	li	10, 144
520	lxvd2x	9+32, 10, 8	# H^3l
521	li	10, 160
522	lxvd2x	10+32, 10, 8	# H^3
523	li	10, 176
524	lxvd2x	11+32, 10, 8	# H^3h
525
526	li	10, 192
527	lxvd2x	12+32, 10, 8	# H^4l
528	li	10, 208
529	lxvd2x	13+32, 10, 8	# H^4
530	li	10, 224
531	lxvd2x	14+32, 10, 8	# H^4h
532
533	# initialize ICB: GHASH( IV ), IV - r7
534	lxvb16x	30+32, 0, 7	# load IV  - v30
535
536	mr	12, 5		# length
537	li	11, 0		# block index
538
539	# counter 1
540	vxor	31, 31, 31
541	vspltisb 22, 1
542	vsldoi	31, 31, 22,1	# counter 1
543
544	# load round key to VSR
545	lxv	0, 0(6)
546	lxv	1, 0x10(6)
547	lxv	2, 0x20(6)
548	lxv	3, 0x30(6)
549	lxv	4, 0x40(6)
550	lxv	5, 0x50(6)
551	lxv	6, 0x60(6)
552	lxv	7, 0x70(6)
553	lxv	8, 0x80(6)
554	lxv	9, 0x90(6)
555	lxv	10, 0xa0(6)
556
557	# load rounds - 10 (128), 12 (192), 14 (256)
558	lwz	9,240(6)
559
560	#
561	# vxor	state, state, w # addroundkey
562	xxlor	32+29, 0, 0
563	vxor	15, 30, 29	# IV + round key - add round key 0
564
565	cmpdi	9, 10
566	beq	Loop_aes_gcm_8x
567
568	# load 2 more round keys (v11, v12)
569	lxv	11, 0xb0(6)
570	lxv	12, 0xc0(6)
571
572	cmpdi	9, 12
573	beq	Loop_aes_gcm_8x
574
	# load 2 more round keys (v13, v14)
576	lxv	13, 0xd0(6)
577	lxv	14, 0xe0(6)
578	cmpdi	9, 14
579	beq	Loop_aes_gcm_8x
580
581	b	aes_gcm_out
582
583.align 5
584Loop_aes_gcm_8x:
585	mr	14, 3
586	mr	9, 4
587
588	# n blocks
589	li	10, 128
590	divdu	10, 5, 10	# n 128 bytes-blocks
591	cmpdi	10, 0
592	beq	Loop_last_block
593
594	vaddudm	30, 30, 31	# IV + counter
595	vxor	16, 30, 29
596	vaddudm	30, 30, 31
597	vxor	17, 30, 29
598	vaddudm	30, 30, 31
599	vxor	18, 30, 29
600	vaddudm	30, 30, 31
601	vxor	19, 30, 29
602	vaddudm	30, 30, 31
603	vxor	20, 30, 29
604	vaddudm	30, 30, 31
605	vxor	21, 30, 29
606	vaddudm	30, 30, 31
607	vxor	22, 30, 29
608
609	mtctr	10
610
611	li	15, 16
612	li	16, 32
613	li	17, 48
614	li	18, 64
615	li	19, 80
616	li	20, 96
617	li	21, 112
618
619	lwz	10, 240(6)
620
621Loop_8x_block:
622
623	lxvb16x		15, 0, 14	# load block
624	lxvb16x		16, 15, 14	# load block
625	lxvb16x		17, 16, 14	# load block
626	lxvb16x		18, 17, 14	# load block
627	lxvb16x		19, 18, 14	# load block
628	lxvb16x		20, 19, 14	# load block
629	lxvb16x		21, 20, 14	# load block
630	lxvb16x		22, 21, 14	# load block
631	addi		14, 14, 128
632
633	Loop_aes_middle8x
634
635	xxlor	23+32, 10, 10
636
637	cmpdi	10, 10
638	beq	Do_next_ghash
639
640	# 192 bits
641	xxlor	24+32, 11, 11
642
643	vcipher	15, 15, 23
644	vcipher	16, 16, 23
645	vcipher	17, 17, 23
646	vcipher	18, 18, 23
647	vcipher	19, 19, 23
648	vcipher	20, 20, 23
649	vcipher	21, 21, 23
650	vcipher	22, 22, 23
651
652	vcipher	15, 15, 24
653	vcipher	16, 16, 24
654	vcipher	17, 17, 24
655	vcipher	18, 18, 24
656	vcipher	19, 19, 24
657	vcipher	20, 20, 24
658	vcipher	21, 21, 24
659	vcipher	22, 22, 24
660
661	xxlor	23+32, 12, 12
662
663	cmpdi	10, 12
664	beq	Do_next_ghash
665
666	# 256 bits
667	xxlor	24+32, 13, 13
668
669	vcipher	15, 15, 23
670	vcipher	16, 16, 23
671	vcipher	17, 17, 23
672	vcipher	18, 18, 23
673	vcipher	19, 19, 23
674	vcipher	20, 20, 23
675	vcipher	21, 21, 23
676	vcipher	22, 22, 23
677
678	vcipher	15, 15, 24
679	vcipher	16, 16, 24
680	vcipher	17, 17, 24
681	vcipher	18, 18, 24
682	vcipher	19, 19, 24
683	vcipher	20, 20, 24
684	vcipher	21, 21, 24
685	vcipher	22, 22, 24
686
687	xxlor	23+32, 14, 14
688
689	cmpdi	10, 14
690	beq	Do_next_ghash
691	b	aes_gcm_out
692
693Do_next_ghash:
694
695	#
696	# last round
697	vcipherlast     15, 15, 23
698	vcipherlast     16, 16, 23
699
700	xxlxor		47, 47, 15
701	stxvb16x        47, 0, 9	# store output
702	xxlxor		48, 48, 16
703	stxvb16x        48, 15, 9	# store output
704
705	vcipherlast     17, 17, 23
706	vcipherlast     18, 18, 23
707
708	xxlxor		49, 49, 17
709	stxvb16x        49, 16, 9	# store output
710	xxlxor		50, 50, 18
711	stxvb16x        50, 17, 9	# store output
712
713	vcipherlast     19, 19, 23
714	vcipherlast     20, 20, 23
715
716	xxlxor		51, 51, 19
717	stxvb16x        51, 18, 9	# store output
718	xxlxor		52, 52, 20
719	stxvb16x        52, 19, 9	# store output
720
721	vcipherlast     21, 21, 23
722	vcipherlast     22, 22, 23
723
724	xxlxor		53, 53, 21
725	stxvb16x        53, 20, 9	# store output
726	xxlxor		54, 54, 22
727	stxvb16x        54, 21, 9	# store output
728
729	addi		9, 9, 128
730
731	# ghash here
732	ppc_aes_gcm_ghash2_4x
733
734	xxlor	27+32, 0, 0
735	vaddudm 30, 30, 31		# IV + counter
736	vmr	29, 30
737	vxor    15, 30, 27		# add round key
738	vaddudm 30, 30, 31
739	vxor    16, 30, 27
740	vaddudm 30, 30, 31
741	vxor    17, 30, 27
742	vaddudm 30, 30, 31
743	vxor    18, 30, 27
744	vaddudm 30, 30, 31
745	vxor    19, 30, 27
746	vaddudm 30, 30, 31
747	vxor    20, 30, 27
748	vaddudm 30, 30, 31
749	vxor    21, 30, 27
750	vaddudm 30, 30, 31
751	vxor    22, 30, 27
752
753	addi    12, 12, -128
754	addi    11, 11, 128
755
756	bdnz	Loop_8x_block
757
758	vmr	30, 29
759
760Loop_last_block:
761	cmpdi   12, 0
762	beq     aes_gcm_out
763
764	# loop last few blocks
765	li      10, 16
766	divdu   10, 12, 10
767
768	mtctr   10
769
770	lwz	10, 240(6)
771
772	cmpdi   12, 16
773	blt     Final_block
774
# Apply AES rounds 1-9 to the single state v15.
# Round keys come from VSRs vs1-vs9 via v19-v22; the final round key for
# vcipherlast is loaded by the caller. (.macro emits no code here; it only
# defines the macro for the remainder-block paths below.)
.macro Loop_aes_middle_1x
	# round keys 1-4 -> v19-v22
	xxlor	19+32, 1, 1
	xxlor	20+32, 2, 2
	xxlor	21+32, 3, 3
	xxlor	22+32, 4, 4

	vcipher 15, 15, 19
	vcipher 15, 15, 20
	vcipher 15, 15, 21
	vcipher 15, 15, 22

	# round keys 5-8 -> v19-v22
	xxlor	19+32, 5, 5
	xxlor	20+32, 6, 6
	xxlor	21+32, 7, 7
	xxlor	22+32, 8, 8

	vcipher 15, 15, 19
	vcipher 15, 15, 20
	vcipher 15, 15, 21
	vcipher 15, 15, 22

	# round 9
	xxlor	19+32, 9, 9
	vcipher 15, 15, 19
.endm
799
800Next_rem_block:
801	lxvb16x 15, 0, 14		# load block
802
803	Loop_aes_middle_1x
804
805	xxlor	23+32, 10, 10
806
807	cmpdi	10, 10
808	beq	Do_next_1x
809
810	# 192 bits
811	xxlor	24+32, 11, 11
812
813	vcipher	15, 15, 23
814	vcipher	15, 15, 24
815
816	xxlor	23+32, 12, 12
817
818	cmpdi	10, 12
819	beq	Do_next_1x
820
821	# 256 bits
822	xxlor	24+32, 13, 13
823
824	vcipher	15, 15, 23
825	vcipher	15, 15, 24
826
827	xxlor	23+32, 14, 14
828
829	cmpdi	10, 14
830	beq	Do_next_1x
831
832Do_next_1x:
833	vcipherlast     15, 15, 23
834
835	xxlxor		47, 47, 15
836	stxvb16x	47, 0, 9	# store output
837	addi		14, 14, 16
838	addi		9, 9, 16
839
840	vmr		28, 15
841	ppc_update_hash_1x
842
843	addi		12, 12, -16
844	addi		11, 11, 16
845	xxlor		19+32, 0, 0
846	vaddudm		30, 30, 31		# IV + counter
847	vxor		15, 30, 19		# add round key
848
849	bdnz	Next_rem_block
850
851	cmpdi	12, 0
852	beq	aes_gcm_out
853
854Final_block:
855	Loop_aes_middle_1x
856
857	xxlor	23+32, 10, 10
858
859	cmpdi	10, 10
860	beq	Do_final_1x
861
862	# 192 bits
863	xxlor	24+32, 11, 11
864
865	vcipher	15, 15, 23
866	vcipher	15, 15, 24
867
868	xxlor	23+32, 12, 12
869
870	cmpdi	10, 12
871	beq	Do_final_1x
872
873	# 256 bits
874	xxlor	24+32, 13, 13
875
876	vcipher	15, 15, 23
877	vcipher	15, 15, 24
878
879	xxlor	23+32, 14, 14
880
881	cmpdi	10, 14
882	beq	Do_final_1x
883
884Do_final_1x:
885	vcipherlast     15, 15, 23
886
887	lxvb16x	15, 0, 14		# load last block
888	xxlxor	47, 47, 15
889
890	# create partial block mask
891	li	15, 16
892	sub	15, 15, 12		# index to the mask
893
894	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
895	vspltisb	17, 0		# second 16 bytes - 0x0000...00
896	li	10, 192
897	stvx	16, 10, 1
898	addi	10, 10, 16
899	stvx	17, 10, 1
900
901	addi	10, 1, 192
902	lxvb16x	16, 15, 10		# load partial block mask
903	xxland	47, 47, 16
904
905	vmr	28, 15
906	ppc_update_hash_1x
907
908	# * should store only the remaining bytes.
909	bl	Write_partial_block
910
911	b aes_gcm_out
912
913#
914# Write partial block
915# r9 - output
916# r12 - remaining bytes
917# v15 - partial input data
918#
Write_partial_block:
	# Spill the partial block (v15) to the scratch area at sp+192, then
	# copy r12 bytes of it to the output pointer r9 one byte at a time.
	# Clobbers r10, r14, r15, r16 and CTR.
	li		10, 192
	stxvb16x	15+32, 10, 1		# last block

	#add		10, 9, 11		# Output
	addi		10, 9, -1		# lbzu/stbu pre-increment, so start one before
	addi		16, 1, 191

        mtctr		12			# remaining bytes
	li		15, 0			# NOTE(review): r15 appears unused after this - confirm

Write_last_byte:
        lbzu		14, 1(16)
	stbu		14, 1(10)
        bdnz		Write_last_byte
	blr
935
936aes_gcm_out:
937	# out = state
938	stxvb16x	32, 0, 8		# write out Xi
939	add	3, 11, 12		# return count
940
941	li	9, 256
942	lvx	20, 9, 1
943	addi	9, 9, 16
944	lvx	21, 9, 1
945	addi	9, 9, 16
946	lvx	22, 9, 1
947	addi	9, 9, 16
948	lvx	23, 9, 1
949	addi	9, 9, 16
950	lvx	24, 9, 1
951	addi	9, 9, 16
952	lvx	25, 9, 1
953	addi	9, 9, 16
954	lvx	26, 9, 1
955	addi	9, 9, 16
956	lvx	27, 9, 1
957	addi	9, 9, 16
958	lvx	28, 9, 1
959	addi	9, 9, 16
960	lvx	29, 9, 1
961	addi	9, 9, 16
962	lvx	30, 9, 1
963	addi	9, 9, 16
964	lvx	31, 9, 1
965
966	ld	0, 528(1)
967	ld      14,112(1)
968	ld      15,120(1)
969	ld      16,128(1)
970	ld      17,136(1)
971	ld      18,144(1)
972	ld      19,152(1)
973	ld      20,160(1)
974	ld	21,168(1)
975
976	mtlr	0
977	addi	1, 1, 512
978	blr
979
980#
981# 8x Decrypt
982#
983.global ppc_aes_gcm_decrypt
984.align 5
985ppc_aes_gcm_decrypt:
986_ppc_aes_gcm_decrypt:
987
988	stdu 1,-512(1)
989	mflr 0
990
991	std	14,112(1)
992	std	15,120(1)
993	std	16,128(1)
994	std	17,136(1)
995	std	18,144(1)
996	std	19,152(1)
997	std	20,160(1)
998	std	21,168(1)
999	li	9, 256
1000	stvx	20, 9, 1
1001	addi	9, 9, 16
1002	stvx	21, 9, 1
1003	addi	9, 9, 16
1004	stvx	22, 9, 1
1005	addi	9, 9, 16
1006	stvx	23, 9, 1
1007	addi	9, 9, 16
1008	stvx	24, 9, 1
1009	addi	9, 9, 16
1010	stvx	25, 9, 1
1011	addi	9, 9, 16
1012	stvx	26, 9, 1
1013	addi	9, 9, 16
1014	stvx	27, 9, 1
1015	addi	9, 9, 16
1016	stvx	28, 9, 1
1017	addi	9, 9, 16
1018	stvx	29, 9, 1
1019	addi	9, 9, 16
1020	stvx	30, 9, 1
1021	addi	9, 9, 16
1022	stvx	31, 9, 1
1023	std	0, 528(1)
1024
1025	# Load Xi
1026	lxvb16x	32, 0, 8	# load Xi
1027
1028	# load Hash - h^4, h^3, h^2, h
1029	li	10, 32
1030	lxvd2x	2+32, 10, 8	# H Poli
1031	li	10, 48
1032	lxvd2x	3+32, 10, 8	# Hl
1033	li	10, 64
1034	lxvd2x	4+32, 10, 8	# H
1035	li	10, 80
1036	lxvd2x	5+32, 10, 8	# Hh
1037
1038	li	10, 96
1039	lxvd2x	6+32, 10, 8	# H^2l
1040	li	10, 112
1041	lxvd2x	7+32, 10, 8	# H^2
1042	li	10, 128
1043	lxvd2x	8+32, 10, 8	# H^2h
1044
1045	li	10, 144
1046	lxvd2x	9+32, 10, 8	# H^3l
1047	li	10, 160
1048	lxvd2x	10+32, 10, 8	# H^3
1049	li	10, 176
1050	lxvd2x	11+32, 10, 8	# H^3h
1051
1052	li	10, 192
1053	lxvd2x	12+32, 10, 8	# H^4l
1054	li	10, 208
1055	lxvd2x	13+32, 10, 8	# H^4
1056	li	10, 224
1057	lxvd2x	14+32, 10, 8	# H^4h
1058
1059	# initialize ICB: GHASH( IV ), IV - r7
1060	lxvb16x	30+32, 0, 7	# load IV  - v30
1061
1062	mr	12, 5		# length
1063	li	11, 0		# block index
1064
1065	# counter 1
1066	vxor	31, 31, 31
1067	vspltisb 22, 1
1068	vsldoi	31, 31, 22,1	# counter 1
1069
1070	# load round key to VSR
1071	lxv	0, 0(6)
1072	lxv	1, 0x10(6)
1073	lxv	2, 0x20(6)
1074	lxv	3, 0x30(6)
1075	lxv	4, 0x40(6)
1076	lxv	5, 0x50(6)
1077	lxv	6, 0x60(6)
1078	lxv	7, 0x70(6)
1079	lxv	8, 0x80(6)
1080	lxv	9, 0x90(6)
1081	lxv	10, 0xa0(6)
1082
1083	# load rounds - 10 (128), 12 (192), 14 (256)
1084	lwz	9,240(6)
1085
1086	#
1087	# vxor	state, state, w # addroundkey
1088	xxlor	32+29, 0, 0
1089	vxor	15, 30, 29	# IV + round key - add round key 0
1090
1091	cmpdi	9, 10
1092	beq	Loop_aes_gcm_8x_dec
1093
1094	# load 2 more round keys (v11, v12)
1095	lxv	11, 0xb0(6)
1096	lxv	12, 0xc0(6)
1097
1098	cmpdi	9, 12
1099	beq	Loop_aes_gcm_8x_dec
1100
	# load 2 more round keys (v13, v14)
1102	lxv	13, 0xd0(6)
1103	lxv	14, 0xe0(6)
1104	cmpdi	9, 14
1105	beq	Loop_aes_gcm_8x_dec
1106
1107	b	aes_gcm_out
1108
1109.align 5
1110Loop_aes_gcm_8x_dec:
1111	mr	14, 3
1112	mr	9, 4
1113
1114	# n blocks
1115	li	10, 128
1116	divdu	10, 5, 10	# n 128 bytes-blocks
1117	cmpdi	10, 0
1118	beq	Loop_last_block_dec
1119
1120	vaddudm	30, 30, 31	# IV + counter
1121	vxor	16, 30, 29
1122	vaddudm	30, 30, 31
1123	vxor	17, 30, 29
1124	vaddudm	30, 30, 31
1125	vxor	18, 30, 29
1126	vaddudm	30, 30, 31
1127	vxor	19, 30, 29
1128	vaddudm	30, 30, 31
1129	vxor	20, 30, 29
1130	vaddudm	30, 30, 31
1131	vxor	21, 30, 29
1132	vaddudm	30, 30, 31
1133	vxor	22, 30, 29
1134
1135	mtctr	10
1136
1137	li	15, 16
1138	li	16, 32
1139	li	17, 48
1140	li	18, 64
1141	li	19, 80
1142	li	20, 96
1143	li	21, 112
1144
1145	lwz	10, 240(6)
1146
1147Loop_8x_block_dec:
1148
1149	lxvb16x		15, 0, 14	# load block
1150	lxvb16x		16, 15, 14	# load block
1151	lxvb16x		17, 16, 14	# load block
1152	lxvb16x		18, 17, 14	# load block
1153	lxvb16x		19, 18, 14	# load block
1154	lxvb16x		20, 19, 14	# load block
1155	lxvb16x		21, 20, 14	# load block
1156	lxvb16x		22, 21, 14	# load block
1157	addi		14, 14, 128
1158
1159	Loop_aes_middle8x
1160
1161	xxlor	23+32, 10, 10
1162
1163	cmpdi	10, 10
1164	beq	Do_last_aes_dec
1165
1166	# 192 bits
1167	xxlor	24+32, 11, 11
1168
1169	vcipher	15, 15, 23
1170	vcipher	16, 16, 23
1171	vcipher	17, 17, 23
1172	vcipher	18, 18, 23
1173	vcipher	19, 19, 23
1174	vcipher	20, 20, 23
1175	vcipher	21, 21, 23
1176	vcipher	22, 22, 23
1177
1178	vcipher	15, 15, 24
1179	vcipher	16, 16, 24
1180	vcipher	17, 17, 24
1181	vcipher	18, 18, 24
1182	vcipher	19, 19, 24
1183	vcipher	20, 20, 24
1184	vcipher	21, 21, 24
1185	vcipher	22, 22, 24
1186
1187	xxlor	23+32, 12, 12
1188
1189	cmpdi	10, 12
1190	beq	Do_last_aes_dec
1191
1192	# 256 bits
1193	xxlor	24+32, 13, 13
1194
1195	vcipher	15, 15, 23
1196	vcipher	16, 16, 23
1197	vcipher	17, 17, 23
1198	vcipher	18, 18, 23
1199	vcipher	19, 19, 23
1200	vcipher	20, 20, 23
1201	vcipher	21, 21, 23
1202	vcipher	22, 22, 23
1203
1204	vcipher	15, 15, 24
1205	vcipher	16, 16, 24
1206	vcipher	17, 17, 24
1207	vcipher	18, 18, 24
1208	vcipher	19, 19, 24
1209	vcipher	20, 20, 24
1210	vcipher	21, 21, 24
1211	vcipher	22, 22, 24
1212
1213	xxlor	23+32, 14, 14
1214
1215	cmpdi	10, 14
1216	beq	Do_last_aes_dec
1217	b	aes_gcm_out
1218
1219Do_last_aes_dec:
1220
1221	#
1222	# last round
1223	vcipherlast     15, 15, 23
1224	vcipherlast     16, 16, 23
1225
1226	xxlxor		47, 47, 15
1227	stxvb16x        47, 0, 9	# store output
1228	xxlxor		48, 48, 16
1229	stxvb16x        48, 15, 9	# store output
1230
1231	vcipherlast     17, 17, 23
1232	vcipherlast     18, 18, 23
1233
1234	xxlxor		49, 49, 17
1235	stxvb16x        49, 16, 9	# store output
1236	xxlxor		50, 50, 18
1237	stxvb16x        50, 17, 9	# store output
1238
1239	vcipherlast     19, 19, 23
1240	vcipherlast     20, 20, 23
1241
1242	xxlxor		51, 51, 19
1243	stxvb16x        51, 18, 9	# store output
1244	xxlxor		52, 52, 20
1245	stxvb16x        52, 19, 9	# store output
1246
1247	vcipherlast     21, 21, 23
1248	vcipherlast     22, 22, 23
1249
1250	xxlxor		53, 53, 21
1251	stxvb16x        53, 20, 9	# store output
1252	xxlxor		54, 54, 22
1253	stxvb16x        54, 21, 9	# store output
1254
1255	addi		9, 9, 128
1256
1257	xxlor		15+32, 15, 15
1258	xxlor		16+32, 16, 16
1259	xxlor		17+32, 17, 17
1260	xxlor		18+32, 18, 18
1261	xxlor		19+32, 19, 19
1262	xxlor		20+32, 20, 20
1263	xxlor		21+32, 21, 21
1264	xxlor		22+32, 22, 22
1265
1266	# ghash here
1267	ppc_aes_gcm_ghash2_4x
1268
1269	xxlor	27+32, 0, 0
1270	vaddudm 30, 30, 31		# IV + counter
1271	vmr	29, 30
1272	vxor    15, 30, 27		# add round key
1273	vaddudm 30, 30, 31
1274	vxor    16, 30, 27
1275	vaddudm 30, 30, 31
1276	vxor    17, 30, 27
1277	vaddudm 30, 30, 31
1278	vxor    18, 30, 27
1279	vaddudm 30, 30, 31
1280	vxor    19, 30, 27
1281	vaddudm 30, 30, 31
1282	vxor    20, 30, 27
1283	vaddudm 30, 30, 31
1284	vxor    21, 30, 27
1285	vaddudm 30, 30, 31
1286	vxor    22, 30, 27
1287	addi    12, 12, -128
1288	addi    11, 11, 128
1289
1290	bdnz	Loop_8x_block_dec
1291
1292	vmr	30, 29
1293
1294Loop_last_block_dec:
1295	cmpdi   12, 0
1296	beq     aes_gcm_out
1297
1298	# loop last few blocks
1299	li      10, 16
1300	divdu   10, 12, 10
1301
1302	mtctr   10
1303
1304	lwz	10,240(6)
1305
1306	cmpdi   12, 16
1307	blt     Final_block_dec
1308
1309Next_rem_block_dec:
1310	lxvb16x 15, 0, 14		# load block
1311
1312	Loop_aes_middle_1x
1313
1314	xxlor	23+32, 10, 10
1315
1316	cmpdi	10, 10
1317	beq	Do_next_1x_dec
1318
1319	# 192 bits
1320	xxlor	24+32, 11, 11
1321
1322	vcipher	15, 15, 23
1323	vcipher	15, 15, 24
1324
1325	xxlor	23+32, 12, 12
1326
1327	cmpdi	10, 12
1328	beq	Do_next_1x_dec
1329
1330	# 256 bits
1331	xxlor	24+32, 13, 13
1332
1333	vcipher	15, 15, 23
1334	vcipher	15, 15, 24
1335
1336	xxlor	23+32, 14, 14
1337
1338	cmpdi	10, 14
1339	beq	Do_next_1x_dec
1340
1341Do_next_1x_dec:
1342	vcipherlast     15, 15, 23
1343
1344	xxlxor  47, 47, 15
1345	stxvb16x        47, 0, 9	# store output
1346	addi	14, 14, 16
1347	addi	9, 9, 16
1348
1349	xxlor	28+32, 15, 15
1350	ppc_update_hash_1x
1351
1352	addi    12, 12, -16
1353	addi    11, 11, 16
1354	xxlor	19+32, 0, 0
1355	vaddudm 30, 30, 31		# IV + counter
1356	vxor	15, 30, 19		# add round key
1357
1358	bdnz	Next_rem_block_dec
1359
1360	cmpdi	12, 0
1361	beq	aes_gcm_out
1362
1363Final_block_dec:
1364	Loop_aes_middle_1x
1365
1366	xxlor	23+32, 10, 10
1367
1368	cmpdi	10, 10
1369	beq	Do_final_1x_dec
1370
1371	# 192 bits
1372	xxlor	24+32, 11, 11
1373
1374	vcipher	15, 15, 23
1375	vcipher	15, 15, 24
1376
1377	xxlor	23+32, 12, 12
1378
1379	cmpdi	10, 12
1380	beq	Do_final_1x_dec
1381
1382	# 256 bits
1383	xxlor	24+32, 13, 13
1384
1385	vcipher	15, 15, 23
1386	vcipher	15, 15, 24
1387
1388	xxlor	23+32, 14, 14
1389
1390	cmpdi	10, 14
1391	beq	Do_final_1x_dec
1392
1393Do_final_1x_dec:
1394	vcipherlast     15, 15, 23
1395
1396	lxvb16x	15, 0, 14		# load block
1397	xxlxor	47, 47, 15
1398
1399	# create partial block mask
1400	li	15, 16
1401	sub	15, 15, 12		# index to the mask
1402
1403	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
1404	vspltisb	17, 0		# second 16 bytes - 0x0000...00
1405	li	10, 192
1406	stvx	16, 10, 1
1407	addi	10, 10, 16
1408	stvx	17, 10, 1
1409
1410	addi	10, 1, 192
1411	lxvb16x	16, 15, 10		# load block mask
1412	xxland	47, 47, 16
1413
1414	xxlor	28+32, 15, 15
1415	ppc_update_hash_1x
1416
1417	# * should store only the remaining bytes.
1418	bl	Write_partial_block
1419
1420	b aes_gcm_out
1421
1422
1423___
1424
# Post-process the generated assembly line by line: evaluate any
# `...` expressions, then resolve the le?/be? endianness markers for
# the requested flavour before handing the text to ppc-xlate.pl.
for my $line (split("\n", $code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	if ($flavour =~ /le$/) {
		# little-endian: keep le? lines, comment out be? lines
		$line =~ s/le\?//
		    or $line =~ s/be\?/#be#/;
	}
	else {
		# big-endian: comment out le? lines, keep be? lines
		$line =~ s/le\?/#le#/
		    or $line =~ s/be\?//;
	}
	print $line, "\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
1439