1#! /usr/bin/env perl
2# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
15# obtain it.
16#========================================================================
17#
# Approach - We want to reload constants, as we have plenty of spare ASIMD slots around crypto units for loading.
# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
# intermediate hashes from the 8 blocks.
21#
22#  ____________________________________________________
23# |                                                    |
24# | PRE                                                |
25# |____________________________________________________|
26# |                |                |                  |
27# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
28# |________________|________________|__________________|
29# |                |                |                  |
30# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
31# |________________|________________|__________________|
32# |                |                |                  |
33# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
34# |________________|________________|__________________|
35# |                |                |                  |
36# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
37# |________________|________________|__________________|
38# |                |                |                  |
39# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
40# |________________|________________|__________________|
41# |                |                |                  |
42# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
43# |________________|________________|__________________|
44# |                |                |                  |
45# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
46# |________________|________________|__________________|
47# |                |                |                  |
48# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
49# |________________|____(mostly)____|__________________|
50# |                                                    |
51# | MODULO                                             |
52# |____________________________________________________|
53#
54# PRE:
#     Ensure previous generated intermediate hash is aligned and merged with result for GHASH 8k+0
# EXT low_acc, low_acc, low_acc, #8
# EOR res_curr (8k+0), res_curr (8k+0), low_acc
58#
59# CTR block:
60#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
61# REV     ctr32, rev_ctr32
62# ORR     ctr64, constctr96_top32, ctr32, LSL #32
63# INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
64# INS     ctr_next.d[1], ctr64X
65# ADD     rev_ctr32, #1
66#
67# AES block:
#      Do AES encryption/decryption on CTR block X and EOR it with input block X. A 256-bit key (round keys k0..k14)
#      is used as the example below.
#      Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
#      to SIMD registers. Given we are very constrained in our ASIMD registers this is quite important.
71#
72#      Encrypt:
73# LDR     input_low, [ input_ptr  ], #8
74# LDR     input_high, [ input_ptr  ], #8
75# EOR     input_low, k14_low
76# EOR     input_high, k14_high
77# INS     res_curr.d[0], input_low
78# INS     res_curr.d[1], input_high
79# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
80# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
81# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
82# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
83# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
84# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
85# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
86# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
87# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
88# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
89# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
90# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
91# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
92# AESE    ctr_curr, k13
93# EOR     res_curr, res_curr, ctr_curr
94# ST1     { res_curr.16b  }, [ output_ptr  ], #16
95#
96#     Decrypt:
97# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
98# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
99# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
100# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
101# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
102# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
103# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
104# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
105# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
106# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
107# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
108# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
109# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
110# AESE    ctr_curr, k13
111# LDR     res_curr, [ input_ptr  ], #16
112# EOR     res_curr, res_curr, ctr_curr
113# MOV     output_low, res_curr.d[0]
114# MOV     output_high, res_curr.d[1]
115# EOR     output_low, k14_low
116# EOR     output_high, k14_high
117# STP     output_low, output_high, [ output_ptr  ], #16
118
119# GHASH block X:
120#     Do 128b karatsuba polynomial multiplication on block
#     We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
#     multiplication:
124#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
125#
126#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
127#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
128#
129#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
130#     multiplying with "twisted" powers of H
131#
132# Note: We can PMULL directly into the acc_x in first GHASH of the loop
133# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
134#       path latency dominates the performance
135#
136#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
137#       than indicated here
138# REV64   res_curr, res_curr
139# INS     t_m.d[0], res_curr.d[1]
140# EOR     t_m.8B, t_m.8B, res_curr.8B
141# PMULL2  t_h, res_curr, HX
142# PMULL   t_l, res_curr, HX
143# PMULL   t_m, t_m, HX_k
144# EOR     acc_h, acc_h, t_h
145# EOR     acc_l, acc_l, t_l
146# EOR     acc_m, acc_m, t_m
147#
148# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
149#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
150#         with a reversed constant
151# EOR3    acc_m, acc_m, acc_l, acc_h                     // Finish off karatsuba processing
152# PMULL   t_mod, acc_h, mod_constant
153# EXT     acc_h, acc_h, acc_h, #8
154# EOR3     acc_m, acc_m, t_mod, acc_h
155# PMULL   acc_h, acc_m, mod_constant
156# EXT     acc_m, acc_m, acc_m, #8
157# EOR3    acc_l, acc_l, acc_m, acc_h
158
# Command-line handling: an optional output file name is taken from the end of
# @ARGV when the last argument ends in ".<word>", and an optional assembler
# flavour is taken from the front when the first argument contains no dot.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl: first in the directory this script was invoked from,
# then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# An undefined flavour also fails this test, so a flavour is mandatory.
die "only for 64 bit" if $flavour !~ /64/;

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The output name is quoted so a path containing spaces reaches the translator
# as one argument; it is appended only when one was given, so the translator
# sees the same argument list as before otherwise.  A failure to start the
# pipe is fatal instead of being silently ignored.
my $xlate_cmd = "\"$^X\" $xlate $flavour";
$xlate_cmd .= " \"$output\"" if defined $output;
open OUT,"| $xlate_cmd"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
171
# Start of the generated assembly text: include arm_arch.h, open the
# __ARM_MAX_ARCH__ >= 8 guard, then select the architecture and the text
# section.  The trailing empty element supplies the final newline.
$code = join("\n",
    '#include "arm_arch.h"',
    '',
    '#if __ARM_MAX_ARCH__>=8',
    '.arch   armv8.2-a+crypto',
    '.text',
    '');
178
# General-purpose register names interpolated into the generated assembly.
# Argument block: x0..x3 in declaration order.
($input_ptr, $bit_length, $output_ptr, $current_tag) = map { "x$_" } 0 .. 3;
# Working registers.  The kernel prologue copies x4 into $counter and x5 into
# $cc, and points $modulo_constant at a constant it stores on the stack.
($counter, $constant_temp, $modulo_constant, $cc) = ("x16", "x15", "x10", "x8");
187{
# Scalar scratch registers: x4..x7 and x13..x14.
my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x) = map { "x$_" } 4..7;
my ($temp2_x,$temp3_x) = map { "x$_" } 13..14;

# Counter blocks live in v0..v7 and result blocks in v8..v15.  Each register
# is named in several views: ".16b" vector, bare "vN", and the "dN"/"qN"
# scalar forms.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,
    $res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b) = map { "v$_.16b" } 0..15;
my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,
    $res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7) = map { "v$_" } 0..15;
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d) = map { "d$_" } 0..7;
my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q) = map { "q$_" } 0..7;
my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q) = map { "q$_" } 8..15;

# Temporaries for freshly loaded input blocks; these alias the result
# registers v8..v15.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7) = map { "v$_" } 8..15;
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b) = map { "v$_.16b" } 8..15;
my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q) = map { "q$_" } 8..15;

# GHASH accumulators (high, middle, low) in v17..v19.
my ($acc_hb,$acc_mb,$acc_lb) = map { "v$_.16b" } 17..19;
my ($acc_h,$acc_m,$acc_l) = map { "v$_" } 17..19;

# Hash key powers.  The h1..h4 group and the h5..h8 group name the SAME six
# registers v20..v25, so only one group can be live at a time.
my ($h1,$h12k,$h2,$h3,$h34k,$h4,
    $h5,$h56k,$h6,$h7,$h78k,$h8) = map { "v$_" } ((20..25) x 2);
my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q,
    $h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q) = map { "q$_" } ((20..25) x 2);

# GHASH temporaries.  Only v16, v29 and v21 are named directly; the rest reuse
# result registers or earlier temporaries.  Note that $t11 (v21) is the same
# register as $h12k/$h56k.
my ($t0,$t0d) = ("v16","d16");
my $t1 = "v29";
my ($t2,$t3) = ($res1,$t1);
my ($t4,$t5,$t6) = ($res0,$res2,$t0);
my ($t7,$t8,$t9) = ($res3,$res4,$res5);
my ($t10,$t11,$t12) = ($res6,"v21",$t1);

# Reversed counter and its per-block increment.
my ($rtmp_ctr,$rtmp_ctrq) = ("v30","q30");
my ($rctr_inc,$rctr_incd) = ("v31","d31");

# The modulo reduction constant shares v16/d16 with $t0.
my ($mod_constantd,$mod_constant) = ($t0d,$t0);

# Round keys rk0..rk14 rotate through just three registers, v26..v28, so
# rk(3n), rk(3n+1) and rk(3n+2) map to v26, v27 and v28 respectively.
my ($rk0,$rk1,$rk2, $rk3,$rk4,$rk5, $rk6,$rk7,$rk8,
    $rk9,$rk10,$rk11, $rk12,$rk13,$rk14) = map { "v$_.16b" } ((26..28) x 5);
my ($rk0q,$rk1q,$rk2q, $rk3q,$rk4q,$rk5q, $rk6q,$rk7q,$rk8q,
    $rk9q,$rk10q,$rk11q, $rk12q,$rk13q,$rk14q) = map { "q$_" } ((26..28) x 5);
# Extra views of individual round-key registers.
my ($rk2q1,$rk3q1,$rk4v) = ("v28.1q","v26.1q","v27");
249
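#########################################################################################
# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
#                               size_t bit_len,
#                               unsigned char *out,
#                               u64 *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
# Argument order follows the register usage in the prologue: x0 = in, x1 = length in bits
# (shifted right by 3 to obtain the byte length), x2 = out, x3 = Xi (current tag, with the
# hash key powers loaded from offsets relative to it), x4 = ivec (initial counter block),
# x5 = key (round keys).
#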
258$code.=<<___;
259.global unroll8_eor3_aes_gcm_enc_128_kernel
260.type   unroll8_eor3_aes_gcm_enc_128_kernel,%function
261.align  4
262unroll8_eor3_aes_gcm_enc_128_kernel:
263	AARCH64_VALID_CALL_TARGET
264	cbz	x1, .L128_enc_ret
265	stp	d8, d9, [sp, #-80]!
266	mov	$counter, x4
267	mov	$cc, x5
268	stp	d10, d11, [sp, #16]
269	stp	d12, d13, [sp, #32]
270	stp	d14, d15, [sp, #48]
271	mov	x5, #0xc200000000000000
272	stp	x5, xzr, [sp, #64]
273	add	$modulo_constant, sp, #64
274
275	mov	$constant_temp, #0x100000000				@ set up counter increment
276	movi	$rctr_inc.16b, #0x0
277	mov	$rctr_inc.d[1], $constant_temp
278	lsr	$main_end_input_ptr, $bit_length, #3		  	@ byte_len
279	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
280
281	sub	$main_end_input_ptr, $main_end_input_ptr, #1	 	@ byte_len - 1
282
283	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80		@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
284
285	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
286
287	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
288
289	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
290	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
291
292	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
293	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
294
295	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
296	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
297
298	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
299	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
300
301	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
302	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
303	ldp	$rk0q, $rk1q, [$cc, #0]				  	@ load rk0, rk1
304
305	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
306	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
307
308	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
309	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
310
311	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
312	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
313	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
314
315	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
316	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
317	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
318
319	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
320	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
321	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
322
323	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
324
325	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
326	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
327	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
328
329	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
330	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
331	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
332
333	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
334	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
335	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
336
337	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
338	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
339	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
340
341	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
342	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
343	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
344
345	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
346
347	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
348	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
349	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
350
351	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
352	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
353	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
354
355	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
356
357	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
358	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
359	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
360
361	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
362	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
363	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
364
365	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
366	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
367	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
368
369	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
370	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
371	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
372
373	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
374	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
375	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
376
377	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
378	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
379	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
380
381	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
382	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
383	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
384
385	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
386	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
387	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
388
389	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
390	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
391	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
392
393	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
394
395	ld1	{ $acc_lb}, [$current_tag]
396	ext	$acc_lb, $acc_lb, $acc_lb, #8
397	rev64	$acc_lb, $acc_lb
398
399	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
400
401	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
402	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
403	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
404
405	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
406	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
407	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
408
409	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
410	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
411	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
412
413	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
414	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
415	ldr	$rk10q, [$cc, #160]					@ load rk10
416
417	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
418	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
419	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
420
421	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
422	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
423	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
424
425	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
426	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
427	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
428
429	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
430	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
431	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
432
433	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
434	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
435	b.ge	.L128_enc_tail						@ handle tail
436
437	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
438
439	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
440
441	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
442
443	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
444	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
445
446	eor3	$res0b, $ctr_t0b, $ctr0b, $rk10				@ AES block 0 - result
447	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
448	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
449
450	eor3	$res1b, $ctr_t1b, $ctr1b, $rk10				@ AES block 1 - result
451	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
452
453	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
454	eor3	$res5b, $ctr_t5b, $ctr5b, $rk10				@ AES block 5 - result
455	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
456
457	eor3	$res2b, $ctr_t2b, $ctr2b, $rk10				@ AES block 2 - result
458	eor3	$res6b, $ctr_t6b, $ctr6b, $rk10				@ AES block 6 - result
459	eor3	$res4b, $ctr_t4b, $ctr4b, $rk10				@ AES block 4 - result
460
461	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
462	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
463
464	eor3	$res3b, $ctr_t3b, $ctr3b, $rk10				@ AES block 3 - result
465	eor3	$res7b, $ctr_t7b, $ctr7b,$rk10				@ AES block 7 - result
466	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
467
468	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
469	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
470	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
471
472	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
473
474	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
475	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
476	b.ge	.L128_enc_prepretail					@ do prepretail
477
478.L128_enc_main_loop:							@ main loop start
479	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
480	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
481	ext     $h5.16b, $h5.16b, $h5.16b, #8
482	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
483	ext     $h6.16b, $h6.16b, $h6.16b, #8
484	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
485
486	rev64	$res1b, $res1b						@ GHASH block 8k+1
487	rev64	$res0b, $res0b						@ GHASH block 8k
488	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
489	ext     $h7.16b, $h7.16b, $h7.16b, #8
490	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
491	ext     $h8.16b, $h8.16b, $h8.16b, #8
492
493	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
494	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
495	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
496
497	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
498	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
499	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
500	rev64	$res3b, $res3b						@ GHASH block 8k+3
501
502	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
503	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
504	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
505
506	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
507
508	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
509	rev64	$res2b, $res2b						@ GHASH block 8k+2
510	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
511
512	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
513	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
514	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
515
516	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
517	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
518	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
519
520	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
521	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
522	ext     $h3.16b, $h3.16b, $h3.16b, #8
523	ldr	$h4q, [$current_tag, #112]				@ load h3l | h3h
524	ext     $h4.16b, $h4.16b, $h4.16b, #8
525	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
526
527	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
528	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
529	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
530
531	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
532	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
533	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
534
535	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
536	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
537	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
538
539	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
540	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
541	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
542
543	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
544	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
545	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
546
547	eor3	$acc_hb, $acc_hb, $t1.16b,$t2.16b			@ GHASH block 8k+2, 8k+3 - high
548	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
549	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
550
551	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
552	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
553	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
554
555	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
556	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
557	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
558
559	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
560	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
561	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
562
563	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
564	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
565
566	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
567	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
568	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
569
570	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
571	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
572	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
573
574	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
575	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
576	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
577
578	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
579	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
580	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
581
582	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
583	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
584	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
585	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
586
587	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
588	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
589	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
590
591	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
592	ext     $h1.16b, $h1.16b, $h1.16b, #8
593	ldr	$h2q, [$current_tag, #64]				@ load h1l | h1h
594	ext     $h2.16b, $h2.16b, $h2.16b, #8
595	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
596	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
597
598	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
599	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
600
601	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
602	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
603
604	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
605	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
606
607	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
608	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
609
610	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
611	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
612	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
613
614	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
615	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
616	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
617
618	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
619	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
620	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
621
622	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
623	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
624	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
625
626	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
627	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
628	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
629
630	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
631	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
632	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
633
634	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
635	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
636	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
637
638	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
639	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
640
641	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
642	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
643	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
644
645	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
646	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
647	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
648
649	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
650	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
651	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
652
653	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
654	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
655
656	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
657	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
658	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
659
660	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
661	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
662	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
663
664	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
665	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
666	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
667
668	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
669	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
670	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
671
672	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
673	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
674	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
675
676	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
677	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
678	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
679
680	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
681	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
682	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
683
684	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
685	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
686	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
687
688	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
689	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
690
691	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
692	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load plaintext
693	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
694
695	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
696	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
697	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
698
699	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
700	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
701	ldr	$rk10q, [$cc, #160]					@ load rk10
702
703	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
704	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
705	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
706	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
707
708	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
709	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
710	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
711
712	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
713	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
714	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
715
716	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load plaintext
717	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
718	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
719
720	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
721	eor3	$res4b, $ctr_t4b, $ctr4b, $rk10				@ AES block 4 - result
722	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
723
724	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
725	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
726
727	eor3	$res2b, $ctr_t2b, $ctr2b, $rk10				@ AES block 8k+10 - result
728
729	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
730	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
731
732	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
733	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
734
735	eor3	$res7b, $ctr_t7b, $ctr7b, $rk10				@ AES block 7 - result
736	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
737	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
738
739	eor3	$res1b, $ctr_t1b, $ctr1b, $rk10				@ AES block 8k+9 - result
740	eor3	$res3b, $ctr_t3b, $ctr3b, $rk10				@ AES block 8k+11 - result
741	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
742
743	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
744	eor3	$res5b, $ctr_t5b, $ctr5b, $rk10				@ AES block 5 - result
745	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
746
747	eor3	$res0b, $ctr_t0b, $ctr0b, $rk10				@ AES block 8k+8 - result
748	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
749	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
750
751	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
752	eor3	$res6b, $ctr_t6b, $ctr6b, $rk10				@ AES block 6 - result
753
754	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
755	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
756
757	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
758	b.lt	.L128_enc_main_loop
759
760.L128_enc_prepretail:							@ PREPRETAIL
	@ Entered when the main loop finishes. Folds the eight blocks held in
	@ res0-res7 into the GHASH accumulators (acc_h, acc_m, acc_l), reduces the
	@ result with the modulo constant, and interleaved with that runs AES
	@ rounds 0-9 on the counter blocks ctr0-ctr7 (ctr5-ctr7 are generated here).
	@ No input is loaded and no output is stored in this section; the last
	@ round key (rk10) is only loaded here and is applied by the tail code.
761	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
762	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
763	ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
764	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
765	ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8
766	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
767
768	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
769	ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
770	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
771	ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
772	rev64	$res0b, $res0b						@ GHASH block 8k
773	rev64	$res1b, $res1b						@ GHASH block 8k+1
774
775	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
776	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
777	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
778	rev64	$res3b, $res3b						@ GHASH block 8k+3
779
780	rev64	$res2b, $res2b						@ GHASH block 8k+2
781	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
782
783	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
784
785	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
786	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
787	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
788
789	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
790	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
791
792	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
793	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
794	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
795
796	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
797	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
798
799	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
800	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
801
802	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
803	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
804
805	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
806	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
807
808	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
809
810	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
811
812	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
813
814	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
815
816	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
817	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
818
819	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
820	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
821
822	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
823	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
824
825	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
826	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
827	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
828
829	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
830	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
831
832	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
833	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
834	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
835
836	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
837	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
838
839	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
840	ext     $h3.16b, $h3.16b, $h3.16b, #8				@ swap the 64-bit halves of h3
841	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
842	ext     $h4.16b, $h4.16b, $h4.16b, #8				@ swap the 64-bit halves of h4
843
844	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
845	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
846	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
847
848	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
849	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
850
851	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
852	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
853
854	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
855	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
856	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
857	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
858
859	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
860	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
861
862	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
863	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
864	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
865
866	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
867	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
868	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
869
870	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
871	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
872
873	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
874	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
875	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
876
877	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
878	ext     $h1.16b, $h1.16b, $h1.16b, #8				@ swap the 64-bit halves of h1
879	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
880	ext     $h2.16b, $h2.16b, $h2.16b, #8				@ swap the 64-bit halves of h2
881	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
882	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
883
884	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
885	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
886	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
887
888	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
889	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
890	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
891
892	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
893	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
894
895	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
896	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
897	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
898
899	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
900	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
901	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
902
903	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
904	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
905	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
906
907	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
908	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
909	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
910
911	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
912	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
913	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
914
915	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
916	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
917	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
918
919	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
920	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
921
922	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
923	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
924	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
925
926	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
927	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
928	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
929
930	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
931	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
932	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
933
934	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
935	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
936	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
937
938	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
939	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
940
941	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
942	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
943	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
944
945	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
946	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
947
948	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
949	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
950	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
951
952	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
953	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
954	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
955
956	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
957	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
958	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
959
960	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
961	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
962	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
963
964	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
965	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
966	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
967	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
968
969	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
970	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
971	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
972
973	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
974	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
975
976	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
977	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
978
979	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
980	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
981	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
982	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
983
984	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
985	eor3	$acc_lb, $acc_lb, $acc_hb, $acc_mb		 	@ MODULO - fold into low
986	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
987
988	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
989	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
990	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
991
992	ldr	$rk10q, [$cc, #160]					@ load rk10
993	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
994	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
995
996	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
997	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
998
999	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
1000	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
1001
1002	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
1003	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
1004.L128_enc_tail:								@ TAIL
	@ Tail dispatch. Computes the number of bytes still to process, produces
	@ the first remaining output block (input ^ ctr0 ^ last round key),
	@ reloads the hash key values used by the tail, and then selects an entry
	@ point according to how many bytes are left. Every time a branch is not
	@ taken, the pending keystream blocks are moved up one register so that
	@ the section finally entered finds the next unused keystream block
	@ (originally ctr1) in the register it reads, and the reversed counter is
	@ stepped back by one increment.
1005
1006	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process
1007	ldr	$ctr_t0q, [$input_ptr], #16				@ AES block 8k+8 - load plaintext
1008
1009	mov	$t1.16b, $rk10						@ t1 carries the last round key for every eor3 in the tail
1010	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h and h6k | h5k
1011	ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
1012
1013	eor3	$res1b, $ctr_t0b, $ctr0b, $t1.16b			@ AES block 8k+8 - result
1014	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
1015	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h and h7l | h7h
1016	ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
1017	ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
1018
1019	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k and h8l | h8h
1020	ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8
1021	cmp	$main_end_input_ptr, #112				@ more than 7 blocks (112 bytes) left?
1022	b.gt	.L128_enc_blocks_more_than_7
1023
1024	mov	$ctr7b, $ctr6b						@ at most 7 blocks left: move keystream up one register
1025	mov	$ctr6b, $ctr5b
1026	movi	$acc_h.8b, #0						@ clear the high accumulator
1027
1028	cmp	$main_end_input_ptr, #96				@ more than 6 blocks (96 bytes) left?
1029	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1030	mov	$ctr5b, $ctr4b
1031
1032	mov	$ctr4b, $ctr3b
1033	mov	$ctr3b, $ctr2b
1034	mov	$ctr2b, $ctr1b
1035
1036	movi	$acc_l.8b, #0						@ clear the low accumulator
1037	movi	$acc_m.8b, #0						@ clear the mid accumulator
1038	b.gt	.L128_enc_blocks_more_than_6
1039
1040	mov	$ctr7b, $ctr6b						@ at most 6 blocks left: move keystream up again
1041	cmp	$main_end_input_ptr, #80				@ more than 5 blocks (80 bytes) left?
1042
1043	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1044	mov	$ctr6b, $ctr5b
1045	mov	$ctr5b, $ctr4b
1046
1047	mov	$ctr4b, $ctr3b
1048	mov	$ctr3b, $ctr1b
1049	b.gt	.L128_enc_blocks_more_than_5
1050
1051	cmp	$main_end_input_ptr, #64				@ more than 4 blocks (64 bytes) left?
1052	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1053
1054	mov	$ctr7b, $ctr6b
1055	mov	$ctr6b, $ctr5b
1056
1057	mov	$ctr5b, $ctr4b
1058	mov	$ctr4b, $ctr1b
1059	b.gt	.L128_enc_blocks_more_than_4
1060
1061	mov	$ctr7b, $ctr6b
1062	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1063	mov	$ctr6b, $ctr5b
1064
1065	mov	$ctr5b, $ctr1b
1066	cmp	$main_end_input_ptr, #48				@ more than 3 blocks (48 bytes) left?
1067	b.gt	.L128_enc_blocks_more_than_3
1068
1069	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1070	mov	$ctr7b, $ctr6b
1071	mov	$ctr6b, $ctr1b
1072
1073	cmp	$main_end_input_ptr, #32				@ more than 2 blocks (32 bytes) left?
1074	ldr	$h34kq, [$current_tag, #96]					@ load h4k | h3k
1075	b.gt	.L128_enc_blocks_more_than_2
1076
1077	cmp	$main_end_input_ptr, #16				@ more than 1 block (16 bytes) left?
1078
1079	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1080	mov	$ctr7b, $ctr1b
1081	b.gt	.L128_enc_blocks_more_than_1
1082
1083	ldr	$h12kq, [$current_tag, #48]					@ load h2k | h1k
1084	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one block
1085	b	 .L128_enc_blocks_less_than_1
1086.L128_enc_blocks_more_than_7:						@ blocks left >  7
	@ Stores the output block held in res1b, multiplies it (after adding the
	@ running tag from t0) by h8 with h78k for the middle term, and writes the
	@ products straight into acc_h, acc_m and acc_l, which initialises the
	@ accumulators. The next input block is loaded and encrypted with ctr1.
1087	st1	{ $res1b}, [$output_ptr], #16				@ AES final-7 block  - store result
1088
1089	rev64	$res0b, $res1b						@ GHASH final-7 block
1090	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-6 block - load plaintext
1091
1092	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1093
1094	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
1095
1096	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
1097
1098	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
1099
1100	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
1101	movi	$t0.8b, #0						@ suppress further partial tag feed in
1102
1103	eor3	$res1b, $ctr_t1b, $ctr1b, $t1.16b			@ AES final-6 block - result
1104
1105	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d				@ GHASH final-7 block - mid
1106	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
1107.L128_enc_blocks_more_than_6:						@ blocks left >  6
	@ Stores the output block held in res1b, multiplies it (plus the running
	@ tag in t0, if not yet consumed) by h7 with the low half of h78k for the
	@ middle term, and XORs the products into the accumulators. The next input
	@ block is loaded and encrypted with ctr2. rk2, rk3 and rk4v are used as
	@ scratch registers here.
1108
1109	st1	{ $res1b}, [$output_ptr], #16				@ AES final-6 block - store result
1110
1111	rev64	$res0b, $res1b						@ GHASH final-6 block
1112	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-5 block - load plaintext
1113
1114	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1115
1116	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
1117
1118	eor3	$res1b, $ctr_t1b, $ctr2b, $t1.16b			@ AES final-5 block - result
1119	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
1120
1121	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
1122	movi	$t0.8b, #0						@ suppress further partial tag feed in
1123
1124	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
1125	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
1126
1127	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
1128
1129	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
1130	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
1131.L128_enc_blocks_more_than_5:						@ blocks left >  5
	@ Stores the output block held in res1b, multiplies it (plus the running
	@ tag in t0, if not yet consumed) by h6 with the high half of h56k for the
	@ middle term, and XORs the products into the accumulators. The next input
	@ block is loaded and encrypted with ctr3.
1132
1133	st1	{ $res1b}, [$output_ptr], #16				@ AES final-5 block - store result
1134
1135	rev64	$res0b, $res1b						@ GHASH final-5 block
1136
1137	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1138
1139	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
1140	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-4 block - load plaintext
1141	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
1142
1143	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
1144
1145	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
1146
1147	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
1148
1149	eor3	$res1b, $ctr_t1b, $ctr3b, $t1.16b			@ AES final-4 block - result
1150	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
1151	movi	$t0.8b, #0						@ suppress further partial tag feed in
1152
1153	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
1154	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
1155
1156	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
1157.L128_enc_blocks_more_than_4:						@ blocks left >  4
	@ Stores the output block held in res1b, multiplies it (plus the running
	@ tag in t0, if not yet consumed) by h5 with the low half of h56k for the
	@ middle term, and XORs the products into the accumulators. The next input
	@ block is loaded and encrypted with ctr4.
1158
1159	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-4 block - store result
1160
1161	rev64	$res0b, $res1b						@ GHASH final-4 block
1162
1163	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-3 block - load plaintext
1164
1165	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1166
1167	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
1168	movi	$t0.8b, #0						@ suppress further partial tag feed in
1169	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
1170
1171	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
1172
1173	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
1174
1175	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
1176	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
1177
1178	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
1179
1180	eor3	$res1b, $ctr_t1b, $ctr4b, $t1.16b			@ AES final-3 block - result
1181	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
1182.L128_enc_blocks_more_than_3:						@ blocks left >  3
	@ Stores the output block held in res1b, loads h4 and h34k, multiplies the
	@ block (plus the running tag in t0, if not yet consumed) by h4 with the
	@ high half of h34k for the middle term, and XORs the products into the
	@ accumulators. The next input block is loaded and encrypted with ctr5.
1183
1184	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-3 block - store result
1185
1186	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
1187	ext     $h4.16b, $h4.16b, $h4.16b, #8				@ swap the 64-bit halves of h4
1188
1189	rev64	$res0b, $res1b						@ GHASH final-3 block
1190
1191	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1192	movi	$t0.8b, #0						@ suppress further partial tag feed in
1193
1194	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
1195	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
1196	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
1197
1198	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-2 block - load plaintext
1199
1200	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
1201
1202	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
1203	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
1204
1205	eor3	$res1b, $ctr_t1b, $ctr5b, $t1.16b			@ AES final-2 block - result
1206
1207	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
1208	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
1209
1210	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
1211	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
1212.L128_enc_blocks_more_than_2:						@ blocks left >  2
	@ Stores the output block held in res1b, loads h3, multiplies the block
	@ (plus the running tag in t0, if not yet consumed) by h3 with the low
	@ half of h34k for the middle term, and XORs the products into the
	@ accumulators. h34k must already be loaded on entry. The next input
	@ block is loaded and encrypted with ctr6.
1213
1214	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-2 block - store result
1215
1216	rev64	$res0b, $res1b						@ GHASH final-2 block
1217
1218	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1219
1220	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-1 block - load plaintext
1221
1222	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
1223	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
1224	ext     $h3.16b, $h3.16b, $h3.16b, #8				@ swap the 64-bit halves of h3
1225	movi	$t0.8b, #0						@ suppress further partial tag feed in
1226
1227	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
1228	eor3	$res1b, $ctr_t1b, $ctr6b, $t1.16b			@ AES final-1 block - result
1229
1230	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
1231
1232	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
1233	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
1234
1235	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
1236
1237	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
1238	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
1239.L128_enc_blocks_more_than_1:						@ blocks left >  1
	@ Stores the output block held in res1b, loads h2 and h12k, multiplies the
	@ block (plus the running tag in t0, if not yet consumed) by h2 with the
	@ high half of h12k for the middle term, and XORs the products into the
	@ accumulators. The last input block is loaded and encrypted with ctr7.
1240
1241	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-1 block - store result
1242
1243	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
1244	ext     $h2.16b, $h2.16b, $h2.16b, #8				@ swap the 64-bit halves of h2
1245	rev64	$res0b, $res1b						@ GHASH final-1 block
1246	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final block - load plaintext
1247
1248	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1249
1250	movi	$t0.8b, #0						@ suppress further partial tag feed in
1251	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
1252	eor3	$res1b, $ctr_t1b, $ctr7b, $t1.16b			@ AES final block - result
1253
1254	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
1255
1256	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
1257
1258	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
1259
1260	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
1261
1262	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
1263	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
1264
1265	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
1266
1267	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
1268	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
1269.L128_enc_blocks_less_than_1:						@ blocks left <= 1
	@ Final (possibly partial) block; res1b holds its ciphertext on entry.
	@ Writes the updated counter back, masks the block down to the valid bits,
	@ merges it with the bytes already present at the output location, stores
	@ it, folds it into GHASH with h1 (low half of h12k for the middle term),
	@ performs the final reduction, stores the tag and returns.
	@
	@ The number of padding bits is computed in main_end_input_ptr, which is no
	@ longer needed as a byte count once the tail dispatch has been done. This
	@ keeps bit_length intact, so the value returned in x0 is the input length
	@ in bytes. Previously bit_length itself was overwritten by the padding
	@ computation and the return value was derived from that leftover.
	@ NOTE(review): this relies on main_end_input_ptr being a separate register
	@ from the temps and pointers used below - confirm against the variable map.
1270
1271	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
1272	str	$rtmp_ctrq, [$counter]					@ store the updated counter
1273	and	$main_end_input_ptr, $bit_length, #127			@ pad = bit_length % 128
1274
1275	sub	$main_end_input_ptr, $main_end_input_ptr, #128		@ pad -= 128
1276
1277	neg	$main_end_input_ptr, $main_end_input_ptr		@ pad = 128 - #bits in input (in range [1,128])
1278
1279	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
1280	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
1281	and	$main_end_input_ptr, $main_end_input_ptr, #127		@ pad %= 128
1282
1283	lsr	$temp0_x, $temp0_x, $main_end_input_ptr			@ temp0_x is mask for top 64b of last block
1284	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
1285	cmp	$main_end_input_ptr, #64
1286
1287	csel	$temp2_x, $temp1_x, $temp0_x, lt			@ low 64b mask: all ones if pad < 64, else the shifted mask
1288	csel	$temp3_x, $temp0_x, xzr, lt				@ high 64b mask: the shifted mask if pad < 64, else zero
1289
1290	mov	$ctr0.d[1], $temp3_x
1291	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
1292
1293	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
1294
1295	rev64	$res0b, $res1b						@ GHASH final block
1296
1297	bif	$res1b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
1298	st1	{ $res1b}, [$output_ptr]				@ store all 16B
1299
1300	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1301
1302	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
1303
1304	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
1305	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
1306	ext	$h1.16b, $h1.16b, $h1.16b, #8				@ swap the 64-bit halves of h1
1307
1308	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
1309
1310	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
1311	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
1312	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
1313
1314	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
1315
1316	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
1317
1318	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
1319
1320	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
1321	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		  	@ MODULO - top 64b align with mid
1322
1323	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		  	@ MODULO - karatsuba tidy up
1324
1325	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b		 	@ MODULO - fold into mid
1326
1327	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
1328	ext	$t11.16b, $acc_mb, $acc_mb, #8			  	@ MODULO - other mid alignment
1329
1330	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		  	@ MODULO - fold into low
1331	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ swap the 64-bit halves of the tag
1332	rev64	$acc_lb, $acc_lb					@ byte-reverse each half before storing
1333	st1	{ $acc_l.16b }, [$current_tag]				@ store the updated tag
1334	lsr	x0, $bit_length, #3					@ return sizes: input length in bytes
1335
1336	ldp	d10, d11, [sp, #16]					@ restore the saved d8-d15 registers
1337	ldp	d12, d13, [sp, #32]
1338	ldp	d14, d15, [sp, #48]
1339	ldp	d8, d9, [sp], #80					@ and release the 80-byte stack frame
1340	ret
1341
1342.L128_enc_ret:
	@ Exit path that returns 0 without touching the stack.
	@ NOTE(review): presumably reached from a zero-length check at function
	@ entry, before any registers are saved - the branch is not visible here.
1343	mov w0, #0x0							@ return 0 (a write to w0 also clears the upper 32 bits of x0)
1344	ret
1345.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1346___
1347
1348#########################################################################################
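1349# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
1350#                               size_t len,
1351#                               unsigned char *out,
1352#                               u64 *Xi,
1353#                               unsigned char ivec[16],
1354#                               const void *key);
1355#
# Note: the kernel shifts the length argument right by 3 to obtain the byte
# count, i.e. len is consumed as a number of bits, not bytes.
#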
1356$code.=<<___;
1357.global unroll8_eor3_aes_gcm_dec_128_kernel
1358.type   unroll8_eor3_aes_gcm_dec_128_kernel,%function
1359.align  4
1360unroll8_eor3_aes_gcm_dec_128_kernel:
1361	AARCH64_VALID_CALL_TARGET
1362	cbz	x1, .L128_dec_ret
1363	stp	d8, d9, [sp, #-80]!
1364	mov	$counter, x4
1365	mov	$cc, x5
1366	stp	d10, d11, [sp, #16]
1367	stp	d12, d13, [sp, #32]
1368	stp	d14, d15, [sp, #48]
1369	mov	x5, #0xc200000000000000
1370	stp	x5, xzr, [sp, #64]
1371	add	$modulo_constant, sp, #64
1372
1373	lsr	$main_end_input_ptr, $bit_length, #3		 	@ byte_len
1374	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
1375
1376	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
1377	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
1378
1379	mov	$constant_temp, #0x100000000				@ set up counter increment
1380	movi	$rctr_inc.16b, #0x0
1381	mov	$rctr_inc.d[1], $constant_temp
1382	ld1	{ $acc_lb}, [$current_tag]
1383	  ext	$acc_lb, $acc_lb, $acc_lb, #8
1384	rev64	$acc_lb, $acc_lb
1385
1386	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
1387
1388	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
1389
1390	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
1391
1392	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
1393	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
1394
1395	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1396
1397	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
1398	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
1399	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
1400
1401	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
1402	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
1403
1404	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
1405	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
1406
1407	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
1408	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
1409
1410	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
1411	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
1412
1413	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
1414
1415	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
1416	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
1417	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
1418
1419	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
1420	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
1421
1422	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
1423
1424	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
1425	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
1426
1427	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
1428
1429	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
1430
1431	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
1432	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
1433
1434	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
1435	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
1436
1437	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
1438	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
1439	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
1440
1441	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
1442	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
1443	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
1444
1445	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
1446	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
1447	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
1448
1449	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
1450	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
1451
1452	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
1453	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
1454
1455	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
1456	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
1457
1458	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
1459	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
1460
1461	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
1462	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
1463	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
1464
1465	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
1466	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
1467	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
1468
1469	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
1470	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
1471	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
1472
1473	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
1474	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
1475	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
1476
1477	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
1478	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
1479
1480	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
1481	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
1482
1483	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
1484
1485	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
1486	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
1487	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
1488
1489	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
1490	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
1491	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
1492
1493	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
1494	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
1495	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
1496
1497	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
1498	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
1499	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
1500
1501	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
1502	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
1503	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
1504
1505	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
1506	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
1507	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
1508
1509	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
1510	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
1511
1512	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
1513	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
1514
1515	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
1516	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
1517	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
1518
1519	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
1520	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
1521	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
1522
1523	aese	$ctr0b, $rk9						@ AES block 0 - round 9
1524	aese	$ctr1b, $rk9						@ AES block 1 - round 9
1525	aese	$ctr6b, $rk9						@ AES block 6 - round 9
1526
1527	ldr	$rk10q, [$cc, #160]					@ load rk10
1528	aese	$ctr4b, $rk9						@ AES block 4 - round 9
1529	aese	$ctr3b, $rk9						@ AES block 3 - round 9
1530
1531	aese	$ctr2b, $rk9						@ AES block 2 - round 9
1532	aese	$ctr5b, $rk9						@ AES block 5 - round 9
1533	aese	$ctr7b, $rk9						@ AES block 7 - round 9
1534
1535	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
1536	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
1537	b.ge	.L128_dec_tail						@ handle tail
1538
1539	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 0, 1 - load ciphertext
1540
1541	eor3	$ctr0b, $res0b, $ctr0b, $rk10				@ AES block 0 - result
1542	eor3	$ctr1b, $res1b, $ctr1b, $rk10				@ AES block 1 - result
1543	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 0, 1 - store result
1544
1545	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
1546	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
1547	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 2, 3 - load ciphertext
1548
1549	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 4, 5 - load ciphertext
1550
1551	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
1552	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
1553	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 6, 7 - load ciphertext
1554
1555	eor3	$ctr3b, $res3b, $ctr3b, $rk10				@ AES block 3 - result
1556	eor3	$ctr2b, $res2b, $ctr2b, $rk10				@ AES block 2 - result
1557	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 2, 3 - store result
1558
1559	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
1560	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
1561
1562	eor3	$ctr6b, $res6b, $ctr6b, $rk10				@ AES block 6 - result
1563
1564	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
1565	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
1566
1567	eor3	$ctr4b, $res4b, $ctr4b, $rk10				@ AES block 4 - result
1568	eor3	$ctr5b, $res5b, $ctr5b, $rk10				@ AES block 5 - result
1569	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 4, 5 - store result
1570
1571	eor3	$ctr7b, $res7b, $ctr7b, $rk10				@ AES block 7 - result
1572	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 6, 7 - store result
1573	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
1574
1575	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
1576	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
1577	b.ge	.L128_dec_prepretail					@ do prepretail
1578
1579.L128_dec_main_loop:							@ main loop start
1580	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
1581	ext     $h7.16b, $h7.16b, $h7.16b, #8
1582	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
1583	ext     $h8.16b, $h8.16b, $h8.16b, #8
1584
1585	rev64	$res1b, $res1b						@ GHASH block 8k+1
1586	rev64	$res0b, $res0b						@ GHASH block 8k
1587	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
1588
1589	rev64	$res6b, $res6b						@ GHASH block 8k+6
1590	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
1591	ext     $h5.16b, $h5.16b, $h5.16b, #8
1592	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
1593	ext     $h6.16b, $h6.16b, $h6.16b, #8
1594
1595	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
1596	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
1597	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
1598
1599	rev64	$res2b, $res2b						@ GHASH block 8k+2
1600	rev64	$res4b, $res4b						@ GHASH block 8k+4
1601	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
1602
1603	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
1604	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
1605	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
1606	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
1607
1608	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
1609	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
1610	rev64	$res3b, $res3b						@ GHASH block 8k+3
1611
1612	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
1613	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1614	rev64	$res5b, $res5b						@ GHASH block 8k+5
1615
1616	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
1617	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
1618	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1619
1620	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
1621	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
1622	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
1623
1624	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
1625	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
1626	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
1627
1628	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
1629	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
1630	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
1631
1632	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
1633	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
1634	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
1635
1636	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
1637	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
1638	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
1639
1640	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
1641	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1642	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
1643
1644	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
1645	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1646	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
1647
1648	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
1649	ext     $h3.16b, $h3.16b, $h3.16b, #8
1650	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
1651	ext     $h4.16b, $h4.16b, $h4.16b, #8
1652	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
1653	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
1654
1655	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
1656	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
1657	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
1658
1659	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
1660	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
1661	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
1662
1663	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
1664	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
1665	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
1666
1667	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
1668	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
1669	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
1670	ext     $h1.16b, $h1.16b, $h1.16b, #8
1671	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
1672	ext     $h2.16b, $h2.16b, $h2.16b, #8
1673
1674	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
1675	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
1676	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
1677
1678	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1679	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
1680	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
1681
1682	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
1683	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
1684	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
1685
1686	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
1687	rev64	$res7b, $res7b						@ GHASH block 8k+7
1688	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
1689
1690	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
1691	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
1692	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
1693
1694	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
1695	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
1696	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
1697	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1698
1699	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
1700	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
1701	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
1702
1703	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
1704	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
1705	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
1706
1707	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
1708	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
1709	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
1710
1711	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
1712	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
1713	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
1714
1715	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
1716	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1717	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
1718
1719	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
1720	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
1721	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
1722
1723	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
1724	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
1725	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1726
1727	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
1728	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
1729	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
1730
1731	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
1732	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
1733	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
1734
1735	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
1736	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
1737	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
1738
1739	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
1740	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
1741	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
1742
1743	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
1744	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b 			@ GHASH block 8k+4, 8k+5 - mid
1745	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
1746
1747	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
1748	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
1749	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
1750
1751	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
1752	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
1753	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
1754
1755	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
1756	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
1757	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
1758
1759	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
1760	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
1761	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
1762
1763	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
1764	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
1765	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
1766
1767	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
1768	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
1769	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
1770
1771	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
1772	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
1773	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
1774
1775	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
1776	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
1777	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
1778
1779	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
1780	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
1781	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
1782
1783	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
1784	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
1785	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
1786
1787	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
1788	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
1789	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
1790
1791	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
1792	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
1793	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load ciphertext
1794
1795	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load ciphertext
1796	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
1797	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
1798
1799	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load ciphertext
1800	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
1801	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
1802
1803	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load ciphertext
1804	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
1805	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
1806
1807	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
1808	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
1809	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
1810
1811	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
1812	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
1813	ldr	$rk10q, [$cc, #160]					@ load rk10
1814
1815	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
1816	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
1817	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
1818
1819	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
1820	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
1821	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
1822
1823	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
1824	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
1825
1826	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
1827	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
1828	eor3	$ctr1b, $res1b, $ctr1b, $rk10				@ AES block 8k+9 - result
1829
1830	eor3	$ctr0b, $res0b, $ctr0b, $rk10				@ AES block 8k+8 - result
1831	eor3	$ctr7b, $res7b, $ctr7b, $rk10				@ AES block 8k+15 - result
1832	eor3	$ctr6b, $res6b, $ctr6b, $rk10				@ AES block 8k+14 - result
1833
1834	eor3	$ctr2b, $res2b, $ctr2b, $rk10				@ AES block 8k+10 - result
1835	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
1836	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
1837
1838	eor3	$ctr4b, $res4b, $ctr4b, $rk10				@ AES block 8k+12 - result
1839	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
1840	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
1841
1842	eor3	$ctr3b, $res3b, $ctr3b, $rk10				@ AES block 8k+11 - result
1843	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
1844	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
1845
1846	eor3	$ctr5b, $res5b, $ctr5b, $rk10				@ AES block 8k+13 - result
1847	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
1848
1849	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
1850	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
1851	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
1852
1853	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
1854	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
1855	b.lt	.L128_dec_main_loop
1856
1857.L128_dec_prepretail:							@ PREPRETAIL
1858	rev64	$res3b, $res3b						@ GHASH block 8k+3
1859	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
1860	rev64	$res0b, $res0b						@ GHASH block 8k
1861
1862	rev64	$res2b, $res2b						@ GHASH block 8k+2
1863	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
1864	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
1865
1866	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
1867	ext     $h7.16b, $h7.16b, $h7.16b, #8
1868	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
1869	ext     $h8.16b, $h8.16b, $h8.16b, #8
1870	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
1871	rev64	$res1b, $res1b						@ GHASH block 8k+1
1872
1873	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
1874	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
1875	ext     $h5.16b, $h5.16b, $h5.16b, #8
1876	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
1877	ext     $h6.16b, $h6.16b, $h6.16b, #8
1878	rev64	$res5b, $res5b						@ GHASH block 8k+5
1879
1880	rev64	$res4b, $res4b						@ GHASH block 8k+4
1881
1882	rev64	$res6b, $res6b						@ GHASH block 8k+6
1883
1884	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
1885	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
1886	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
1887	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
1888
1889	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
1890	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
1891	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
1892
1893	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1894	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1895	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
1896
1897	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
1898	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
1899	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
1900
1901	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
1902	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
1903	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
1904
1905	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
1906	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
1907	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
1908
1909	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
1910	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1911	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1912
1913	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
1914	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
1915	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
1916
1917	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k - mid
1918	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
1919	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
1920
1921	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
1922	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
1923	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
1924
1925	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
1926	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
1927	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
1928
1929	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
1930	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
1931	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
1932
1933	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
1934	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
1935	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
1936
1937	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
1938	ext     $h3.16b, $h3.16b, $h3.16b, #8
1939	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
1940	ext     $h4.16b, $h4.16b, $h4.16b, #8
1941	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
1942	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
1943
1944	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
1945	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
1946	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
1947
1948	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
1949	ext     $h1.16b, $h1.16b, $h1.16b, #8
1950	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
1951	ext     $h2.16b, $h2.16b, $h2.16b, #8
1952	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
1953
1954	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
1955	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
1956	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
1957
1958	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
1959	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1960	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
1961
1962	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
1963	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
1964	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
1965
1966	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
1967	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
1968	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1969
1970	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
1971	rev64	$res7b, $res7b						@ GHASH block 8k+7
1972	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
1973
1974	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
1975	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
1976	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
1977	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
1978
1979	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
1980	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
1981	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1982
1983	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
1984	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
1985	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1986
1987	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
1988	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
1989	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
1990
1991	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
1992	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
1993	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
1994
1995	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
1996	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
1997	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
1998
1999	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
2000	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
2001	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
2002
2003	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
2004	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
2005	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
2006
2007	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
2008	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
2009	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
2010
2011	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
2012	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
2013	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
2014
2015	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
2016	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
2017	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
2018
2019	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
2020	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
2021	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
2022
2023	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
2024	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
2025	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
2026
2027	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
2028	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
2029	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
2030
2031	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
2032	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
2033	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
2034
2035	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
2036	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
2037	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
2038
2039	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
2040	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
2041	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
2042
2043	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
2044	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
2045	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
2046
2047	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
2048	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
2049	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
2050
2051	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
2052	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
2053	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
2054
2055	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
2056	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
2057	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
2058
2059	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
2060	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
2061	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
2062
2063	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
2064	ldr	$rk10q, [$cc, #160]					@ load rk10
2065
2066	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
2067	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
2068
2069	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
2070	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
2071	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
2072
2073	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
2074	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
2075	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
2076
2077	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
2078	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
2079	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
2080
2081	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
2082	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
2083	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
2084
2085	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
2086	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
2087	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
2088
2089	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
2090	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
2091	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
2092
2093.L128_dec_tail:								@ TAIL
2094
2095	mov	$t1.16b, $rk10						@ t1 = rk10, XORed into every tail result by eor3
2096	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process
2097
2098	cmp	$main_end_input_ptr, #112				@ more than 7 blocks (112 bytes) left ?
2099
2100	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k
2101	ext     $h8.16b, $h8.16b, $h8.16b, #8
2102	ldr	$res1q, [$input_ptr], #16				@ AES block 8k+8 - load ciphertext
2103
2104	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h
2105	ext     $h5.16b, $h5.16b, $h5.16b, #8
2106	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
2107
2108	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h
2109	ext     $h6.16b, $h6.16b, $h6.16b, #8
2110	ext     $h7.16b, $h7.16b, $h7.16b, #8
2111
2112	eor3	$res4b, $res1b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
2113	b.gt	.L128_dec_blocks_more_than_7
2114
2115	cmp	$main_end_input_ptr, #96				@ more than 6 blocks left ?
2116	mov	$ctr7b, $ctr6b						@ shift counter blocks up by one register
2117	movi	$acc_l.8b, #0
2118
2119	movi	$acc_h.8b, #0
2120	mov	$ctr6b, $ctr5b
2121	mov	$ctr5b, $ctr4b
2122
2123	mov	$ctr4b, $ctr3b
2124	mov	$ctr3b, $ctr2b
2125	mov	$ctr2b, $ctr1b
2126
2127	movi	$acc_m.8b, #0
2128	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step counter back by one block
2129	b.gt	.L128_dec_blocks_more_than_6
2130
2131	cmp	$main_end_input_ptr, #80				@ more than 5 blocks left ?
2132	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2133
2134	mov	$ctr7b, $ctr6b
2135	mov	$ctr6b, $ctr5b
2136	mov	$ctr5b, $ctr4b
2137
2138	mov	$ctr4b, $ctr3b
2139	mov	$ctr3b, $ctr1b
2140	b.gt	.L128_dec_blocks_more_than_5
2141
2142	cmp	$main_end_input_ptr, #64				@ more than 4 blocks left ?
2143
2144	mov	$ctr7b, $ctr6b
2145	mov	$ctr6b, $ctr5b
2146	mov	$ctr5b, $ctr4b
2147
2148	mov	$ctr4b, $ctr1b
2149	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2150	b.gt	.L128_dec_blocks_more_than_4
2151
2152	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2153	mov	$ctr7b, $ctr6b
2154	mov	$ctr6b, $ctr5b
2155
2156	mov	$ctr5b, $ctr1b
2157	cmp	$main_end_input_ptr, #48				@ more than 3 blocks left ?
2158	b.gt	.L128_dec_blocks_more_than_3
2159
2160	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2161	mov	$ctr7b, $ctr6b
2162	cmp	$main_end_input_ptr, #32				@ more than 2 blocks left ?
2163
2164	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
2165	mov	$ctr6b, $ctr1b
2166	b.gt	.L128_dec_blocks_more_than_2
2167
2168	cmp	$main_end_input_ptr, #16				@ more than 1 block left ?
2169
2170	mov	$ctr7b, $ctr1b
2171	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2172	b.gt	.L128_dec_blocks_more_than_1
2173
2174	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
2175	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
2176	b	 .L128_dec_blocks_less_than_1
2177.L128_dec_blocks_more_than_7:						@ blocks left >  7
2178	rev64	$res0b, $res1b						@ GHASH final-7 block
2179
2180	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2181
2182	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
2183
2184	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
2185	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
2186
2187	movi	$t0.8b, #0						@ supress further partial tag feed in
2188	ldr	$res1q, [$input_ptr], #16				@ AES final-6 block - load ciphertext
2189
2190	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
2191
2192	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
2193	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-7 block  - store result
2194	eor3	$res4b, $res1b, $ctr1b, $t1.16b				@ AES final-6 block - result
2195
2196	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
2197.L128_dec_blocks_more_than_6:						@ blocks left >  6
2198
2199	rev64	$res0b, $res1b						@ GHASH final-6 block
2200
2201	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2202
2203	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
2204
2205	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
2206
2207	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
2208	ldr	$res1q, [$input_ptr], #16				@ AES final-5 block - load ciphertext
2209	movi	$t0.8b, #0						@ supress further partial tag feed in
2210
2211	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
2212	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-6 block - store result
2213	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
2214
2215	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
2216	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
2217
2218	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
2219	eor3	$res4b, $res1b, $ctr2b, $t1.16b				@ AES final-5 block - result
2220.L128_dec_blocks_more_than_5:						@ blocks left >  5
2221
2222	rev64	$res0b, $res1b						@ GHASH final-5 block
2223
2224	ldr	$res1q, [$input_ptr], #16				@ AES final-4 block - load ciphertext
2225	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-5 block - store result
2226
2227	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2228
2229	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
2230
2231	eor3	$res4b, $res1b, $ctr3b, $t1.16b				@ AES final-4 block - result
2232
2233	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
2234
2235	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
2236	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
2237	movi	$t0.8b, #0						@ supress further partial tag feed in
2238
2239	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
2240	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
2241	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
2242
2243	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
2244	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
2245.L128_dec_blocks_more_than_4:						@ blocks left >  4
2246
2247	rev64	$res0b, $res1b						@ GHASH final-4 block
2248
2249	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2250	ldr	$res1q, [$input_ptr], #16				@ AES final-3 block - load ciphertext
2251
2252	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
2253	movi	$t0.8b, #0						@ supress further partial tag feed in
2254	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
2255
2256	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
2257
2258	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
2259
2260	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-4 block - store result
2261	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
2262
2263	eor3	$res4b, $res1b, $ctr4b, $t1.16b				@ AES final-3 block - result
2264	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
2265
2266	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
2267
2268	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
2269.L128_dec_blocks_more_than_3:						@ blocks left >  3
2270
2271	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-3 block - store result
2272	rev64	$res0b, $res1b						@ GHASH final-3 block
2273
2274	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2275
2276	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
2277
2278	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
2279	ext     $h4.16b, $h4.16b, $h4.16b, #8
2280	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
2281
2282	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
2283
2284	ldr	$res1q, [$input_ptr], #16				@ AES final-2 block - load ciphertext
2285
2286	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
2287	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
2288	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
2289
2290	movi	$t0.8b, #0						@ supress further partial tag feed in
2291	eor3	$res4b, $res1b, $ctr5b, $t1.16b				@ AES final-2 block - result
2292	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
2293
2294	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
2295
2296	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
2297	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
2298.L128_dec_blocks_more_than_2:						@ blocks left >  2
2299
2300	rev64	$res0b, $res1b						@ GHASH final-2 block
2301
2302	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-2 block - store result
2303
2304	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2305	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
2306	ext     $h3.16b, $h3.16b, $h3.16b, #8
2307	movi	$t0.8b, #0						@ supress further partial tag feed in
2308
2309	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
2310
2311	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
2312
2313	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
2314
2315	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
2316	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
2317	ldr	$res1q, [$input_ptr], #16				@ AES final-1 block - load ciphertext
2318
2319	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
2320
2321	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
2322
2323	eor3	$res4b, $res1b, $ctr6b, $t1.16b				@ AES final-1 block - result
2324	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
2325.L128_dec_blocks_more_than_1:						@ blocks left >  1
2326
2327	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-1 block - store result
2328	rev64	$res0b, $res1b						@ GHASH final-1 block
2329
2330	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
2331	ext     $h2.16b, $h2.16b, $h2.16b, #8
2332
2333	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2334
2335	movi	$t0.8b, #0						@ supress further partial tag feed in
2336
2337	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
2338
2339	ldr	$res1q, [$input_ptr], #16				@ AES final block - load ciphertext
2340	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
2341
2342	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
2343	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
2344	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
2345
2346	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
2347	eor3	$res4b, $res1b, $ctr7b, $t1.16b				@ AES final block - result
2348
2349	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
2350
2351	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
2352
2353	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
2354
2355	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
2356.L128_dec_blocks_less_than_1:						@ blocks left <= 1
2357
2358	and	$bit_length, $bit_length, #127				@ bit_length %= 128
2359
2360	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
2361
2362	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
2363
2364	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
2365	and	$bit_length, $bit_length, #127				@ bit_length %= 128
2366
2367	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
2368	cmp	$bit_length, #64
2369	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
2370
2371	csel	$temp2_x, $temp1_x, $temp0_x, lt
2372	csel	$temp3_x, $temp0_x, xzr, lt
2373
2374	mov	$ctr0.d[1], $temp3_x
2375	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
2376
2377	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
2378	ext     $h1.16b, $h1.16b, $h1.16b, #8
2379	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
2380
2381	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
2382
2383	rev64	$res0b, $res1b						@ GHASH final block
2384
2385	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2386
2387	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
2388	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
2389
2390	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
2391	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
2392
2393	bif	$res4b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
2394
2395	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
2396	st1	{ $res4b}, [$output_ptr]				@ store all 16B
2397
2398	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
2399
2400	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
2401	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
2402
2403	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
2404
2405	eor	$t10.16b, $acc_hb, $acc_lb				@ MODULO - karatsuba tidy up
2406
2407	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
2408	ext	$acc_hb, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
2409
2410	eor	$acc_mb, $acc_mb, $t10.16b				@ MODULO - karatsuba tidy up
2411
2412	eor3	$acc_mb, $acc_mb, $acc_hb, $t11.16b			@ MODULO - fold into mid
2413
2414	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
2415	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
2416
2417	eor3	$acc_lb, $acc_lb, $acc_mb, $acc_hb			@ MODULO - fold into low
2418	ext	$acc_lb, $acc_lb, $acc_lb, #8
2419	rev64	$acc_lb, $acc_lb
2420	st1	{ $acc_l.16b }, [$current_tag]
2421	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
2422
2423	str	$rtmp_ctrq, [$counter]					@ store the updated counter
2424
2425	lsr	x0, $bit_length, #3
2426
2427	ldp	d10, d11, [sp, #16]
2428	ldp	d12, d13, [sp, #32]
2429	ldp	d14, d15, [sp, #48]
2430	ldp	d8, d9, [sp], #80
2431	ret
2432.L128_dec_ret:
2433	mov w0, #0x0
2434	ret
2435.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2436___
2437}
2438
2439{
2440my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));	# register-name table for the heredocs of this scope; these four are x4..x7
2441my ($temp2_x,$temp3_x)=map("x$_",(13..14));	# x13, x14
2442my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));	# .16b names: ctr0..7 are v0..v7, res0..7 are v8..v15
2443my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));	# same registers as bare vN; the arrangement suffix is appended at the point of use
2444my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));	# dN names of the counter registers
2445my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));	# qN names of the counter registers
2446my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));	# qN names of the result registers (used by stp/ldr)
2447# ctr_t0..7 name v8..v15 as well, i.e. the same registers as res0..res7: input is loaded into the register its result is written to
2448my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
2449my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
2450my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
2451# GHASH accumulators: high, middle and low parts are v17, v18 and v19
2452my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
2453my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
2454# h* operands (loaded from offsets off current_tag). Both name lists map onto the same v20..v25, so h1/h5, h12k/h56k, ... share a register
2455my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
2456my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
2457my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
2458my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
2459# Temporaries: only t0 (v16) and t1 (v29) get registers of their own, every other tN below is an alias
2460my $t0="v16";
2461my $t0d="d16";
2462
2463my $t1="v29";
2464my $t2=$res1;	# alias of v9
2465my $t3=$t1;	# alias of v29
2466
2467my $t4=$res0;	# alias of v8
2468my $t5=$res2;	# alias of v10
2469my $t6=$t0;	# alias of v16
2470
2471my $t7=$res3;	# alias of v11
2472my $t8=$res4;	# alias of v12
2473my $t9=$res5;	# alias of v13
2474
2475my $t10=$res6;	# alias of v14
2476my $t11="v21";	# v21 is also the register named h12k / h56k above
2477my $t12=$t1;	# alias of v29
2478# Counter state: rtmp_ctr (v30) is advanced by adding rctr_inc (v31) once per counter block
2479my $rtmp_ctr="v30";
2480my $rtmp_ctrq="q30";
2481my $rctr_inc="v31";
2482my $rctr_incd="d31";
2483# mod_constant is another name for t0 (v16 / d16)
2484my $mod_constantd=$t0d;
2485my $mod_constant=$t0;
2486# Round keys rotate through three registers: rkN is v(26 + N mod 3), so a key has to be (re)loaded before the rounds that use it
2487my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
2488my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
2489my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
2490my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
2491my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
2492my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
2493my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
2494my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
2495my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
2496my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
2497my $rk2q1="v28.1q";	# v28 (same register as rk2/rk5/rk8/...) spelled with a .1q arrangement
2498my $rk3q1="v26.1q";	# v26 (same register as rk0/rk3/rk6/...) spelled with a .1q arrangement
2499my $rk4v="v27";	# bare v27 (same register as rk1/rk4/rk7/...)
2500
2501#########################################################################################
2502# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
2503#                               size_t len,
2504#                               unsigned char *out,
2505#                               const void *key,
2506#                               unsigned char ivec[16],
2507#                               u64 *Xi);
2508#
2509$code.=<<___;
2510.global unroll8_eor3_aes_gcm_enc_192_kernel
2511.type   unroll8_eor3_aes_gcm_enc_192_kernel,%function
2512.align  4
2513unroll8_eor3_aes_gcm_enc_192_kernel:
2514	AARCH64_VALID_CALL_TARGET
2515	cbz	x1, .L192_enc_ret
2516	stp	d8, d9, [sp, #-80]!
2517	mov	$counter, x4
2518	mov	$cc, x5
2519	stp	d10, d11, [sp, #16]
2520	stp	d12, d13, [sp, #32]
2521	stp	d14, d15, [sp, #48]
2522	mov	x5, #0xc200000000000000
2523	stp	x5, xzr, [sp, #64]
2524	add	$modulo_constant, sp, #64
2525
2526	lsr	$main_end_input_ptr, $bit_length, #3		 	@ byte_len
2527	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
2528
2529	mov	$constant_temp, #0x100000000				@ set up counter increment
2530	movi	$rctr_inc.16b, #0x0
2531	mov	$rctr_inc.d[1], $constant_temp
2532
2533	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
2534
2535	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
2536
2537	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
2538	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
2539
2540	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
2541	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
2542
2543	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
2544	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
2545
2546	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
2547	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
2548	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
2549
2550	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2551
2552	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
2553	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
2554	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
2555
2556	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
2557
2558	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
2559	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
2560
2561	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
2562
2563	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
2564	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
2565	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
2566
2567	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
2568	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
2569	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
2570
2571	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
2572	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
2573	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
2574
2575	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
2576	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
2577
2578	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
2579	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
2580	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
2581
2582	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
2583	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
2584	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
2585
2586	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
2587	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
2588	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
2589
2590	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
2591	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
2592
2593	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
2594	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
2595	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
2596
2597	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
2598	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
2599
2600	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
2601	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
2602	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
2603
2604	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
2605
2606	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
2607
2608	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
2609
2610	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
2611	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
2612	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
2613
2614	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
2615	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
2616	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
2617
2618	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
2619	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
2620	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
2621
2622	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
2623	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
2624	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
2625
2626	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
2627	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
2628	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
2629
2630	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
2631	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
2632	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
2633
2634	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
2635
2636	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
2637	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
2638	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
2639
2640	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
2641	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
2642	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
2643
2644	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
2645	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
2646	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
2647
2648	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
2649	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
2650
2651	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
2652	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
2653
2654	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
2655	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
2656
2657	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
2658	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
2659
2660	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
2661	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
2662
2663	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
2664	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
2665	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
2666
2667	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
2668	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
2669	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
2670
2671	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
2672	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
2673	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
2674
2675        ld1     { $acc_lb}, [$current_tag]
2676	ext     $acc_lb, $acc_lb, $acc_lb, #8
2677	rev64   $acc_lb, $acc_lb
2678	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
2679
2680	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
2681	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
2682
2683	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
2684	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
2685
2686	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
2687	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
2688
2689	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 14 - round 10
2690	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
2691	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 11 - round 10
2692
2693	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 9 - round 10
2694	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 13 - round 10
2695	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 12 - round 10
2696
2697	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8 - round 10
2698	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 10 - round 10
2699	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 15 - round 10
2700
2701	aese	$ctr6b, $rk11						@ AES block 14 - round 11
2702	aese	$ctr3b, $rk11						@ AES block 11 - round 11
2703
2704	aese	$ctr4b, $rk11						@ AES block 12 - round 11
2705	aese	$ctr7b, $rk11						@ AES block 15 - round 11
2706	ldr	$rk12q, [$cc, #192]					@ load rk12
2707
2708	aese	$ctr1b, $rk11						@ AES block 9 - round 11
2709	aese	$ctr5b, $rk11						@ AES block 13 - round 11
2710
2711	aese	$ctr2b, $rk11						@ AES block 10 - round 11
2712	aese	$ctr0b, $rk11						@ AES block 8 - round 11
2713	b.ge	.L192_enc_tail						@ handle tail
2714
2715	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
2716
2717	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
2718
2719	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
2720
2721	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
2722
2723	eor3	$res0b, $ctr_t0b, $ctr0b, $rk12				@ AES block 0 - result
2724	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
2725	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
2726
2727	eor3	$res3b, $ctr_t3b, $ctr3b, $rk12				@ AES block 3 - result
2728	eor3	$res1b, $ctr_t1b, $ctr1b, $rk12				@ AES block 1 - result
2729
2730	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
2731	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
2732	eor3	$res4b, $ctr_t4b, $ctr4b, $rk12				@ AES block 4 - result
2733
2734	eor3	$res5b, $ctr_t5b, $ctr5b, $rk12				@ AES block 5 - result
2735	eor3	$res7b, $ctr_t7b, $ctr7b, $rk12				@ AES block 7 - result
2736	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
2737
2738	eor3	$res2b, $ctr_t2b, $ctr2b, $rk12				@ AES block 2 - result
2739	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
2740	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
2741
2742	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
2743	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
2744
2745	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
2746	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
2747	eor3	$res6b, $ctr_t6b, $ctr6b, $rk12				@ AES block 6 - result
2748
2749	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
2750
2751	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
2752	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
2753	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
2754
2755	b.ge	.L192_enc_prepretail					@ do prepretail
2756
2757.L192_enc_main_loop:							@ main loop start
2758	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
2759	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
2760	rev64	$res2b, $res2b						@ GHASH block 8k+2
2761
2762	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
2763	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
2764	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
2765	ext     $h7.16b, $h7.16b, $h7.16b, #8
2766	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
2767	ext     $h8.16b, $h8.16b, $h8.16b, #8
2768
2769	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
2770	rev64	$res0b, $res0b						@ GHASH block 8k
2771	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
2772	ext     $h5.16b, $h5.16b, $h5.16b, #8
2773	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
2774	ext     $h6.16b, $h6.16b, $h6.16b, #8
2775
2776	rev64	$res1b, $res1b						@ GHASH block 8k+1
2777	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
2778	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
2779
2780	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
2781	rev64	$res3b, $res3b						@ GHASH block 8k+3
2782	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
2783
2784	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
2785	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
2786	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
2787
2788	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
2789	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
2790	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
2791
2792	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
2793	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
2794	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
2795
2796	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
2797	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
2798	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
2799
2800	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
2801	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
2802	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
2803
2804	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
2805	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
2806	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
2807	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
2808
2809	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
2810	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
2811	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
2812
2813	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
2814	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
2815	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
2816
2817	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
2818	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
2819	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
2820
2821	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
2822	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
2823	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
2824
2825	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
2826	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
2827	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
2828
2829	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
2830	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
2831	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
2832
2833	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
2834	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
2835	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
2836
2837	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
2838	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
2839	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
2840
2841	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
2842	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
2843	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
2844
2845	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
2846	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
2847	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
2848	ext     $h3.16b, $h3.16b, $h3.16b, #8
2849	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
2850	ext     $h4.16b, $h4.16b, $h4.16b, #8
2851
2852	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k - mid
2853	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
2854	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
2855
2856	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
2857	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
2858	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
2859
2860	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
2861	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
2862	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
2863
2864	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
2865	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
2866	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
2867
2868	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
2869	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
2870	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
2871
2872	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
2873	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
2874	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
2875
2876	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
2877	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
2878	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
2879
2880	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
2881	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
2882	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
2883	ext     $h1.16b, $h1.16b, $h1.16b, #8
2884	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
2885	ext     $h2.16b, $h2.16b, $h2.16b, #8
2886
2887	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
2888	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
2889	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
2890
2891	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
2892	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
2893	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
2894
2895	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
2896	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
2897
2898	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
2899	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
2900	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
2901
2902	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
2903	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
2904	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
2905
2906	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
2907	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
2908	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
2909
2910	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
2911	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
2912	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
2913
2914	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
2915	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
2916	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
2917
2918	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
2919	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
2920	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
2921
2922	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
2923	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
2924
2925	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
2926	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
2927	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
2928
2929	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
2930	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
2931
2932	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
2933	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
2934	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
2935
2936	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
2937	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
2938	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
2939
2940	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
2941	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
2942	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
2943
2944	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
2945	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
2946	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
2947
2948	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
2949	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
2950	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
2951
2952	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
2953	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
2954	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
2955
2956	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
2957	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
2958	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
2959
2960	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
2961	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
2962	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
2963
2964	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
2965	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
2966	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
2967
2968	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
2969	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
2970	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
2971
2972	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
2973	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
2974	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
2975
2976	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
2977	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
2978	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
2979
2980	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
2981	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
2982	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
2983
2984	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
2985	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
2986	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
2987
2988	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
2989	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
2990	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
2991
2992	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
2993	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
2994	ldr	$rk12q, [$cc, #192]					@ load rk12
2995	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
2996
2997	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
2998	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
2999	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
3000
3001	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
3002	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
3003	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load plaintext
3004
3005	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load plaintext
3006	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
3007	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
3008
3009	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
3010	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
3011
3012	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
3013	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
3014
3015	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
3016	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
3017	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
3018
3019	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
3020	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
3021	eor3	$res4b, $ctr_t4b, $ctr4b, $rk12				@ AES block 8k+12 - result
3022
3023	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
3024	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
3025	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
3026
3027	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
3028	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
3029	eor3	$res7b, $ctr_t7b, $ctr7b, $rk12				@ AES block 8k+15 - result
3030
3031	eor3	$res2b, $ctr_t2b, $ctr2b, $rk12				@ AES block 8k+10 - result
3032	eor3	$res0b, $ctr_t0b, $ctr0b, $rk12				@ AES block 8k+8 - result
3033	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
3034
3035	eor3	$res1b, $ctr_t1b, $ctr1b, $rk12				@ AES block 8k+9 - result
3036	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
3037	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
3038	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
3039
3040	eor3	$res6b, $ctr_t6b, $ctr6b, $rk12				@ AES block 8k+14 - result
3041	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
3042	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
3043
3044	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
3045	eor3	$res5b, $ctr_t5b, $ctr5b, $rk12				@ AES block 8k+13 - result
3046	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
3047
3048	eor3	$res3b, $ctr_t3b, $ctr3b, $rk12				@ AES block 8k+11 - result
3049	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
3050
3051	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
3052
3053	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
3054
3055	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
3056	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
3057	b.lt	.L192_enc_main_loop
3058
3059.L192_enc_prepretail:							@ PREPRETAIL
3060	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
3061	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
3062	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
3063
3064	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
3065	ext     $h7.16b, $h7.16b, $h7.16b, #8
3066	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
3067	ext     $h8.16b, $h8.16b, $h8.16b, #8
3068	rev64	$res0b, $res0b						@ GHASH block 8k
3069	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
3070
3071	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
3072	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
3073	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
3074	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
3075
3076	rev64	$res3b, $res3b						@ GHASH block 8k+3
3077	rev64	$res2b, $res2b						@ GHASH block 8k+2
3078	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
3079	ext     $h5.16b, $h5.16b, $h5.16b, #8
3080	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
3081	ext     $h6.16b, $h6.16b, $h6.16b, #8
3082
3083	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
3084	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
3085	rev64	$res1b, $res1b						@ GHASH block 8k+1
3086
3087	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
3088	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
3089	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
3090
3091	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
3092	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
3093	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
3094
3095	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
3096	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
3097	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
3098
3099	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
3100	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
3101	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3102
3103	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3104	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
3105	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
3106
3107	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
3108	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
3109	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
3110
3111	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
3112	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
3113	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
3114
3115	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
3116	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
3117	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
3118
3119	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
3120	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
3121	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
3122
3123	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
3124	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
3125	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
3126
3127	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
3128	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
3129	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
3130
3131	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
3132	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
3133	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
3134
3135	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
3136	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
3137	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
3138
3139	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
3140	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
3141	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
3142
3143	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
3144	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
3145	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
3146
3147	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
3148	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
3149	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
3150
3151	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
3152	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
3153	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
3154
3155	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
3156	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
3157	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
3158
3159	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
3160	ext     $h3.16b, $h3.16b, $h3.16b, #8
3161	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
3162	ext     $h4.16b, $h4.16b, $h4.16b, #8
3163	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
3164	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
3165
3166	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
3167	ext     $h1.16b, $h1.16b, $h1.16b, #8
3168	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
3169	ext     $h2.16b, $h2.16b, $h2.16b, #8
3170	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
3171	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
3172
3173	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
3174	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
3175	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
3176
3177	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
3178	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
3179	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
3180
3181	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
3182	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
3183	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
3184
3185	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
3186	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
3187	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
3188
3189	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
3190	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
3191	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
3192	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
3193
3194	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
3195	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
3196	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
3197
3198	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
3199	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
3200	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
3201
3202	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
3203	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
3204
3205	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
3206	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
3207	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
3208
3209	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
3210	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
3211	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
3212
3213	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
3214	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
3215	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
3216
3217	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
3218	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
3219	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
3220
3221	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
3222	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
3223
3224	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
3225	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
3226	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
3227
3228	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
3229	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
3230	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
3231
3232	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
3233	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
3234	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
3235
3236	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
3237	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
3238	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
3239
3240	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
3241	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
3242	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
3243
3244	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
3245	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
3246	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
3247
3248	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
3249	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
3250	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
3251
3252	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
3253	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
3254
3255	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
3256	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
3257
3258	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
3259	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
3260	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
3261	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
3262
3263	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
3264	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
3265
3266	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
3267	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
3268	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
3269
3270	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
3271	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
3272	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
3273
3274	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
3275	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
3276	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
3277
3278	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
3279	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
3280	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
3281
3282	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
3283	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
3284
3285	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
3286	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
3287	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
3288	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
3289
3290	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
3291	ldr	$rk12q, [$cc, #192]					@ load rk12
3292
3293	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
3294	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
3295	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
3296
3297	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
3298	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
3299	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
3300
3301	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
3302	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
3303
3304	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
3305	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
3306
3307	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
3308	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
3309
3310	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
3311	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
3312	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
3313
3314	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
3315	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
3316	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
3317
3318.L192_enc_tail:								@ TAIL
3319
3320	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h, h6k | h5k
3321        ext     $h5.16b, $h5.16b, $h5.16b, #8
3322	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process
3323
3324	ldr	$ctr_t0q, [$input_ptr], #16				@ AES block 8k+8 - load plaintext
3325
3326	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k, h8l | h8h
3327        ext     $h8.16b, $h8.16b, $h8.16b, #8
3328
3329	mov	$t1.16b, $rk12
3330
3331	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h and h7l | h7h
3332        ext     $h6.16b, $h6.16b, $h6.16b, #8
3333	ext     $h7.16b, $h7.16b, $h7.16b, #8
3334	cmp	$main_end_input_ptr, #112
3335
3336	eor3	$res1b, $ctr_t0b, $ctr0b, $t1.16b			@ AES block 8k+8 - result
3337	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
3338	b.gt	.L192_enc_blocks_more_than_7
3339
3340	cmp	$main_end_input_ptr, #96
3341	mov	$ctr7b, $ctr6b
3342	movi	$acc_h.8b, #0
3343
3344	mov	$ctr6b, $ctr5b
3345	movi	$acc_l.8b, #0
3346	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3347
3348	mov	$ctr5b, $ctr4b
3349	mov	$ctr4b, $ctr3b
3350	mov	$ctr3b, $ctr2b
3351
3352	mov	$ctr2b, $ctr1b
3353	movi	$acc_m.8b, #0
3354	b.gt	.L192_enc_blocks_more_than_6
3355
3356	mov	$ctr7b, $ctr6b
3357	cmp	$main_end_input_ptr, #80
3358
3359	mov	$ctr6b, $ctr5b
3360	mov	$ctr5b, $ctr4b
3361	mov	$ctr4b, $ctr3b
3362
3363	mov	$ctr3b, $ctr1b
3364	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3365	b.gt	.L192_enc_blocks_more_than_5
3366
3367	cmp	$main_end_input_ptr, #64
3368	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3369
3370	mov	$ctr7b, $ctr6b
3371	mov	$ctr6b, $ctr5b
3372	mov	$ctr5b, $ctr4b
3373
3374	mov	$ctr4b, $ctr1b
3375	b.gt	.L192_enc_blocks_more_than_4
3376
3377	mov	$ctr7b, $ctr6b
3378	mov	$ctr6b, $ctr5b
3379	mov	$ctr5b, $ctr1b
3380
3381	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3382	cmp	$main_end_input_ptr, #48
3383	b.gt	.L192_enc_blocks_more_than_3
3384
3385	mov	$ctr7b, $ctr6b
3386	mov	$ctr6b, $ctr1b
3387	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3388
3389	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
3390	cmp	$main_end_input_ptr, #32
3391	b.gt	.L192_enc_blocks_more_than_2
3392
3393	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3394
3395	cmp	$main_end_input_ptr, #16
3396	mov	$ctr7b, $ctr1b
3397	b.gt	.L192_enc_blocks_more_than_1
3398
3399	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3400	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
3401	b	 .L192_enc_blocks_less_than_1
3402.L192_enc_blocks_more_than_7:						@ blocks left >  7
3403	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-7 block  - store result
3404
3405	rev64	$res0b, $res1b						@ GHASH final-7 block
3406	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
3407
3408	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3409
3410	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
3411
3412	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-6 block - load plaintext
3413
3414	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
3415	movi	$t0.8b, #0						@ suppress further partial tag feed in
3416	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
3417
3418	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
3419
3420	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
3421	eor3	$res1b, $ctr_t1b, $ctr1b, $t1.16b			@ AES final-6 block - result
3422.L192_enc_blocks_more_than_6:						@ blocks left >  6
3423
3424	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-6 block - store result
3425
3426	rev64	$res0b, $res1b						@ GHASH final-6 block
3427
3428	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-5 block - load plaintext
3429
3430	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3431
3432	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
3433
3434	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
3435	eor3	$res1b, $ctr_t1b, $ctr2b, $t1.16b			@ AES final-5 block - result
3436
3437	movi	$t0.8b, #0						@ suppress further partial tag feed in
3438	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
3439	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
3440
3441	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
3442
3443	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
3444	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
3445
3446	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
3447.L192_enc_blocks_more_than_5:						@ blocks left >  5
3448
3449	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-5 block - store result
3450
3451	rev64	$res0b, $res1b						@ GHASH final-5 block
3452
3453	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3454
3455	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
3456
3457	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-4 block - load plaintext
3458	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
3459
3460	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
3461	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
3462
3463	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
3464	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
3465
3466	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
3467	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
3468
3469	eor3	$res1b, $ctr_t1b, $ctr3b, $t1.16b			@ AES final-4 block - result
3470	movi	$t0.8b, #0						@ suppress further partial tag feed in
3471
3472	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
3473.L192_enc_blocks_more_than_4:						@ blocks left >  4
3474
3475	st1	{ $res1b}, [$output_ptr], #16				@ AES final-4 block - store result
3476
3477	rev64	$res0b, $res1b						@ GHASH final-4 block
3478
3479	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3480
3481	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-3 block - load plaintext
3482	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
3483	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
3484
3485	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
3486	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
3487
3488	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
3489
3490	movi	$t0.8b, #0						@ suppress further partial tag feed in
3491	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
3492
3493	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
3494
3495	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
3496	eor3	$res1b, $ctr_t1b, $ctr4b, $t1.16b			@ AES final-3 block - result
3497.L192_enc_blocks_more_than_3:						@ blocks left >  3
3498
3499	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
3500	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-3 block - store result
3501
3502	rev64	$res0b, $res1b						@ GHASH final-3 block
3503
3504	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3505	movi	$t0.8b, #0						@ suppress further partial tag feed in
3506
3507	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-2 block - load plaintext
3508	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
3509	ext     $h4.16b, $h4.16b, $h4.16b, #8
3510
3511	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
3512
3513	eor3	$res1b, $ctr_t1b, $ctr5b, $t1.16b			@ AES final-2 block - result
3514	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
3515
3516	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
3517	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
3518
3519	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
3520	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
3521
3522	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
3523
3524	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
3525	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
3526.L192_enc_blocks_more_than_2:						@ blocks left >  2
3527
3528	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-2 block - store result
3529
3530	rev64	$res0b, $res1b						@ GHASH final-2 block
3531	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
3532	ext     $h3.16b, $h3.16b, $h3.16b, #8
3533
3534	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3535
3536	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-1 block - load plaintext
3537	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
3538
3539	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
3540
3541	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
3542	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
3543	movi	$t0.8b, #0						@ suppress further partial tag feed in
3544
3545	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
3546
3547	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
3548	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
3549
3550	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
3551	eor3	$res1b, $ctr_t1b, $ctr6b, $t1.16b			@ AES final-1 block - result
3552.L192_enc_blocks_more_than_1:						@ blocks left >  1
3553
3554	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
3555	ext     $h2.16b, $h2.16b, $h2.16b, #8
3556	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-1 block - store result
3557
3558	rev64	$res0b, $res1b						@ GHASH final-1 block
3559
3560	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3561
3562	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
3563	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
3564
3565	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
3566	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
3567	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
3568
3569	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final block - load plaintext
3570	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
3571
3572	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
3573
3574	eor3	$res1b, $ctr_t1b, $ctr7b, $t1.16b			@ AES final block - result
3575	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
3576
3577	movi	$t0.8b, #0						@ suppress further partial tag feed in
3578
3579	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
3580	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
3581.L192_enc_blocks_less_than_1:						@ blocks left <= 1
3582
3583	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
3584	and	$bit_length, $bit_length, #127				@ bit_length %= 128
3585
3586	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
3587
3588	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
3589
3590	and	$bit_length, $bit_length, #127				@ bit_length %= 128
3591
3592	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
3593	cmp	$bit_length, #64
3594	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
3595
3596	csel	$temp2_x, $temp1_x, $temp0_x, lt
3597	csel	$temp3_x, $temp0_x, xzr, lt
3598
3599	mov	$ctr0.d[1], $temp3_x
3600	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
3601	ext     $h1.16b, $h1.16b, $h1.16b, #8
3602
3603	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
3604	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
3605
3606	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
3607
3608	rev64	$res0b, $res1b						@ GHASH final block
3609	bif	$res1b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
3610
3611	st1	{ $res1b}, [$output_ptr]				@ store all 16B
3612
3613	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3614
3615	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
3616	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
3617
3618	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
3619	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
3620
3621	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
3622
3623	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
3624
3625	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
3626	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
3627
3628	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
3629	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
3630
3631	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
3632
3633	str	$rtmp_ctrq, [$counter]					@ store the updated counter
3634	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
3635
3636	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
3637
3638	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
3639
3640	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
3641	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
3642
3643	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
3644		ext	$acc_lb, $acc_lb, $acc_lb, #8
3645	rev64	$acc_lb, $acc_lb
3646	st1	{ $acc_l.16b }, [$current_tag]
3647
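3648	lsr	x0, $bit_length, #3					@ return value - NOTE(review): bit_length was rewritten by the last-block mask code above, so this is not the input byte count; confirm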
3649
3650	ldp	d10, d11, [sp, #16]
3651	ldp	d12, d13, [sp, #32]
3652	ldp	d14, d15, [sp, #48]
3653	ldp	d8, d9, [sp], #80
3654	ret
3655
3656.L192_enc_ret:
3657	mov w0, #0x0
3658	ret
3659.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
3660___
3661
3662#########################################################################################
3663# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
3664#                               size_t len,
3665#                               unsigned char *out,
3666#                               u64 *Xi,
3667#                               unsigned char ivec[16],
3668#                               const void *key);
3669# (len is a bit count: the prologue shifts it right by 3 to obtain byte_len.)
3670$code.=<<___;
3671.global unroll8_eor3_aes_gcm_dec_192_kernel
3672.type   unroll8_eor3_aes_gcm_dec_192_kernel,%function
3673.align  4
3674unroll8_eor3_aes_gcm_dec_192_kernel:
3675	AARCH64_VALID_CALL_TARGET
3676	cbz	x1, .L192_dec_ret
3677	stp	d8, d9, [sp, #-80]!
3678	mov	$counter, x4
3679	mov	$cc, x5
3680	stp	d10, d11, [sp, #16]
3681	stp	d12, d13, [sp, #32]
3682	stp	d14, d15, [sp, #48]
3683        mov     x5, #0xc200000000000000
3684	stp     x5, xzr, [sp, #64]
3685	add     $modulo_constant, sp, #64
3686
3687	lsr	$main_end_input_ptr, $bit_length, #3		 	@ byte_len
3688	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
3689	ld1	{ $acc_lb}, [$current_tag]
3690
3691		mov	$constant_temp, #0x100000000			@ set up counter increment
3692	movi	$rctr_inc.16b, #0x0
3693	mov	$rctr_inc.d[1], $constant_temp
3694
3695	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
3696
3697	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
3698
3699	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
3700	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
3701
3702	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
3703	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
3704
3705	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
3706	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
3707
3708	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
3709	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
3710
3711	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
3712	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
3713	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
3714
3715	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
3716	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
3717
3718	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
3719
3720	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
3721	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
3722	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
3723
3724	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
3725	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
3726	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
3727
3728	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
3729	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
3730	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
3731
3732	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
3733
3734	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
3735
3736	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
3737	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
3738	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
3739
3740	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
3741	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
3742
3743	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
3744	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
3745	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
3746
3747	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
3748	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
3749	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
3750
3751	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
3752	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
3753	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
3754
3755	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
3756
3757	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
3758	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
3759	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
3760
3761	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
3762	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
3763
3764	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
3765	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
3766	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
3767
3768	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
3769	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
3770	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
3771
3772	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
3773	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
3774	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
3775
3776	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
3777	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
3778	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
3779
3780	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
3781	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
3782
3783	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
3784	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
3785	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
3786
3787	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
3788	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
3789	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
3790
3791	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
3792
3793	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
3794	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
3795	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
3796
3797	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
3798	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
3799	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
3800
3801	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
3802	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
3803	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
3804
3805	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
3806
3807	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
3808	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
3809
3810	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
3811	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
3812	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
3813
3814	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
3815	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
3816	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
3817
3818	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
3819	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
3820	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3821
3822	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
3823	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
3824	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
3825
3826	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
3827	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
3828	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
3829
3830	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
3831	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
3832
3833	ld1	{ $acc_lb}, [$current_tag]
3834	ext	$acc_lb, $acc_lb, $acc_lb, #8
3835	rev64	$acc_lb, $acc_lb
3836
3837	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
3838
3839	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
3840	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
3841
3842	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
3843	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
3844	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
3845
3846	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
3847	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
3848
3849	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
3850	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
3851
3852	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 10
3853	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 10
3854	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 10
3855
3856	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 10
3857	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 10
3858	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 10
3859
3860	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 10
3861	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 10
3862	ldr	$rk12q, [$cc, #192]					@ load rk12
3863
3864	aese	$ctr0b, $rk11						@ AES block 0 - round 11
3865	aese	$ctr1b, $rk11						@ AES block 1 - round 11
3866	aese	$ctr4b, $rk11						@ AES block 4 - round 11
3867
3868	aese	$ctr6b, $rk11						@ AES block 6 - round 11
3869	aese	$ctr5b, $rk11						@ AES block 5 - round 11
3870	aese	$ctr7b, $rk11						@ AES block 7 - round 11
3871
3872	aese	$ctr2b, $rk11						@ AES block 2 - round 11
3873	aese	$ctr3b, $rk11						@ AES block 3 - round 11
3874	b.ge	.L192_dec_tail						@ handle tail
3875
3876	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 0, 1 - load ciphertext
3877
3878	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 2, 3 - load ciphertext
3879
3880	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 4, 5 - load ciphertext
3881
3882	eor3	$ctr1b, $res1b, $ctr1b, $rk12				@ AES block 1 - result
3883	eor3	$ctr0b, $res0b, $ctr0b, $rk12				@ AES block 0 - result
3884	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 0, 1 - store result
3885
3886	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
3887	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
3888
3889	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
3890	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
3891	eor3	$ctr3b, $res3b, $ctr3b, $rk12				@ AES block 3 - result
3892
3893	eor3	$ctr2b, $res2b, $ctr2b, $rk12				@ AES block 2 - result
3894	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 2, 3 - store result
3895	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 6, 7 - load ciphertext
3896
3897	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
3898	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
3899
3900	eor3	$ctr4b, $res4b, $ctr4b, $rk12				@ AES block 4 - result
3901
3902	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
3903	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
3904
3905	eor3	$ctr5b, $res5b, $ctr5b, $rk12				@ AES block 5 - result
3906	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 4, 5 - store result
3907	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
3908
3909	eor3	$ctr6b, $res6b, $ctr6b, $rk12				@ AES block 6 - result
3910	eor3	$ctr7b, $res7b, $ctr7b, $rk12				@ AES block 7 - result
3911	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
3912
3913	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
3914	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 6, 7 - store result
3915	b.ge	.L192_dec_prepretail					@ do prepretail
3916
3917.L192_dec_main_loop:							@ main loop start
3918	rev64	$res1b, $res1b						@ GHASH block 8k+1
3919	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
3920	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
3921
3922	rev64	$res0b, $res0b						@ GHASH block 8k
3923	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
3924	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
3925
3926	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
3927	ext     $h7.16b, $h7.16b, $h7.16b, #8
3928	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
3929	ext     $h8.16b, $h8.16b, $h8.16b, #8
3930	rev64	$res4b, $res4b						@ GHASH block 8k+4
3931	rev64	$res3b, $res3b						@ GHASH block 8k+3
3932
3933	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
3934	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
3935	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
3936
3937	rev64	$res5b, $res5b						@ GHASH block 8k+5
3938
3939	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
3940	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
3941	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
3942
3943	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
3944	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
3945	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
3946
3947	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
3948	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
3949	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
3950
3951	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
3952	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
3953	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
3954
3955	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
3956	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
3957	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
3958	ext     $h5.16b, $h5.16b, $h5.16b, #8
3959	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
3960	ext     $h6.16b, $h6.16b, $h6.16b, #8
3961
3962	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
3963	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
3964	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
3965
3966	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
3967	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
3968	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
3969
3970	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3971	rev64	$res2b, $res2b						@ GHASH block 8k+2
3972	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
3973
3974	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
3975	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
3976	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
3977	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3978
3979	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
3980	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
3981	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
3982
3983	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
3984	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
3985	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
3986
3987	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
3988	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
3989	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
3990
3991	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
3992	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
3993	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
3994
3995	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
3996	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
3997	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
3998
3999	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
4000	ext     $h3.16b, $h3.16b, $h3.16b, #8
4001	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
4002	ext     $h4.16b, $h4.16b, $h4.16b, #8
4003	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
4004	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
4005
4006	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
4007	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
4008	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
4009
4010	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
4011	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
4012
4013	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
4014	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
4015	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
4016
4017	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
4018	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
4019	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
4020
4021	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
4022	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
4023
4024	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
4025	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
4026	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
4027
4028	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
4029	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
4030	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
4031
4032	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
4033	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
4034	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
4035
4036	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
4037	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
4038	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
4039
4040	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
4041	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
4042	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
4043
4044	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
4045	ext     $h1.16b, $h1.16b, $h1.16b, #8
4046	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
4047	ext     $h2.16b, $h2.16b, $h2.16b, #8
4048	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
4049	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
4050
4051	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
4052	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
4053	rev64	$res7b, $res7b						@ GHASH block 8k+7
4054
4055	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
4056	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
4057	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
4058
4059	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
4060	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
4061	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
4062
4063	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
4064	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
4065	rev64	$res6b, $res6b						@ GHASH block 8k+6
4066
4067	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
4068	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
4069	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
4070	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
4071
4072	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
4073	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
4074	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
4075
4076	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
4077	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
4078	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
4079
4080	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
4081	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
4082	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
4083
4084	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
4085	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
4086	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
4087
4088	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
4089	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
4090	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
4091
4092	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
4093	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
4094	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
4095
4096	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
4097	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
4098	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
4099
4100	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
4101	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
4102	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
4103
4104	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
4105	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
4106	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
4107
4108	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
4109	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
4110	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
4111
4112	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
4113	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
4114	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
4115
4116	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
4117	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
4118	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
4119
4120	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
4121	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
4122	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
4123
4124	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
4125	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
4126	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
4127
4128	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
4129	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
4130	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
4131
4132	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
4133	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
4134	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
4135
4136	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
4137	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
4138	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
4139
4140	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
4141	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load ciphertext
4142
4143	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
4144	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
4145	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load ciphertext
4146
4147	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
4148	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
4149	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
4150
4151	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
4152	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
4153	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
4154
4155	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
4156	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
4157	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load ciphertext
4158
4159	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
4160	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
4161	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
4162
4163	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
4164	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
4165	ldr	$rk12q, [$cc, #192]					@ load rk12
4166
4167	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load ciphertext
4168	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
4169	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
4170
4171	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
4172	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
4173	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
4174
4175	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
4176	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
4177	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
4178
4179	eor3	$ctr0b, $res0b, $ctr0b, $rk12				@ AES block 8k+8 - result
4180	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
4181	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
4182
4183	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
4184	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
4185	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
4186
4187	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
4188	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
4189	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
4190
4191	eor3	$ctr1b, $res1b, $ctr1b, $rk12				@ AES block 8k+9 - result
4192	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
4193	eor3	$ctr3b, $res3b, $ctr3b, $rk12				@ AES block 8k+11 - result
4194
4195	eor3	$ctr2b, $res2b, $ctr2b, $rk12				@ AES block 8k+10 - result
4196	eor3	$ctr7b, $res7b, $ctr7b, $rk12				@ AES block 8k+15 - result
4197	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
4198
4199	eor3	$ctr5b, $res5b, $ctr5b, $rk12				@ AES block 8k+13 - result
4200	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
4201	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
4202
4203	eor3	$ctr4b, $res4b, $ctr4b, $rk12				@ AES block 8k+12 - result
4204	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
4205	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
4206
4207	eor3	$ctr6b, $res6b, $ctr6b, $rk12				@ AES block 8k+14 - result
4208	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
4209	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
4210
4211	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
4212	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
4213
4214	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
4215	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
4216	b.lt	.L192_dec_main_loop
4217
.L192_dec_prepretail:							@ PREPRETAIL
	@ Runs AES rounds 0-11 on the eight counter blocks ctr0-ctr7 (AES blocks
	@ 8k+8 .. 8k+15) while folding the blocks held in res0-res7 (GHASH blocks
	@ 8k .. 8k+7) into acc_h / acc_m / acc_l, and then reduces the three
	@ accumulators into acc_l with the modulo constant.
	@ Nothing is read from input_ptr or written to output_ptr in this section.
	@ rk12 is loaded here but is only applied by the tail that follows.
	@ Round keys are loaded from cc in pairs just ahead of their first use.
	@ NOTE(review): the rk, h and t names appear to share a small set of vector
	@ registers, which would make the load placement order-sensitive - confirm
	@ against the register map before moving any instruction.
	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13

	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
	ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
	ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8
	rev64	$res0b, $res0b						@ GHASH block 8k
	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0 - swap the 64-bit halves of the running hash

	rev64	$res3b, $res3b						@ GHASH block 8k+3
	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14

	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1 - fold the running hash into block 8k
	rev64	$res2b, $res2b						@ GHASH block 8k+2
	rev64	$res1b, $res1b						@ GHASH block 8k+1

	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
	ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
	ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15 (matching add is at the end of this section)

	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0

	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high

	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0

	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3

	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low

	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1

	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1

	@ trn1 + trn2 + eor leave hi xor lo of two blocks side by side in one
	@ register, so a pmull2 / pmull pair yields both Karatsuba middle products.
	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high

	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1

	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid

	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
	rev64	$res5b, $res5b						@ GHASH block 8k+5
	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low

	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2

	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2

	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid

	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid

	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low

	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3

	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3

	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
	ext     $h3.16b, $h3.16b, $h3.16b, #8				@ swap the 64-bit halves of h3
	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
	ext     $h4.16b, $h4.16b, $h4.16b, #8				@ swap the 64-bit halves of h4
	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid

	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
	ext     $h1.16b, $h1.16b, $h1.16b, #8				@ swap the 64-bit halves of h1
	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
	ext     $h2.16b, $h2.16b, $h2.16b, #8				@ swap the 64-bit halves of h2
	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3

	rev64	$res7b, $res7b						@ GHASH block 8k+7

	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
	rev64	$res4b, $res4b						@ GHASH block 8k+4

	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3

	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4

	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4

	rev64	$res6b, $res6b						@ GHASH block 8k+6
	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid

	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5

	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5

	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low

	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5

	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high

	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5

	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid

	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6

	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6

	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6

	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high

	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low

	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7

	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7

	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low

	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7

	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid

	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7

	@ All eight blocks are accumulated; what follows interleaves the two-step
	@ fold (high into mid, then mid into low) with the remaining AES rounds.
	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8

	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8

	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8

	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11

	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9

	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9

	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9

	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
	ldr	$rk12q, [$cc, #192]					@ load rk12 (consumed by the tail)
	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment

	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10

	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10

	@ Round 11 is aese only (no aesmc); the rk12 xor is applied in the tail.
	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11

	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10

	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15

	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
4476
.L192_dec_tail:								@ TAIL
	@ Dispatches on the number of bytes still to process. The first remaining
	@ ciphertext block is loaded and decrypted into res4 with ctr0, then the
	@ byte count is compared against 112, 96, 80, 64, 48, 32 and 16 to pick the
	@ entry point in the chain of blocks_more_than_N sections below (each of
	@ which falls through into the next).
	@ Every step that is not taken moves the pending keystream blocks up by one
	@ register, so that whichever section is entered first finds the block that
	@ followed ctr0 in the register it reads, and subtracts rctr_inc from
	@ rtmp_ctr once.

	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process

	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h and h6k | h5k
        ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
	ldr	$res1q, [$input_ptr], #16				@ AES block 8k+8 - load ciphertext

	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k and h8l | h8h
        ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8

	mov	$t1.16b, $rk12						@ keep a copy of rk12 in t1 for every result xor below

	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h and h7l | h7h
        ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
        ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag

	eor3	$res4b, $res1b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
	cmp	$main_end_input_ptr, #112
	b.gt	.L192_dec_blocks_more_than_7

	@ At most 112 bytes left: ctr2..ctr7 take the old ctr1..ctr6, and the three
	@ accumulators are zeroed because the section that initialises them
	@ (blocks_more_than_7) is skipped.
	mov	$ctr7b, $ctr6b
	movi	$acc_h.8b, #0
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s

	mov	$ctr6b, $ctr5b
	mov	$ctr5b, $ctr4b
	mov	$ctr4b, $ctr3b

	cmp	$main_end_input_ptr, #96
	movi	$acc_l.8b, #0
	mov	$ctr3b, $ctr2b

	mov	$ctr2b, $ctr1b
	movi	$acc_m.8b, #0
	b.gt	.L192_dec_blocks_more_than_6

	@ At most 96 bytes left: shift once more. ctr1 has not been overwritten, so
	@ it refills the lowest register of the shifted group (here ctr3).
	mov	$ctr7b, $ctr6b
	mov	$ctr6b, $ctr5b
	mov	$ctr5b, $ctr4b

	mov	$ctr4b, $ctr3b
	mov	$ctr3b, $ctr1b

	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	cmp	$main_end_input_ptr, #80
	b.gt	.L192_dec_blocks_more_than_5

	@ At most 80 bytes left: same shift, ctr4 refilled from ctr1.
	mov	$ctr7b, $ctr6b
	mov	$ctr6b, $ctr5b

	mov	$ctr5b, $ctr4b
	mov	$ctr4b, $ctr1b
	cmp	$main_end_input_ptr, #64

	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	b.gt	.L192_dec_blocks_more_than_4

	@ At most 64 bytes left: same shift, ctr5 refilled from ctr1.
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	mov	$ctr7b, $ctr6b
	mov	$ctr6b, $ctr5b

	mov	$ctr5b, $ctr1b
	cmp	$main_end_input_ptr, #48
	b.gt	.L192_dec_blocks_more_than_3

	@ At most 48 bytes left: same shift, ctr6 refilled from ctr1. h34k is
	@ loaded here because the section that normally loads it
	@ (blocks_more_than_3) is skipped.
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	mov	$ctr7b, $ctr6b
	cmp	$main_end_input_ptr, #32

	mov	$ctr6b, $ctr1b
	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
	b.gt	.L192_dec_blocks_more_than_2

	@ At most 32 bytes left: ctr7 takes ctr1.
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s

	mov	$ctr7b, $ctr1b
	cmp	$main_end_input_ptr, #16
	b.gt	.L192_dec_blocks_more_than_1

	@ At most 16 bytes left: only the block already decrypted into res4
	@ remains. h12k is loaded here because blocks_more_than_1 is skipped.
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
	b	 .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_7:						@ blocks left >  7
	@ Entered only when more than 112 bytes remain. Hashes the ciphertext block
	@ in res1 with h8 and the upper half of h78k, stores the plaintext already
	@ held in res4, then loads the next ciphertext block and decrypts it into
	@ res4 with ctr1. The pmull results are written straight into acc_h, acc_l
	@ and acc_m (no xor), so this section initialises the accumulators.
	rev64	$res0b, $res1b						@ GHASH final-7 block

	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag

	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
	ldr	$res1q, [$input_ptr], #16				@ AES final-6 block - load ciphertext

	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low

	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-7 block  - store result

	eor3	$res4b, $res1b, $ctr1b, $t1.16b				@ AES final-6 block - result

	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
	movi	$t0.8b, #0						@ suppress further partial tag feed in
.L192_dec_blocks_more_than_6:						@ blocks left >  6
	@ Hashes the ciphertext block in res1 with h7 and the lower half of h78k,
	@ xoring the three partial products into acc_h / acc_m / acc_l. Stores the
	@ plaintext held in res4, loads the next ciphertext block and decrypts it
	@ into res4 with ctr2. rk2q1, rk3q1 and rk4v serve as scratch here.

	rev64	$res0b, $res1b						@ GHASH final-6 block

	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag

	ldr	$res1q, [$input_ptr], #16				@ AES final-5 block - load ciphertext
	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid

	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
	movi	$t0.8b, #0						@ suppress further partial tag feed in
	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high

	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-6 block - store result
	eor3	$res4b, $res1b, $ctr2b, $t1.16b				@ AES final-5 block - result

	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low

	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
.L192_dec_blocks_more_than_5:						@ blocks left >  5
	@ Hashes the ciphertext block in res1 with h6 and the upper half of h56k,
	@ xoring the partial products into acc_h / acc_m / acc_l. Stores the
	@ plaintext held in res4, loads the next ciphertext block and decrypts it
	@ into res4 with ctr3.

	rev64	$res0b, $res1b						@ GHASH final-5 block

	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag

	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid

	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid

	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid (copy to upper lane for pmull2)
	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high

	ldr	$res1q, [$input_ptr], #16				@ AES final-4 block - load ciphertext

	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low

	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid

	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
	movi	$t0.8b, #0						@ suppress further partial tag feed in
	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-5 block - store result

	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
	eor3	$res4b, $res1b, $ctr3b, $t1.16b				@ AES final-4 block - result
.L192_dec_blocks_more_than_4:						@ blocks left >  4
	@ Hashes the ciphertext block in res1 with h5 and the lower half of h56k,
	@ xoring the partial products into acc_h / acc_m / acc_l. Stores the
	@ plaintext held in res4, loads the next ciphertext block and decrypts it
	@ into res4 with ctr4.

	rev64	$res0b, $res1b						@ GHASH final-4 block

	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
	movi	$t0.8b, #0						@ suppress further partial tag feed in

	ldr	$res1q, [$input_ptr], #16				@ AES final-3 block - load ciphertext
	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low

	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid

	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low

	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-4 block - store result
	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high

	eor3	$res4b, $res1b, $ctr4b, $t1.16b				@ AES final-3 block - result

	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
.L192_dec_blocks_more_than_3:						@ blocks left >  3
	@ Loads h4 and h34k, hashes the ciphertext block in res1 with h4 and the
	@ upper half of h34k, xoring the partial products into acc_h / acc_m /
	@ acc_l. Stores the plaintext held in res4, loads the next ciphertext block
	@ and decrypts it into res4 with ctr5.

	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
	ext     $h4.16b, $h4.16b, $h4.16b, #8				@ swap the 64-bit halves of h4
	rev64	$res0b, $res1b						@ GHASH final-3 block
	ldr	$res1q, [$input_ptr], #16				@ AES final-2 block - load ciphertext

	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag

	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high

	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
	movi	$t0.8b, #0						@ suppress further partial tag feed in
	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low

	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-3 block - store result
	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
	eor3	$res4b, $res1b, $ctr5b, $t1.16b				@ AES final-2 block - result

	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k

	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid (copy to upper lane for pmull2)

	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid

	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
4679.L192_dec_blocks_more_than_2:						@ blocks left >  2
4680
4681	rev64	$res0b, $res1b						@ GHASH final-2 block
4682	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
4683	ext     $h3.16b, $h3.16b, $h3.16b, #8
4684
4685	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4686
4687	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
4688	ldr	$res1q, [$input_ptr], #16				@ AES final-1 block - load ciphertext
4689
4690	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
4691
4692	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
4693
4694	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
4695	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
4696
4697	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
	movi	$t0.8b, #0						@ suppress further partial tag feed in
4699
4700	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
4701	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-2 block - store result
4702
4703	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
4704	eor3	$res4b, $res1b, $ctr6b, $t1.16b				@ AES final-1 block - result
4705.L192_dec_blocks_more_than_1:						@ blocks left >  1
4706
4707	rev64	$res0b, $res1b						@ GHASH final-1 block
4708	ldr	$res1q, [$input_ptr], #16				@ AES final block - load ciphertext
	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
4710	ext     $h2.16b, $h2.16b, $h2.16b, #8
4711
4712	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
	movi	$t0.8b, #0						@ suppress further partial tag feed in
4714	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
4715
4716	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
4717	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
4718	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-1 block - store result
4719
4720	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
4721
4722	eor3	$res4b, $res1b, $ctr7b, $t1.16b				@ AES final block - result
4723
4724	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
4725
4726	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
4727
4728	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
4729
4730	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
4731
4732	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
4733	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
4734.L192_dec_blocks_less_than_1:						@ blocks left <= 1
4735
4736	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
4737	and	$bit_length, $bit_length, #127				@ bit_length %= 128
4738
4739	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
4740	str	$rtmp_ctrq, [$counter]					@ store the updated counter
4741
4742	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
4743	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
4744
4745	and	$bit_length, $bit_length, #127				@ bit_length %= 128
4746
4747	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
4748	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
4749	cmp	$bit_length, #64
4750
4751	csel	$temp2_x, $temp1_x, $temp0_x, lt
4752	csel	$temp3_x, $temp0_x, xzr, lt
4753	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
4754	ext     $h1.16b, $h1.16b, $h1.16b, #8
4755
4756	mov	$ctr0.d[1], $temp3_x
4757	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
4758
4759	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
4760
4761	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
4762	bif	$res4b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
4763
4764	rev64	$res0b, $res1b						@ GHASH final block
4765
4766	st1	{ $res4b}, [$output_ptr]				@ store all 16B
4767
4768	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4769
4770	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
4771	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
4772
4773	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
4774	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
4775	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
4776
4777	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
4778	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
4779
4780	eor	$t10.16b, $acc_hb, $acc_lb				@ MODULO - karatsuba tidy up
4781	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
4782	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
4783
4784	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
4785	ext	$acc_hb, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
4786
4787	eor	$acc_mb, $acc_mb, $t10.16b				@ MODULO - karatsuba tidy up
4788
4789	eor3	$acc_mb, $acc_mb, $acc_hb, $t11.16b			@ MODULO - fold into mid
4790
4791	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
4792	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
4793
4794	eor3	$acc_lb, $acc_lb, $acc_mb, $acc_hb			@ MODULO - fold into low
4795	ext	$acc_lb, $acc_lb, $acc_lb, #8
4796	rev64	$acc_lb, $acc_lb
4797	st1	{ $acc_l.16b }, [$current_tag]
4798
4799	ldp	d10, d11, [sp, #16]
4800	ldp	d12, d13, [sp, #32]
4801	ldp	d14, d15, [sp, #48]
4802	ldp	d8, d9, [sp], #80
4803	ret
4804
4805.L192_dec_ret:
4806	mov w0, #0x0
4807	ret
4808.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
4809___
4810}
4811
4812{
4813
# Register map for the 256-bit encrypt kernel that follows.  Every variable
# below is just a string naming an AArch64 register (or one view of it) that
# is interpolated into the assembly heredoc.  Many names deliberately alias
# the same physical register; the kernel relies on the aliased values having
# disjoint live ranges, so keep that in mind before reordering instructions.

# General-purpose scratch registers: x4-x7 and x13-x14.
# $end_input_ptr / $main_end_input_ptr hold the end of the whole input and the
# end of the portion handled by the unrolled main loop respectively.
my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0-v7 hold the eight counter/keystream blocks, v8-v15 the eight result
# blocks.  The b / bare / d / q variants are different views of the same
# registers (16-byte vector, unqualified name, low 64 bits, full 128 bits).
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));

# $ctr_tN are the same registers as $resN (v8-v15): input blocks are loaded
# into them and then combined in place with the keystream to form the result.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));

# GHASH accumulators: high, middle and low partial products (v17-v19).
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));

# Hash-key operands live in v20-v25.  The h5..h8 set reuses exactly the same
# registers as the h1..h4 set, so each value is (re)loaded from memory right
# before the GHASH step that needs it.
my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));

# GHASH temporaries.  Only v16 and v29 are dedicated; the remaining $tN names
# alias result registers (or v21) whose contents are dead at the point of use.
my $t0="v16";
my $t0d="d16";

my $t1="v29";
my $t2=$res1;
my $t3=$t1;

my $t4=$res0;
my $t5=$res2;
my $t6=$t0;

my $t7=$res3;
my $t8=$res4;
my $t9=$res5;

my $t10=$res6;
# v21 is also $h12k / $h56k, so $t11 must only be written after those values
# have been consumed.
my $t11="v21";
my $t12=$t1;

# Byte-reversed running counter and the per-block increment added to it.
my $rtmp_ctr="v30";
my $rtmp_ctrq="q30";
my $rctr_inc="v31";
my $rctr_incd="d31";

# The reduction constant shares v16 with $t0 / $t6.
my $mod_constantd=$t0d;
my $mod_constant=$t0;

# Round keys rotate through just three registers: rkN lives in v(26 + N % 3).
# The kernel therefore reloads round keys from the key schedule as it goes
# instead of keeping all fifteen resident.
my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Alternate spellings of v28, v26 and v27 (.1q arrangement / bare name) for
# instructions that need those operand forms.
my $rk2q1="v28.1q";
my $rk3q1="v26.1q";
my $rk4v="v27";
4874#########################################################################################
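# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
#                               size_t len,   /* length in bits */
#                               unsigned char *out,
#                               u64 *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
# Argument order as consumed by the code below: the counter block (ivec) is
# taken from the fifth argument (x4) and the key schedule from the sixth (x5).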
4881#
4882$code.=<<___;
4883.global unroll8_eor3_aes_gcm_enc_256_kernel
4884.type   unroll8_eor3_aes_gcm_enc_256_kernel,%function
4885.align  4
4886unroll8_eor3_aes_gcm_enc_256_kernel:
4887	AARCH64_VALID_CALL_TARGET
4888	cbz	x1, .L256_enc_ret
4889	stp	d8, d9, [sp, #-80]!
4890	mov	$counter, x4
4891	mov	$cc, x5
4892	stp	d10, d11, [sp, #16]
4893	stp	d12, d13, [sp, #32]
4894	stp	d14, d15, [sp, #48]
4895	mov	x5, #0xc200000000000000
4896	stp	x5, xzr, [sp, #64]
4897	add	$modulo_constant, sp, #64
4898
4899	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
4900
4901	lsr	$main_end_input_ptr, $bit_length, #3		 	@ byte_len
4902
4903	mov	$constant_temp, #0x100000000			@ set up counter increment
4904	movi	$rctr_inc.16b, #0x0
4905	mov	$rctr_inc.d[1], $constant_temp
4906	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
4907
4908	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4909
4910	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
4911
4912	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
4913
4914	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
4915
4916	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
4917	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
4918
4919	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
4920	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
4921
4922	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
4923	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
4924
4925	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
4926	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
4927
4928	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
4929	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
4930	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
4931
4932	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
4933	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
4934
4935	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
4936
4937	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
4938	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
4939	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
4940
4941	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
4942	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
4943	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
4944
4945	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
4946	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
4947	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
4948
4949	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
4950	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
4951	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
4952
4953	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
4954	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
4955
4956	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
4957
4958	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
4959
4960	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
4961	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
4962	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
4963
4964	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
4965	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
4966	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
4967
4968	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
4969	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
4970	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
4971
4972	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
4973	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
4974	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
4975
4976	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
4977
4978	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
4979	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
4980	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
4981
4982	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
4983	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
4984
4985	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
4986	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
4987	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
4988
4989	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
4990	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
4991
4992	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
4993	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
4994	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
4995
4996	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
4997	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
4998	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
4999
5000	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
5001	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
5002	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
5003
5004	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
5005	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
5006	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
5007
5008	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
5009	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
5010	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
5011
5012	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
5013	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
5014	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
5015
5016	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
5017	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
5018	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
5019
5020	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
5021	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
5022
5023	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
5024	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
5025	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
5026
5027	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
5028	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
5029
5030	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
5031
5032	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
5033	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
5034
5035	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
5036	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
5037	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
5038
5039	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
5040	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
5041	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
5042
5043	ld1	{ $acc_lb}, [$current_tag]
5044	ext	$acc_lb, $acc_lb, $acc_lb, #8
5045	rev64	$acc_lb, $acc_lb
5046	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
5047
5048	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
5049	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
5050	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
5051
5052	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
5053	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
5054	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
5055
5056	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
5057
5058	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 10
5059	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 10
5060	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
5061
5062	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 10
5063	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 10
5064	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 10
5065
5066	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 10
5067	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 10
5068	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 10
5069
5070	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 11
5071	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
5072	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 11
5073
5074	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 11
5075	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 11
5076	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 11
5077
5078	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 11
5079	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 11
5080	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 11
5081
5082	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
5083	ldr	$rk14q, [$cc, #224]					@ load rk14
5084
5085	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 12
5086	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 12
5087	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 12
5088
5089	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 12
5090	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 12
5091	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 12
5092
5093	aese	$ctr2b, $rk13						@ AES block 2 - round 13
5094	aese	$ctr1b, $rk13						@ AES block 1 - round 13
5095	aese	$ctr4b, $rk13						@ AES block 4 - round 13
5096
5097	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 12
5098	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 12
5099
5100	aese	$ctr0b, $rk13						@ AES block 0 - round 13
5101	aese	$ctr5b, $rk13						@ AES block 5 - round 13
5102
5103	aese	$ctr6b, $rk13						@ AES block 6 - round 13
5104	aese	$ctr7b, $rk13						@ AES block 7 - round 13
5105	aese	$ctr3b, $rk13						@ AES block 3 - round 13
5106
5107	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
5108	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
5109	b.ge	.L256_enc_tail						@ handle tail
5110
5111	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
5112
5113	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
5114
5115	eor3	$res0b, $ctr_t0b, $ctr0b, $rk14				@ AES block 0 - result
5116	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
5117	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
5118
5119	eor3	$res1b, $ctr_t1b, $ctr1b, $rk14				@ AES block 1 - result
5120	eor3	$res3b, $ctr_t3b, $ctr3b, $rk14				@ AES block 3 - result
5121
5122	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
5123	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
5124	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
5125
5126	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
5127	eor3	$res2b, $ctr_t2b, $ctr2b, $rk14				@ AES block 2 - result
5128	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
5129
5130	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
5131	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
5132	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
5133
5134	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
5135
5136	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
5137	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
5138
5139	eor3	$res4b, $ctr_t4b, $ctr4b, $rk14				@ AES block 4 - result
5140
5141	eor3	$res7b, $ctr_t7b, $ctr7b, $rk14				@ AES block 7 - result
5142	eor3	$res6b, $ctr_t6b, $ctr6b, $rk14				@ AES block 6 - result
5143	eor3	$res5b, $ctr_t5b, $ctr5b, $rk14				@ AES block 5 - result
5144
5145	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
5146	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
5147
5148	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
5149	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
5150	b.ge	.L256_enc_prepretail					@ do prepretail
5151
5152.L256_enc_main_loop:							@ main loop start
5153	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
5154
5155	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
5156	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
5157	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
5158	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
5159
5160	rev64	$res3b, $res3b						@ GHASH block 8k+3
5161	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
5162	ext     $h5.16b, $h5.16b, $h5.16b, #8
5163	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
5164	ext     $h6.16b, $h6.16b, $h6.16b, #8
5165	rev64	$res1b, $res1b						@ GHASH block 8k+1
5166
5167	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
5168	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
5169	rev64	$res0b, $res0b						@ GHASH block 8k
5170
5171	rev64	$res4b, $res4b						@ GHASH block 8k+4
5172	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
5173	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
5174	ext     $h7.16b, $h7.16b, $h7.16b, #8
5175	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
5176	ext     $h8.16b, $h8.16b, $h8.16b, #8
5177
5178	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
5179	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
5180	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
5181
5182	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
5183	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
5184	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
5185
5186	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
5187	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
5188	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
5189
5190	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
5191	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
5192	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
5193
5194	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
5195	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
5196	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
5197
5198	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
5199	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
5200	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
5201
5202	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
5203	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
5204	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
5205
5206	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5207	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5208	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
5209
5210	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
5211	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
5212	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
5213
5214	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
5215	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
5216	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
5217
5218	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
5219	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
5220	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
5221
5222	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
5223	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
5224	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
5225
5226	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
5227	rev64	$res6b, $res6b						@ GHASH block 8k+6
5228	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
5229
5230	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
5231	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
5232	rev64	$res2b, $res2b						@ GHASH block 8k+2
5233
5234	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
5235	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
5236	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
5237
5238	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
5239	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
5240	rev64	$res5b, $res5b						@ GHASH block 8k+5
5241
5242	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
5243	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
5244	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
5245	ext     $h3.16b, $h3.16b, $h3.16b, #8
5246	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
5247	ext     $h4.16b, $h4.16b, $h4.16b, #8
5248
5249	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5250	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
5251	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
5252
5253	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
5254	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
5255	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
5256
5257	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
5258	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
5259	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
5260
5261	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5262	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
5263	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
5264
5265	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5266	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
5267	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
5268
5269	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
5270	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
5271	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
5272
5273	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
5274	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
5275	rev64	$res7b, $res7b						@ GHASH block 8k+7
5276
5277	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
5278	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
5279	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
5280
5281	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
5282	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
5283	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
5284
5285	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
5286	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
5287	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
5288
5289	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
5290	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
5291	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
5292
5293	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
5294	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
5295	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
5296
5297	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
5298	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
5299	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
5300
5301	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
5302	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
5303	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
5304
5305	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
5306	ext     $h1.16b, $h1.16b, $h1.16b, #8
5307	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
5308	ext     $h2.16b, $h2.16b, $h2.16b, #8
5309	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
5310	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
5311
5312	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
5313	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5314	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
5315	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
5316
5317	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
5318	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
5319	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
5320
5321	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5322	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
5323	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
5324
5325	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
5326	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
5327	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
5328
5329	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
5330	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5331	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
5332
5333	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
5334	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
5335	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
5336
5337	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
5338	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
5339	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
5340
5341	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
5342	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
5343	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
5344
5345	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
5346	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5347	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
5348
5349	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
5350	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
5351	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
5352
5353	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
5354	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
5355	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
5356
5357	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
5358	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
5359	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
5360
5361	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
5362	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
5363	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
5364
5365	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
5366	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
5367	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
5368
5369	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
5370
5371	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
5372	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
5373	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
5374
5375	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
5376	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
5377	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
5378
5379	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
5380	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
5381	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
5382
5383	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
5384	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
5385	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
5386
5387	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
5388
5389	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
5390	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
5391
5392	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
5393	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
5394	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
5395
5396	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
5397	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
5398	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
5399
5400	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
5401	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
5402
5403	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
5404	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
5405
5406	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
5407	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
5408
5409	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
5410	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
5411	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
5412
5413	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
5414	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
5415	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
5416
5417	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
5418	ldr	$rk14q, [$cc, #224]					@ load rk14
5419	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
5420
5421	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
5422	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
5423	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
5424
5425	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
5426	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 12
	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load plaintext
5428
	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load plaintext
5430	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
5431	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
5432
5433	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
5434	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
5435	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
5436
5437	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
5438	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
5439	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
5440
5441	eor3	$res2b, $ctr_t2b, $ctr2b, $rk14				@ AES block 8k+10 - result
5442	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
5443	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
5444
5445	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
5446	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
	eor3	$res5b, $ctr_t5b, $ctr5b, $rk14				@ AES block 8k+13 - result
5448
5449	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
5450	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
5451	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
5452
	eor3	$res4b, $ctr_t4b, $ctr4b, $rk14				@ AES block 8k+12 - result
5454	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
5455	eor3	$res3b, $ctr_t3b, $ctr3b, $rk14				@ AES block 8k+11 - result
5456
5457	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
5458	eor3	$res1b, $ctr_t1b, $ctr1b, $rk14				@ AES block 8k+9 - result
5459	eor3	$res0b, $ctr_t0b, $ctr0b, $rk14				@ AES block 8k+8 - result
5460
5461	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
5462	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
5463	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
5464
	eor3	$res7b, $ctr_t7b, $ctr7b, $rk14				@ AES block 8k+15 - result
5466	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
5467	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
5468
	eor3	$res6b, $ctr_t6b, $ctr6b, $rk14				@ AES block 8k+14 - result
5470	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
5472
	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
5474	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
5475	b.lt	.L256_enc_main_loop
5476
5477.L256_enc_prepretail:							@ PREPRETAIL
5478	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
5479	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
5480	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
5481
5482	rev64	$res2b, $res2b						@ GHASH block 8k+2
5483
5484	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
5485	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
5486
5487	rev64	$res5b, $res5b						@ GHASH block 8k+5
5488	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
5489	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
5490
5491	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
5492
5493	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
5494	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
5495	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
5496
5497	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
5498	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
5499
5500	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
5501	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
5502	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
5503
5504	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
5505	rev64	$res0b, $res0b						@ GHASH block 8k
5506	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
5507
5508	rev64	$res1b, $res1b						@ GHASH block 8k+1
5509	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
5510	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
5511
5512	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
5513	ext     $h7.16b, $h7.16b, $h7.16b, #8
5514	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
5515	ext     $h8.16b, $h8.16b, $h8.16b, #8
5516	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
5517
5518	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
5519	ext     $h5.16b, $h5.16b, $h5.16b, #8
5520	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
5521	ext     $h6.16b, $h6.16b, $h6.16b, #8
5522	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
5523	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
5524
5525	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
5526	eor	$res0b, $res0b, $acc_lb					@ PRE 1
5527
5528	rev64	$res3b, $res3b						@ GHASH block 8k+3
5529	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
5530
5531	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
5532	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
5533	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
5534
5535	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
5536	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
5537	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
5538
5539	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
5540	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
5541	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
5542
5543	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
5544	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5545	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
5546
5547	rev64	$res6b, $res6b						@ GHASH block 8k+6
5548	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
5549	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
5550
5551	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
5552	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
5553	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5554
5555	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
5556	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
5557
5558	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
5559	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
5560	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
5561
5562	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
5563	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
5564	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
5565
5566	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
5567	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
5568	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
5569
5570	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
5571	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
5572	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
5573
5574	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
5575	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
5576	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
5577
5578	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
5579	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
5580	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
5581
5582	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
5583	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5584	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5585
5586	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
5587	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
5588	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
5589
5590	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
5591	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
5592	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
5593
5594	rev64	$res4b, $res4b						@ GHASH block 8k+4
5595	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
5596	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
5597
5598	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
5599	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
5600	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
5601
5602	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
5603	ext     $h3.16b, $h3.16b, $h3.16b, #8
5604	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
5605	ext     $h4.16b, $h4.16b, $h4.16b, #8
5606	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
5607	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
5608
5609	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
5610	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
5611
5612	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
5613	rev64	$res7b, $res7b						@ GHASH block 8k+7
5614	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5615
5616	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
5617	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
5618	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
5619
5620	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
5621	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
5622	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
5623
5624	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
5625	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5626	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
5627	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
5628
5629	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
5630	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
5631	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
5632
5633	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
5634	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
5635	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
5636	ext     $h1.16b, $h1.16b, $h1.16b, #8
5637	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
5638	ext     $h2.16b, $h2.16b, $h2.16b, #8
5639
5640	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
5641	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
5642	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
5643
5644	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
5645	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5646
5647	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
5648	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
5649	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
5650
5651	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
5652	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
5653	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
5654
5655	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
5656	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
5657	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
5658
5659	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5660	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5661	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
5662
5663	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
5664	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
5665	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
5666
5667	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
5668	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
5669	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
5670
5671	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
5672	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
5673	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
5674
5675	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
5676	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
5677	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
5678
5679	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
5680	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
5681	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
5682
5683	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
5684	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
5685	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
5686
5687	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
5688	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
5689	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
5690
5691	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
5692	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
5693	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
5694
5695	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
5696
5697	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
5698	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
5699	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
5700
5701	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
5702	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
5703
5704	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
5705	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
5706	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
5707
5708	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
5709	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
5710	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
5711
5712	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
5713	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
5714	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
5715
5716	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
5717	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
5718	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
5719
5720	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
5721	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
5722	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
5723
5724	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
5725	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
5726	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
5727
5728	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
5729	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
5730	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
5731
5732	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
5733	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
5734	ldr	$rk14q, [$cc, #224]					@ load rk14
5735
5736	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 12
5737	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
5738	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
5739
5740	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
5741	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
5742	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
5743
5744	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
5745	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
5746
5747	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
5748	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
5749	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
5750
5751	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
5752	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
5753	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
5754
5755	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
5756	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
5757	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
5758
5759	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
5760	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
5761.L256_enc_tail:								@ TAIL
5762
	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k, h8l | h8h
5764        ext     $h8.16b, $h8.16b, $h8.16b, #8
5765	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr		@ main_end_input_ptr is number of bytes left to process
5766
5767	ldr	$ctr_t0q, [$input_ptr], #16				@ AES block 8k+8 - load plaintext
5768
	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h, h6k | h5k
5770        ext     $h5.16b, $h5.16b, $h5.16b, #8
5771
5772	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h, h7l | h7h
5774        ext     $h6.16b, $h6.16b, $h6.16b, #8
5775        ext     $h7.16b, $h7.16b, $h7.16b, #8
5776	mov	$t1.16b, $rk14
5777
5778	cmp	$main_end_input_ptr, #112
5779	eor3	$res1b, $ctr_t0b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
5780	b.gt	.L256_enc_blocks_more_than_7
5781
5782	movi	$acc_l.8b, #0
5783	mov	$ctr7b, $ctr6b
5784	movi	$acc_h.8b, #0
5785
5786	mov	$ctr6b, $ctr5b
5787	mov	$ctr5b, $ctr4b
5788	mov	$ctr4b, $ctr3b
5789
5790	mov	$ctr3b, $ctr2b
5791	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5792	mov	$ctr2b, $ctr1b
5793
5794	movi	$acc_m.8b, #0
5795	cmp	$main_end_input_ptr, #96
5796	b.gt	.L256_enc_blocks_more_than_6
5797
5798	mov	$ctr7b, $ctr6b
5799	mov	$ctr6b, $ctr5b
5800	cmp	$main_end_input_ptr, #80
5801
5802	mov	$ctr5b, $ctr4b
5803	mov	$ctr4b, $ctr3b
5804	mov	$ctr3b, $ctr1b
5805
5806	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5807	b.gt	.L256_enc_blocks_more_than_5
5808
5809	mov	$ctr7b, $ctr6b
5810	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5811
5812	mov	$ctr6b, $ctr5b
5813	mov	$ctr5b, $ctr4b
5814
5815	cmp	$main_end_input_ptr, #64
5816	mov	$ctr4b, $ctr1b
5817	b.gt	.L256_enc_blocks_more_than_4
5818
5819	cmp	$main_end_input_ptr, #48
5820	mov	$ctr7b, $ctr6b
5821	mov	$ctr6b, $ctr5b
5822
5823	mov	$ctr5b, $ctr1b
5824	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5825	b.gt	.L256_enc_blocks_more_than_3
5826
5827	cmp	$main_end_input_ptr, #32
5828	mov	$ctr7b, $ctr6b
5829	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5830
5831	mov	$ctr6b, $ctr1b
5832	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5833	b.gt	.L256_enc_blocks_more_than_2
5834
5835	mov	$ctr7b, $ctr1b
5836
5837	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5838	cmp	$main_end_input_ptr, #16
5839	b.gt	.L256_enc_blocks_more_than_1
5840
5841	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5842	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
5843	b	 .L256_enc_blocks_less_than_1
5844.L256_enc_blocks_more_than_7:						@ blocks left >  7
5845	st1	{ $res1b}, [$output_ptr], #16				@ AES final-7 block  - store result
5846
5847	rev64	$res0b, $res1b						@ GHASH final-7 block
5848
5849	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5850
5851	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-6 block - load plaintext
5852
5853	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
5854	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
5855	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
5856
	movi	$t0.8b, #0						@ suppress further partial tag feed in
5858
5859	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
5860	eor3	$res1b, $ctr_t1b, $ctr1b, $t1.16b			@ AES final-6 block - result
5861
5862	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d				@ GHASH final-7 block - mid
5863	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
5864.L256_enc_blocks_more_than_6:						@ blocks left >  6
5865
5866	st1	{ $res1b}, [$output_ptr], #16				@ AES final-6 block - store result
5867
5868	rev64	$res0b, $res1b						@ GHASH final-6 block
5869
5870	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5871
5872	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
5873	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
5874	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
5875
5876	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-5 block - load plaintext
5877
5878	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
5879
5880	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
5881
5882	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
5883	eor3	$res1b, $ctr_t1b, $ctr2b, $t1.16b			@ AES final-5 block - result
5884
	movi	$t0.8b, #0						@ suppress further partial tag feed in
5886
5887	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
5888	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
5889.L256_enc_blocks_more_than_5:						@ blocks left >  5
5890
5891	st1	{ $res1b}, [$output_ptr], #16				@ AES final-5 block - store result
5892
5893	rev64	$res0b, $res1b						@ GHASH final-5 block
5894
5895	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5896
5897	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
5898
5899	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
5900
5901	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
5902	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
5903
5904	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
5905
5906	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-4 block - load plaintext
5907	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
5908
5909	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
	movi	$t0.8b, #0						@ suppress further partial tag feed in
5911	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
5912
5913	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
5914	eor3	$res1b, $ctr_t1b, $ctr3b, $t1.16b			@ AES final-4 block - result
5915.L256_enc_blocks_more_than_4:						@ blocks left >  4
5916
5917	st1	{ $res1b}, [$output_ptr], #16				@ AES final-4 block - store result
5918
5919	rev64	$res0b, $res1b						@ GHASH final-4 block
5920
5921	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-3 block - load plaintext
5922
5923	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5924
5925	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
5926	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
5927
5928	eor3	$res1b, $ctr_t1b, $ctr4b, $t1.16b			@ AES final-3 block - result
5929	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
5930
5931	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
5932	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
5933
5934	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
5935
	movi	$t0.8b, #0						@ suppress further partial tag feed in
5937
5938	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
5939	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
5940.L256_enc_blocks_more_than_3:						@ blocks left >  3
5941
5942	st1	{ $res1b}, [$output_ptr], #16				@ AES final-3 block - store result
5943
5944	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
5945	ext     $h4.16b, $h4.16b, $h4.16b, #8
5946	rev64	$res0b, $res1b						@ GHASH final-3 block
5947
5948	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5949
5950	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
5951	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
5952
5953	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
5954	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
5955	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5956
5957	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
5958	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-2 block - load plaintext
5959
5960	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
5961	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
5962
5963	eor3	$res1b, $ctr_t1b, $ctr5b, $t1.16b			@ AES final-2 block - result
	movi	$t0.8b, #0						@ suppress further partial tag feed in
5965
5966	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
5967	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
5968.L256_enc_blocks_more_than_2:						@ blocks left >  2
5969
5970	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
5971	ext     $h3.16b, $h3.16b, $h3.16b, #8
5972
5973	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-2 block - store result
5974
5975	rev64	$res0b, $res1b						@ GHASH final-2 block
5976	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-1 block - load plaintext
5977
5978	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5979
5980	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
5981
	movi	$t0.8b, #0						@ suppress further partial tag feed in
5983
5984	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
5985	eor3	$res1b, $ctr_t1b, $ctr6b, $t1.16b			@ AES final-1 block - result
5986
5987	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
5988
5989	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
5990
5991	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
5992	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
5993
5994	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
5995	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
5996.L256_enc_blocks_more_than_1:						@ blocks left >  1
5997
5998	st1	{ $res1b}, [$output_ptr], #16				@ AES final-1 block - store result
5999
6000	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
6001	ext     $h2.16b, $h2.16b, $h2.16b, #8
6002	rev64	$res0b, $res1b						@ GHASH final-1 block
6003	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final block - load plaintext
6004
6005	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
	movi	$t0.8b, #0						@ suppress further partial tag feed in
6007
6008	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
6009	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
6010
6011	eor3	$res1b, $ctr_t1b, $ctr7b, $t1.16b			@ AES final block - result
6012	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
6013
6014	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
6015	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
6016
6017	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
6018
6019	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
6020	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
6021
6022	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
6023
6024	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
6025.L256_enc_blocks_less_than_1:						@ blocks left <= 1
6026
6027	and	$bit_length, $bit_length, #127				@ bit_length %= 128
6028
6029	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
6030
6031	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
6032
6033	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
6034	and	$bit_length, $bit_length, #127				@ bit_length %= 128
6035
6036	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
6037	cmp	$bit_length, #64
6038	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
6039
6040	csel	$temp3_x, $temp0_x, xzr, lt
6041	csel	$temp2_x, $temp1_x, $temp0_x, lt
6042
6043	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
6044	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
6045	ext     $h1.16b, $h1.16b, $h1.16b, #8
6046
6047	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
6048	mov	$ctr0.d[1], $temp3_x
6049
6050	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
6051
6052	rev64	$res0b, $res1b						@ GHASH final block
6053
6054	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
6055	bif	$res1b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
6056	str	$rtmp_ctrq, [$counter]					@ store the updated counter
6057
6058	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
6059	st1	{ $res1b}, [$output_ptr]				@ store all 16B
6060
6061	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
6062	pmull2	$rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
6063	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
6064
6065	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
6066	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
6067
6068	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
6069
6070	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
6071
6072	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
6073	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
6074
6075	ext	$t11.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
6076
6077	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
6078	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
6079
6080	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
6081
6082	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
6083	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
6084
6085	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
6086		ext	$acc_lb, $acc_lb, $acc_lb, #8
6087	rev64	$acc_lb, $acc_lb
6088	st1	{ $acc_l.16b }, [$current_tag]
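6089	lsr	x0, $bit_length, #3					@ return value - NOTE(review): bit_length was reduced above to the unused-bit count of the last block, so this is not the total byte count - confirm callers ignore it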
6090
6091        ldp     d10, d11, [sp, #16]
6092	ldp     d12, d13, [sp, #32]
6093	ldp     d14, d15, [sp, #48]
6094	ldp     d8, d9, [sp], #80
6095	ret
6096
6097.L256_enc_ret:
6098	mov w0, #0x0
6099	ret
6100.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6101___
6102
6103{
6104#########################################################################################
6105# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
6106#                               size_t len,             /* length in bits */
6107#                               unsigned char *out,
6108#                               u64 *Xi,
6109#                               unsigned char ivec[16],
6110#                               const void *key);
6111#
6112$code.=<<___;
6113.global unroll8_eor3_aes_gcm_dec_256_kernel
6114.type   unroll8_eor3_aes_gcm_dec_256_kernel,%function
6115.align  4
6116unroll8_eor3_aes_gcm_dec_256_kernel:
6117	AARCH64_VALID_CALL_TARGET
6118	cbz	x1, .L256_dec_ret
6119	stp	d8, d9, [sp, #-80]!
6120	mov	$counter, x4
6121	mov	$cc, x5
6122	stp	d10, d11, [sp, #16]
6123	stp	d12, d13, [sp, #32]
6124	stp	d14, d15, [sp, #48]
6125	mov	x5, #0xc200000000000000
6126	stp	x5, xzr, [sp, #64]
6127	add	$modulo_constant, sp, #64
6128
6129	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
6130
6131	mov	$constant_temp, #0x100000000			@ set up counter increment
6132	movi	$rctr_inc.16b, #0x0
6133	mov	$rctr_inc.d[1], $constant_temp
6134	lsr	$main_end_input_ptr, $bit_length, #3		  	@ byte_len
6135
6136	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
6137
6138	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
6139
6140	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
6141
6142	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
6143	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
6144
6145	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
6146	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
6147	ldp	$rk0q, $rk1q, [$cc, #0]				  	@ load rk0, rk1
6148
6149	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
6150	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
6151
6152	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
6153	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
6154
6155	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
6156
6157	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
6158	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
6159
6160	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
6161	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
6162
6163	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
6164	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
6165
6166	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
6167	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
6168
6169	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b		        @ AES block 6 - round 0
6170	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
6171
6172	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
6173	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b		        @ AES block 7 - round 0
6174	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
6175
6176	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b		        @ AES block 6 - round 1
6177	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b		        @ AES block 4 - round 1
6178	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b		        @ AES block 0 - round 1
6179
6180	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
6181	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
6182	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
6183
6184	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
6185	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
6186
6187	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
6188	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
6189	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
6190
6191	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
6192	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
6193	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
6194
6195	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
6196	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
6197	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
6198
6199	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
6200	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
6201
6202	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
6203	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
6204
6205	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
6206	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
6207	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
6208
6209	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
6210
6211	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
6212	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
6213
6214	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
6215	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
6216	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
6217
6218	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
6219	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
6220	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
6221
6222	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
6223	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
6224
6225	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
6226	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
6227	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
6228
6229	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
6230
6231	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
6232	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
6233
6234	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
6235
6236	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
6237	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
6238	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
6239
6240	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
6241	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
6242	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
6243
6244	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
6245	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
6246	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
6247
6248	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
6249	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
6250
6251	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
6252	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
6253	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
6254
6255	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
6256	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
6257	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
6258
6259	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
6260	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
6261	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
6262
6263	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
6264	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
6265	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
6266
6267	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
6268	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
6269	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
6270
6271	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
6272
6273	ld1	{ $acc_lb}, [$current_tag]
6274	ext	$acc_lb, $acc_lb, $acc_lb, #8
6275	rev64	$acc_lb, $acc_lb
6276	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
6277	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
6278	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
6279
6280	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
6281	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
6282
6283	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
6284	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
6285
6286	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
6287
6288	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
6289	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
6290
6291	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 10
6292	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 10
6293	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 10
6294
6295	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 10
6296	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 10
6297	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 10
6298
6299	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 10
6300	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 10
6301	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
6302
6303	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 11
6304	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
6305
6306	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 11
6307	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 11
6308	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 11
6309
6310	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 11
6311	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 11
6312	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 11
6313
6314	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 11
6315	ldr	$rk14q, [$cc, #224]					@ load rk14
6316
6317	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 12
6318	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 12
6319	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 12
6320
6321	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
6322	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 12
6323	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 12
6324
6325	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 12
6326	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 12
6327	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 12
6328
6329	aese	$ctr5b, $rk13						@ AES block 5 - round 13
6330	aese	$ctr1b, $rk13						@ AES block 1 - round 13
6331	aese	$ctr2b, $rk13						@ AES block 2 - round 13
6332
6333	aese	$ctr0b, $rk13						@ AES block 0 - round 13
6334	aese	$ctr4b, $rk13						@ AES block 4 - round 13
6335	aese	$ctr6b, $rk13						@ AES block 6 - round 13
6336
6337	aese	$ctr3b, $rk13						@ AES block 3 - round 13
6338	aese	$ctr7b, $rk13						@ AES block 7 - round 13
6339	b.ge	.L256_dec_tail						@ handle tail
6340
6341	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 0, 1 - load ciphertext
6342
6343	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 2, 3 - load ciphertext
6344
6345	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 4, 5 - load ciphertext
6346
6347	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 6, 7 - load ciphertext
6348	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
6349
6350	eor3	$ctr1b, $res1b, $ctr1b, $rk14				@ AES block 1 - result
6351	eor3	$ctr0b, $res0b, $ctr0b, $rk14				@ AES block 0 - result
6352	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 0, 1 - store result
6353
6354	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
6355	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
6356	eor3	$ctr3b, $res3b, $ctr3b, $rk14				@ AES block 3 - result
6357
6358	eor3	$ctr5b, $res5b, $ctr5b, $rk14				@ AES block 5 - result
6359
6360	eor3	$ctr4b, $res4b, $ctr4b, $rk14				@ AES block 4 - result
6361	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
6362	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
6363
6364	eor3	$ctr2b, $res2b, $ctr2b, $rk14				@ AES block 2 - result
6365	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 2, 3 - store result
6366
6367	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
6368	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
6369
6370	eor3	$ctr6b, $res6b, $ctr6b, $rk14				@ AES block 6 - result
6371
6372	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
6373	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
6374	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 4, 5 - store result
6375
6376	eor3	$ctr7b, $res7b, $ctr7b, $rk14				@ AES block 7 - result
6377	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 6, 7 - store result
6378
6379	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
6380	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
6381	b.ge	.L256_dec_prepretail					@ do prepretail
6382
6383.L256_dec_main_loop:							@ main loop start
6384	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
6385	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
6386	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
6387
6388	rev64	$res1b, $res1b						@ GHASH block 8k+1
6389	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
6390	ext     $h7.16b, $h7.16b, $h7.16b, #8
6391	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
6392	ext     $h8.16b, $h8.16b, $h8.16b, #8
6393
6394	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
6395	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
6396	rev64	$res0b, $res0b						@ GHASH block 8k
6397
6398	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
6399	rev64	$res4b, $res4b						@ GHASH block 8k+4
6400	rev64	$res3b, $res3b						@ GHASH block 8k+3
6401
6402	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
6403	rev64	$res7b, $res7b						@ GHASH block 8k+7
6404
6405	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
6406	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
6407	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
6408
6409	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
6410	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
6411	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
6412
6413	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
6414	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
6415	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
6416
6417	eor	$res0b, $res0b, $acc_lb					@ PRE 1
6418	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
6419	ext     $h5.16b, $h5.16b, $h5.16b, #8
6420	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
6421	ext     $h6.16b, $h6.16b, $h6.16b, #8
6422	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
6423
6424	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
6425	rev64	$res2b, $res2b						@ GHASH block 8k+2
6426	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
6427
6428	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
6429	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
6430	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
6431
6432	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6433	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
6434	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
6435
6436	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
6437	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
6438	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
6439
6440	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
6441	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
6442	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
6443
6444	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
6445	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
6446	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
6447
6448	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
6449	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
6450	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
6451
6452	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
6453	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
6454	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
6455
6456	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
6457	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
6458	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
6459
6460	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
6461	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
6462	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6463
6464	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
6465	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
6466	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
6467
6468	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
6469	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
6470	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
6471
6472	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
6473	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
6474	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
6475
6476	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
6477	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
6478	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
6479
6480	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
6481	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
6482	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
6483	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
6484
6485	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
6486	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
6487	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
6488
6489	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
6490	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
6491	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
6492
6493	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
6494	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
6495	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
6496
6497	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
6498	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6499	rev64	$res5b, $res5b						@ GHASH block 8k+5
6500
6501	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
6502	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
6503	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6504
6505	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
6506	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
6507	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
6508
6509	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6510	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
6511	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
6512
6513	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
6514	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
6515	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
6516
6517	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
6518	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
6519	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
6520
6521	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
6522	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
6523	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
6524
6525	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
6526	ext     $h3.16b, $h3.16b, $h3.16b, #8
6527	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
6528	ext     $h4.16b, $h4.16b, $h4.16b, #8
6529	rev64	$res6b, $res6b						@ GHASH block 8k+6
6530	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
6531
6532	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
6533	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
6534	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
6535
6536	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
6537	ext     $h1.16b, $h1.16b, $h1.16b, #8
6538	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
6539	ext     $h2.16b, $h2.16b, $h2.16b, #8
6540	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
6541	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
6542
6543	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
6544	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
6545	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
6546
6547	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
6548	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
6549	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
6550	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
6551
6552	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
6553	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
6554	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6555
6556	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
6557	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
6558	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
6559
6560	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
6561	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
6562	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
6563
6564	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
6565	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
6566	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
6567
6568	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6569	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
6570	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
6571
6572	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
6573	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
6574	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6575
6576	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
6577	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
6578	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
6579
6580	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
6581	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
6582	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
6583
6584	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load ciphertext
6585	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
6586	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
6587
6588	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
6589	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
6590	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
6591
6592	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
6593	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
6594	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
6595
6596	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
6597	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
6598	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
6599
6600	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
6601	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
6602	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
6603
6604	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
6605	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
6606	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
6607
6608	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
6609	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
6610	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
6611
6612	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
6613	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
6614	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
6615
6616	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
6617	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
6618	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
6619
6620	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
6621	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
6622	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
6623
6624	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
6625	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
6626
6627	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
6628	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
6629	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
6630
6631	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load ciphertext
6632	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
6633	ext	$t11.16b, $acc_hb, $acc_hb, #8				 @ MODULO - other top alignment
6634
6635	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
6636	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
6637	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
6638
6639	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
6640	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
6641	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
6642
6643	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
6644	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
6645	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
6646
6647	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
6648	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 12
6649	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
6650
6651	ldr	$rk14q, [$cc, #224]					@ load rk14
6652	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
6653	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
6654
6655	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
6656	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
6657	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
6658
6659	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load ciphertext
6660	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
6661	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
6662
6663	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load ciphertext
6664	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
6665	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
6666
6667	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
6668	eor3	$ctr2b, $res2b, $ctr2b, $rk14				@ AES block 8k+10 - result
6669	eor3	$ctr1b, $res1b, $ctr1b, $rk14				@ AES block 8k+9 - result
6670
6671	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
6672	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
6673
6674	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
6675	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
6676	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
6677
6678	eor3	$ctr5b, $res5b, $ctr5b, $rk14				@ AES block 8k+13 - result
6679	eor3	$ctr0b, $res0b, $ctr0b, $rk14				@ AES block 8k+8 - result
6680	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
6681
6682	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
6683	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
6684	eor3	$ctr4b, $res4b, $ctr4b, $rk14				@ AES block 8k+12 - result
6685
6686	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
6687	eor3	$ctr3b, $res3b, $ctr3b, $rk14				@ AES block 8k+11 - result
6688	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
6689
6690	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
6691	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
6692	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
6693
6694	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
6695	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
6696	eor3	$ctr7b, $res7b, $ctr7b, $rk14				@ AES block 8k+15 - result
6697
6698	eor3	$ctr6b, $res6b, $ctr6b, $rk14				@ AES block 8k+14 - result
6699	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
6700	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
6701
6702	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
6703	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
6704	b.lt	.L256_dec_main_loop
6705
6706.L256_dec_prepretail:							@ PREPRETAIL
6707	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
6708	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
6709	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
6710
6711	rev64	$res4b, $res4b						@ GHASH block 8k+4
6712	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
6713	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
6714
6715	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
6716	rev64	$res0b, $res0b						@ GHASH block 8k
6717	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
6718
6719	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
6720	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
6721	ext     $h7.16b, $h7.16b, $h7.16b, #8
6722	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
6723	ext     $h8.16b, $h8.16b, $h8.16b, #8
6724	rev64	$res1b, $res1b						@ GHASH block 8k+1
6725
6726	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
6727	rev64	$res2b, $res2b						@ GHASH block 8k+2
6728	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
6729	ext     $h5.16b, $h5.16b, $h5.16b, #8
6730	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
6731	ext     $h6.16b, $h6.16b, $h6.16b, #8
6732
6733	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
6734	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
6735	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
6736
6737	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
6738	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
6739	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
6740
6741	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
6742	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
6743	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
6744
6745	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
6746	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
6747	eor	$res0b, $res0b, $acc_lb					@ PRE 1
6748
6749	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
6750	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
6751	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
6752
6753	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
6754	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
6755	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
6756
6757	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
6758	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6759	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
6760
6761	rev64	$res3b, $res3b						@ GHASH block 8k+3
6762	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
6763
6764	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
6765	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
6766	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
6767
6768	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
6769	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
6770	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
6771
6772	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
6773	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
6774
6775	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
6776	rev64	$res6b, $res6b						@ GHASH block 8k+6
6777
6778	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
6779	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
6780	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
6781
6782	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
6783	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6784	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
6785
6786	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
6787	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
6788	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
6789
6790	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
6791	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
6792	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
6793
6794	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
6795	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
6796	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
6797
6798	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
6799	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6800	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6801
6802	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
6803	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
6804	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
6805
6806	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
6807	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
6808	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
6809
6810	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
6811	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
6812	ext     $h1.16b, $h1.16b, $h1.16b, #8
6813	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
6814	ext     $h2.16b, $h2.16b, $h2.16b, #8
6815	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
6816
6817	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
6818	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
6819	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
6820
6821	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
6822	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
6823	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
6824
6825	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
6826	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
6827	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
6828
6829	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
6830	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
6831	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
6832
6833	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
6834	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
6835	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
6836
6837	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
6838	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
6839	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
6840
6841	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
6842	ext     $h3.16b, $h3.16b, $h3.16b, #8
6843	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
6844	ext     $h4.16b, $h4.16b, $h4.16b, #8
6845	rev64	$res7b, $res7b						@ GHASH block 8k+7
6846	rev64	$res5b, $res5b						@ GHASH block 8k+5
6847
6848	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
6849
6850	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6851
6852	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
6853	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
6854	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
6855	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
6856
6857	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
6858	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
6859
6860	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
6861	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
6862	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
6863
6864	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6865	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
6866	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6867
6868	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
6869	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
6870	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
6871
6872	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
6873	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
6874	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
6875
6876	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
6877	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
6878	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
6879
6880	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
6881	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
6882
6883	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
6884	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
6885	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
6886
6887	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
6888	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6889	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
6890
6891	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
6892	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
6893	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
6894
6895	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
6896	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
6897	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
6898
6899	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
6900	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
6901	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
6902
6903	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
6904	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
6905	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
6906
6907	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
6908	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
6909	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
6910
6911	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
6912	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
6913	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
6914
6915	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
6916	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
6917	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
6918
6919	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
6920	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
6921	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
6922
6923	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
6924	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
6925	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
6926
6927	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
6928	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
6929	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
6930
6931	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
6932
6933	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
6934	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
6935	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
6936
6937	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
6938	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
6939	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
6940
6941	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
6942
6943	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
6944	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
6945	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
6946
6947	ext	$t11.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
6948
6949	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
6950	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
6951	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
6952
6953	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
6954	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
6955
6956	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
6957	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
6958	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
6959
6960	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
6961	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
6962
6963	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
6964
6965	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
6966	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
6967	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
6968
6969	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
6970	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
6971	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
6972
6973	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
6974	ldr	$rk14q, [$cc, #224]					@ load rk14
6975	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b	        	@ AES block 8k+9 - round 12
6976
6977	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
6978	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
6979	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
6980
6981	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
6982	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
6983	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
6984
6985	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
6986	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
6987	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
6988
6989	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
6990	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
6991.L256_dec_tail:								@ TAIL
6992
6993	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
6994	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr		@ main_end_input_ptr is number of bytes left to process
6995	cmp	$main_end_input_ptr, #112
6996
6997	ldr	$res1q, [$input_ptr], #16				@ AES block 8k+8 - load ciphertext
6998
6999	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k
7000        ext     $h8.16b, $h8.16b, $h8.16b, #8
7001	mov	$t1.16b, $rk14
7002
7003	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h
7004        ext     $h5.16b, $h5.16b, $h5.16b, #8
7005
7006	eor3	$res4b, $res1b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
7007	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h
7008        ext     $h6.16b, $h6.16b, $h6.16b, #8
7009        ext     $h7.16b, $h7.16b, $h7.16b, #8
7010	b.gt	.L256_dec_blocks_more_than_7
7011
7012	mov	$ctr7b, $ctr6b
7013	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7014	mov	$ctr6b, $ctr5b
7015
7016	mov	$ctr5b, $ctr4b
7017	mov	$ctr4b, $ctr3b
7018	movi	$acc_l.8b, #0
7019
7020	movi	$acc_h.8b, #0
7021	movi	$acc_m.8b, #0
7022	mov	$ctr3b, $ctr2b
7023
7024	cmp	$main_end_input_ptr, #96
7025	mov	$ctr2b, $ctr1b
7026	b.gt	.L256_dec_blocks_more_than_6
7027
7028	mov	$ctr7b, $ctr6b
7029	mov	$ctr6b, $ctr5b
7030
7031	mov	$ctr5b, $ctr4b
7032	cmp	$main_end_input_ptr, #80
7033	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7034
7035	mov	$ctr4b, $ctr3b
7036	mov	$ctr3b, $ctr1b
7037	b.gt	.L256_dec_blocks_more_than_5
7038
7039	cmp	$main_end_input_ptr, #64
7040	mov	$ctr7b, $ctr6b
7041	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7042
7043	mov	$ctr6b, $ctr5b
7044
7045	mov	$ctr5b, $ctr4b
7046	mov	$ctr4b, $ctr1b
7047	b.gt	.L256_dec_blocks_more_than_4
7048
7049	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7050	mov	$ctr7b, $ctr6b
7051	cmp	$main_end_input_ptr, #48
7052
7053	mov	$ctr6b, $ctr5b
7054	mov	$ctr5b, $ctr1b
7055	b.gt	.L256_dec_blocks_more_than_3
7056
7057	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
7058	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7059	mov	$ctr7b, $ctr6b
7060
7061	cmp	$main_end_input_ptr, #32
7062	mov	$ctr6b, $ctr1b
7063	b.gt	.L256_dec_blocks_more_than_2
7064
7065	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7066
7067	mov	$ctr7b, $ctr1b
7068	cmp	$main_end_input_ptr, #16
7069	b.gt	.L256_dec_blocks_more_than_1
7070
7071	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7072	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
7073	b	 .L256_dec_blocks_less_than_1
7074.L256_dec_blocks_more_than_7:						@ blocks left >  7
7075	rev64	$res0b, $res1b						@ GHASH final-7 block
7076	ldr	$res1q, [$input_ptr], #16				@ AES final-6 block - load ciphertext
7077	st1	{ $res4b}, [$output_ptr], #16				@ AES final-7 block  - store result
7078
7079	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
7080
7081	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7082
7083	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
7084	eor3	$res4b, $res1b, $ctr1b, $t1.16b				@ AES final-6 block - result
7085
7086	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
7087
7088	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
7089	movi	$t0.8b, #0						@ supress further partial tag feed in
7090
7091	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
7092	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
7093.L256_dec_blocks_more_than_6:						@ blocks left >  6
7094
7095	rev64	$res0b, $res1b						@ GHASH final-6 block
7096
7097	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7098	ldr	$res1q, [$input_ptr], #16				@ AES final-5 block - load ciphertext
7099	movi	$t0.8b, #0						@ supress further partial tag feed in
7100
7101	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
7102	st1	{ $res4b}, [$output_ptr], #16				@ AES final-6 block - store result
7103	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
7104
7105	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
7106
7107	eor3	$res4b, $res1b, $ctr2b, $t1.16b				@ AES final-5 block - result
7108	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
7109	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
7110
7111	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
7112
7113	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
7114	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
7115.L256_dec_blocks_more_than_5:						@ blocks left >  5
7116
7117	rev64	$res0b, $res1b						@ GHASH final-5 block
7118
7119	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7120
7121	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
7122	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
7123
7124	ldr	$res1q, [$input_ptr], #16				@ AES final-4 block - load ciphertext
7125
7126	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
7127	st1	{ $res4b}, [$output_ptr], #16			  	@ AES final-5 block - store result
7128
7129	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
7130	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
7131
7132	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
7133
7134	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
7135	eor3	$res4b, $res1b, $ctr3b, $t1.16b				@ AES final-4 block - result
7136	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
7137
7138	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
7139	movi	$t0.8b, #0						@ supress further partial tag feed in
7140.L256_dec_blocks_more_than_4:						@ blocks left >  4
7141
7142	rev64	$res0b, $res1b						@ GHASH final-4 block
7143
7144	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7145
7146	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
7147	ldr	$res1q, [$input_ptr], #16				@ AES final-3 block - load ciphertext
7148
7149	movi	$t0.8b, #0						@ supress further partial tag feed in
7150
7151	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
7152	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
7153
7154	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
7155
7156	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
7157
7158	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
7159
7160	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
7161	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-4 block - store result
7162
7163	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
7164	eor3	$res4b, $res1b, $ctr4b, $t1.16b				@ AES final-3 block - result
7165.L256_dec_blocks_more_than_3:						@ blocks left >  3
7166
7167	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
7168	ext     $h4.16b, $h4.16b, $h4.16b, #8
7169	rev64	$res0b, $res1b						@ GHASH final-3 block
7170
7171	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7172	ldr	$res1q, [$input_ptr], #16				@ AES final-2 block - load ciphertext
7173	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
7174
7175	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
7176	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-3 block - store result
7177
7178	eor3	$res4b, $res1b, $ctr5b, $t1.16b				@ AES final-2 block - result
7179
7180	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
7181
7182	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
7183	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
7184	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
7185
7186	movi	$t0.8b, #0						@ supress further partial tag feed in
7187	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
7188	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
7189
7190	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
7191
7192	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
7193.L256_dec_blocks_more_than_2:						@ blocks left >  2
7194
7195	rev64	$res0b, $res1b						@ GHASH final-2 block
7196
7197	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
7198	ext     $h3.16b, $h3.16b, $h3.16b, #8
7199	ldr	$res1q, [$input_ptr], #16				@ AES final-1 block - load ciphertext
7200
7201	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7202
7203	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
7204
7205	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
7206	st1	{ $res4b}, [$output_ptr], #16			  	@ AES final-2 block - store result
7207	eor3	$res4b, $res1b, $ctr6b, $t1.16b				@ AES final-1 block - result
7208
7209	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
7210	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
7211	movi	$t0.8b, #0						@ supress further partial tag feed in
7212
7213	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
7214	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
7215
7216	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
7217	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
7218.L256_dec_blocks_more_than_1:						@ blocks left >  1
7219
7220	rev64	$res0b, $res1b						@ GHASH final-1 block
7221
7222	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7223
7224	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
7225	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
7226	ext     $h2.16b, $h2.16b, $h2.16b, #8
7227
7228	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
7229	ldr	$res1q, [$input_ptr], #16				@ AES final block - load ciphertext
7230	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-1 block - store result
7231
7232	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
7233	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
7234
7235	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
7236
7237	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
7238
7239	eor3	$res4b, $res1b, $ctr7b, $t1.16b				@ AES final block - result
7240	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
7241
7242	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
7243
7244	movi	$t0.8b, #0						@ supress further partial tag feed in
7245	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
7246
7247	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
7248.L256_dec_blocks_less_than_1:						@ blocks left <= 1
7249
7250	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
7251	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
7252	and	$bit_length, $bit_length, #127				@ bit_length %= 128
7253
7254	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
7255	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
7256	str	$rtmp_ctrq, [$counter]					@ store the updated counter
7257
7258	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
7259
7260	and	$bit_length, $bit_length, #127			 	@ bit_length %= 128
7261
7262	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
7263	cmp	$bit_length, #64
7264	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
7265
7266	csel	$temp3_x, $temp0_x, xzr, lt
7267	csel	$temp2_x, $temp1_x, $temp0_x, lt
7268
7269	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
7270	mov	$ctr0.d[1], $temp3_x
7271
7272	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
7273	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
7274	ext     $h1.16b, $h1.16b, $h1.16b, #8
7275	bif	$res4b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
7276
7277	rev64	$res0b, $res1b						@ GHASH final block
7278
7279	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7280
7281	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
7282	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
7283
7284	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
7285
7286	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
7287	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
7288
7289	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
7290
7291	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
7292	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
7293	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
7294
7295	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
7296	eor	$t10.16b, $acc_hb, $acc_lb				@ MODULO - karatsuba tidy up
7297
7298	ext	$acc_hb, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
7299	st1	{ $res4b}, [$output_ptr]				@ store all 16B
7300
7301	eor	$acc_mb, $acc_mb, $t10.16b				@ MODULO - karatsuba tidy up
7302
7303	eor	$t11.16b, $acc_hb, $t11.16b				@ MODULO - fold into mid
7304	eor	$acc_mb, $acc_mb, $t11.16b				@ MODULO - fold into mid
7305
7306	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
7307
7308	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
7309	eor	$acc_lb, $acc_lb, $acc_hb				@ MODULO - fold into low
7310
7311	eor	$acc_lb, $acc_lb, $acc_mb				@ MODULO - fold into low
7312	ext	$acc_lb, $acc_lb, $acc_lb, #8
7313	rev64	$acc_lb, $acc_lb
7314	st1	{ $acc_l.16b }, [$current_tag]
7315	lsr	x0, $bit_length, #3					@ return sizes
7316
7317        ldp     d10, d11, [sp, #16]
7318	ldp     d12, d13, [sp, #32]
7319	ldp     d14, d15, [sp, #48]
7320	ldp     d8, d9, [sp], #80
7321	ret
7322
7323.L256_dec_ret:
7324	mov w0, #0x0
7325	ret
7326.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
7327___
7328}
7329}
7330
# Append the trailer of the generated assembly to $code: an identifying
# .asciz string, an .align directive, and an #endif (presumably closing a
# preprocessor conditional emitted earlier in the file -- confirm there).
# The here-document is single-quoted, so its text is taken literally and the
# '@' in the e-mail address needs no escaping.
$code .= <<'TRAILER';
.asciz  "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian@arm.com>"
.align  2
#endif
TRAILER
7336
{
    # Base instruction words for the mnemonics that unsha3() hand-assembles.
    # The register numbers and the optional fourth operand are OR-ed into
    # these values.
    my %opcode = (
        "rax1" => 0xce608c00,
        "eor3" => 0xce000000,
        "bcax" => 0xce200000,
        "xar"  => 0xce800000,
    );

    # unsha3($mnemonic, $arg)
    #
    # Turn one instruction into a raw ".inst" directive so the assembler does
    # not have to know the mnemonic.
    #
    #   $mnemonic  key of %opcode ("eor3", "rax1", "xar" or "bcax")
    #   $arg       operand text following the mnemonic; at least two q/v
    #              registers, optionally a third register and a fourth
    #              register or '#'-prefixed immediate expression
    #
    # Returns the string ".inst\t0x<word>\t//<mnemonic> <operands>".
    # Dies when fewer than two register operands can be recognised: the
    # caller substitutes the return value for the instruction, so returning
    # a false value here would silently drop the instruction from the output.
    sub unsha3 {
        my ($mnemonic, $arg) = @_;

        # List assignment in scalar context yields the number of captures,
        # which is zero only when the match fails.
        my ($rd, $rn, $rm, $ra) =
            $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
            or die "unsha3: cannot parse operands of '$mnemonic $arg'";

        # Absent optional operands contribute zero bits to the encoding.
        $rm = 0 unless defined $rm;

        # The fourth operand may be a small arithmetic expression (digits and
        # '-'), hence the string eval; text that does not evaluate counts as 0.
        my $imm = defined $ra ? eval($ra) : 0;
        $imm = 0 unless defined $imm;

        return sprintf ".inst\t0x%08x\t//%s %s",
            $opcode{$mnemonic} | $rd | ($rn << 5) | ($rm << 16) | ($imm << 10),
            $mnemonic, $arg;
    }

    # unvmov($arg)
    #
    # Rewrite an operand pair of the form "qN#lo|hi, qM#lo|hi" as an
    # "ins vX.d[i],vY.d[j]" instruction: register numbers of 8 and above are
    # shifted up by 8, "lo" selects d[0] and "hi" selects d[1].
    # Returns a false value when $arg does not have that form.
    # The post-processing loop in this block does not call it.
    sub unvmov {
        my $arg = shift;

        return $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/
            && sprintf("ins    v%d.d[%d],v%d.d[%d]",
                       ($1 < 8 ? $1 : $1 + 8), ($2 eq "lo" ? 0 : 1),
                       ($3 < 8 ? $3 : $3 + 8), ($4 eq "lo" ? 0 : 1));
    }

    # Post-process the accumulated assembly one line at a time and print it.
    foreach my $line (split("\n", $code)) {
        $line =~ s{@\s}{//};                    # old->new style commentary
        $line =~ s/\`([^\`]*)\`/eval($1)/ge;    # evaluate `...` expressions

        # ld1r lines get their .16b arrangement rewritten to .2d; every other
        # line (and an ld1r line with nothing to rewrite) has its
        # eor3/rax1/xar/bcax instructions replaced by .inst words.
        unless ($line =~ m/\bld1r\b/ && $line =~ s/\.16b/.2d/g) {
            $line =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
        }

        print $line, "\n";
    }
}
7368
# Close STDOUT explicitly to enforce the final flush; if the close fails,
# abort with the system error rather than exiting as if all went well.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}
7370