1#! /usr/bin/env perl
2# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
15# obtain it.
16#========================================================================
17#
# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
# intermediate hashes from the 8 blocks.
21#
22#  ____________________________________________________
23# |                                                    |
24# | PRE                                                |
25# |____________________________________________________|
26# |                |                |                  |
27# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
28# |________________|________________|__________________|
29# |                |                |                  |
30# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
31# |________________|________________|__________________|
32# |                |                |                  |
33# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
34# |________________|________________|__________________|
35# |                |                |                  |
36# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
37# |________________|________________|__________________|
38# |                |                |                  |
39# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
40# |________________|________________|__________________|
41# |                |                |                  |
42# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
43# |________________|________________|__________________|
44# |                |                |                  |
45# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
46# |________________|________________|__________________|
47# |                |                |                  |
48# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
49# |________________|____(mostly)____|__________________|
50# |                                                    |
51# | MODULO                                             |
52# |____________________________________________________|
53#
# PRE:
#     Ensure previous generated intermediate hash is aligned and merged with result for GHASH 8k+0
# EXT low_acc, low_acc, low_acc, #8
# EOR res_curr (8k+0), res_curr (8k+0), low_acc
58#
59# CTR block:
60#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
61# REV     ctr32, rev_ctr32
62# ORR     ctr64, constctr96_top32, ctr32, LSL #32
63# INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
64# INS     ctr_next.d[1], ctr64X
65# ADD     rev_ctr32, #1
66#
# AES block:
#      Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
#      Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
#      to SIMD registers. Given we are very constrained in our ASIMD registers this is quite important.
71#
72#      Encrypt:
73# LDR     input_low, [ input_ptr  ], #8
74# LDR     input_high, [ input_ptr  ], #8
75# EOR     input_low, k14_low
76# EOR     input_high, k14_high
77# INS     res_curr.d[0], input_low
78# INS     res_curr.d[1], input_high
79# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
80# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
81# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
82# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
83# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
84# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
85# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
86# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
87# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
88# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
89# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
90# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
91# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
92# AESE    ctr_curr, k13
93# EOR     res_curr, res_curr, ctr_curr
94# ST1     { res_curr.16b  }, [ output_ptr  ], #16
95#
96#     Decrypt:
97# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
98# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
99# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
100# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
101# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
102# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
103# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
104# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
105# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
106# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
107# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
108# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
109# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
110# AESE    ctr_curr, k13
111# LDR     res_curr, [ input_ptr  ], #16
112# EOR     res_curr, res_curr, ctr_curr
113# MOV     output_low, res_curr.d[0]
114# MOV     output_high, res_curr.d[1]
115# EOR     output_low, k14_low
116# EOR     output_high, k14_high
117# STP     output_low, output_high, [ output_ptr  ], #16
118
# GHASH block X:
#     Do 128b karatsuba polynomial multiplication on block
#     We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate
#     a 128b multiplication:
#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
125#
126#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
127#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
128#
129#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
130#     multiplying with "twisted" powers of H
131#
132# Note: We can PMULL directly into the acc_x in first GHASH of the loop
133# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
134#       path latency dominates the performance
135#
136#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
137#       than indicated here
138# REV64   res_curr, res_curr
139# INS     t_m.d[0], res_curr.d[1]
140# EOR     t_m.8B, t_m.8B, res_curr.8B
141# PMULL2  t_h, res_curr, HX
142# PMULL   t_l, res_curr, HX
143# PMULL   t_m, t_m, HX_k
144# EOR     acc_h, acc_h, t_h
145# EOR     acc_l, acc_l, t_l
146# EOR     acc_m, acc_m, t_m
147#
148# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
149#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
150#         with a reversed constant
151# EOR3    acc_m, acc_m, acc_l, acc_h                     // Finish off karatsuba processing
152# PMULL   t_mod, acc_h, mod_constant
153# EXT     acc_h, acc_h, acc_h, #8
154# EOR3     acc_m, acc_m, t_mod, acc_h
155# PMULL   acc_h, acc_m, mod_constant
156# EXT     acc_m, acc_m, acc_m, #8
157# EOR3    acc_l, acc_l, acc_m, acc_h
158
# Command line handling: the last argument is taken as the output file if it
# ends in an extension, and the first argument is taken as the "flavour" if
# it contains no '.'.  Either may be absent, in which case it stays undef.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator: first in the directory of this script,
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Refuse any flavour whose name does not contain "64" (this also rejects a
# missing flavour).
die "only for 64 bit" if $flavour !~ /64/;

# Everything subsequently printed to STDOUT is piped through the translator.
# Check the result so that a failure to start the pipe is reported instead
# of silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
171
# Start of the generated assembly text.  The heredoc pulls in arm_arch.h and
# opens an "#if __ARM_MAX_ARCH__>=8" preprocessor guard (its matching #endif
# is not part of this span); the architecture and section directives follow.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
___
$code.=".arch   armv8-a+crypto\n.text\n";
178
# Symbolic names for the general-purpose registers used by the kernels.
# These are interpolated into the assembly heredocs below.
$input_ptr="x0";            # argument block: input pointer, advanced by post-indexed loads
$bit_length="x1";           # input length in bits, as passed by the caller
$byte_length="x9";          # $bit_length shifted right by 3
$output_ptr="x2";           # output pointer, advanced by post-indexed stores
$current_tag="x3";          # base of the tag; hash-key powers are loaded at offsets from it
$counter="x16";             # copy of x4 taken on entry; the initial CTR block is loaded from it
$constant_temp="x15";       # scratch for building immediates (e.g. the counter increment)
$modulo_constant="x10";     # address of the reduction constant spilled on the stack
$cc="x8";                   # copy of x5 taken on entry; base address for round-key loads
188{
# Scalar scratch registers.  Note that x4 and x5 arrive holding arguments;
# the kernel copies them to $counter and $cc before reusing them here.
my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
my ($temp2_x,$temp3_x)=map("x$_",(13..14));

# v0-v7 hold the eight counter blocks, v8-v15 the eight result blocks.
# The same registers are named in their .16b, bare v, d and q forms.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));

# Temporaries for freshly loaded input blocks.  These are v8-v15 again, i.e.
# they share registers with $res0..$res7.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));

# GHASH accumulators: high, middle and low parts in v17, v18 and v19.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));

# Hash-key powers.  H1..H4 and H5..H8 (with their "k" companions) are two
# sets of names for the same six registers v20-v25, so only one set can be
# live at a time and the values are reloaded as needed.
my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));

# GHASH temporaries.  Only v16 ($t0) and v29 ($t1) are dedicated registers;
# every other $tN is an alias for one of those, for a result register, or
# (in the case of $t11) for v21, which is also $h12k/$h56k.
my $t0="v16";
my $t0d="d16";

my $t1="v29";
my $t2=$res1;
my $t3=$t1;

my $t4=$res0;
my $t5=$res2;
my $t6=$t0;

my $t7=$res3;
my $t8=$res4;
my $t9=$res5;

my $t10=$res6;
my $t11="v21";
my $t12=$t1;

# v30 holds the running counter in byte-reversed form; v31 holds the
# per-block counter increment.
my $rtmp_ctr="v30";
my $rtmp_ctrq="q30";
my $rctr_inc="v31";
my $rctr_incd="d31";

# The modulo constant is loaded into $t0 (v16).
my $mod_constantd=$t0d;
my $mod_constant=$t0;

# Round keys.  All fifteen names rk0..rk14 rotate through the same three
# registers v26-v28 (rkN lives in v(26 + N mod 3)), so keys are reloaded from
# memory as the rounds progress.
my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Alternative operand spellings of individual round-key registers.
my $rk2q1="v28.1q";
my $rk3q1="v26.1q";
my $rk4v="v27";
250
#########################################################################################
# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const uint8_t * plaintext,
#                                            uint64_t plaintext_length,
#                                            uint8_t * ciphertext,
#                                            uint64_t *Xi,
#                                            unsigned char ivec[16],
#                                            const void *key);
#
# Note: plaintext_length (x1) is a length in bits; the kernel shifts it right
# by 3 to obtain the byte count.
#
259$code.=<<___;
260.global unroll8_eor3_aes_gcm_enc_128_kernel
261.type   unroll8_eor3_aes_gcm_enc_128_kernel,%function
262.align  4
263unroll8_eor3_aes_gcm_enc_128_kernel:
264	AARCH64_VALID_CALL_TARGET
265	cbz	x1, .L128_enc_ret
266	stp	d8, d9, [sp, #-80]!
267	lsr	$byte_length, $bit_length, #3
268	mov	$counter, x4
269	mov	$cc, x5
270	stp	d10, d11, [sp, #16]
271	stp	d12, d13, [sp, #32]
272	stp	d14, d15, [sp, #48]
273	mov	x5, #0xc200000000000000
274	stp	x5, xzr, [sp, #64]
275	add	$modulo_constant, sp, #64
276
277	mov	$constant_temp, #0x100000000				@ set up counter increment
278	movi	$rctr_inc.16b, #0x0
279	mov	$rctr_inc.d[1], $constant_temp
280	mov	$main_end_input_ptr, $byte_length
281	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
282
283	sub	$main_end_input_ptr, $main_end_input_ptr, #1	 	@ byte_len - 1
284
285	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80		@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
286
287	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
288
289	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
290
291	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
292	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
293
294	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
295	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
296
297	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
298	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
299
300	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
301	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
302
303	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
304	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
305	ldp	$rk0q, $rk1q, [$cc, #0]				  	@ load rk0, rk1
306
307	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
308	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
309
310	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
311	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
312
313	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
314	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
315	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
316
317	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
318	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
319	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
320
321	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
322	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
323	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
324
325	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
326
327	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
328	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
329	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
330
331	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
332	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
333	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
334
335	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
336	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
337	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
338
339	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
340	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
341	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
342
343	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
344	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
345	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
346
347	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
348
349	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
350	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
351	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
352
353	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
354	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
355	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
356
357	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
358
359	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
360	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
361	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
362
363	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
364	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
365	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
366
367	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
368	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
369	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
370
371	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
372	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
373	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
374
375	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
376	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
377	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
378
379	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
380	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
381	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
382
383	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
384	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
385	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
386
387	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
388	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
389	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
390
391	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
392	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
393	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
394
395	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
396
397	ld1	{ $acc_lb}, [$current_tag]
398	ext	$acc_lb, $acc_lb, $acc_lb, #8
399	rev64	$acc_lb, $acc_lb
400
401	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
402
403	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
404	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
405	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
406
407	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
408	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
409	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
410
411	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
412	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
413	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
414
415	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
416	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
417	ldr	$rk10q, [$cc, #160]					@ load rk10
418
419	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
420	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
421	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
422
423	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
424	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
425	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
426
427	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
428	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
429	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
430
431	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
432	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
433	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
434
435	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
436	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
437	b.ge	.L128_enc_tail						@ handle tail
438
439	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
440
441	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
442
443	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
444
445	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
446	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
447
448	eor3	$res0b, $ctr_t0b, $ctr0b, $rk10				@ AES block 0 - result
449	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
450	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
451
452	eor3	$res1b, $ctr_t1b, $ctr1b, $rk10				@ AES block 1 - result
453	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
454
455	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
456	eor3	$res5b, $ctr_t5b, $ctr5b, $rk10				@ AES block 5 - result
457	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
458
459	eor3	$res2b, $ctr_t2b, $ctr2b, $rk10				@ AES block 2 - result
460	eor3	$res6b, $ctr_t6b, $ctr6b, $rk10				@ AES block 6 - result
461	eor3	$res4b, $ctr_t4b, $ctr4b, $rk10				@ AES block 4 - result
462
463	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
464	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
465
466	eor3	$res3b, $ctr_t3b, $ctr3b, $rk10				@ AES block 3 - result
467	eor3	$res7b, $ctr_t7b, $ctr7b,$rk10				@ AES block 7 - result
468	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
469
470	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
471	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
472	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
473
474	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
475
476	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
477	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
478	b.ge	.L128_enc_prepretail					@ do prepretail
479
480.L128_enc_main_loop:							@ main loop start
481	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
482	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
483	ext     $h5.16b, $h5.16b, $h5.16b, #8
484	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
485	ext     $h6.16b, $h6.16b, $h6.16b, #8
486	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
487
488	rev64	$res1b, $res1b						@ GHASH block 8k+1
489	rev64	$res0b, $res0b						@ GHASH block 8k
490	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
491	ext     $h7.16b, $h7.16b, $h7.16b, #8
492	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
493	ext     $h8.16b, $h8.16b, $h8.16b, #8
494
495	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
496	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
497	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
498
499	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
500	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
501	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
502	rev64	$res3b, $res3b						@ GHASH block 8k+3
503
504	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
505	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
506	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
507
508	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
509
510	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
511	rev64	$res2b, $res2b						@ GHASH block 8k+2
512	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
513
514	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
515	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
516	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
517
518	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
519	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
520	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
521
522	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
523	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
524	ext     $h3.16b, $h3.16b, $h3.16b, #8
525	ldr	$h4q, [$current_tag, #112]				@ load h3l | h3h
526	ext     $h4.16b, $h4.16b, $h4.16b, #8
527	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
528
529	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
530	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
531	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
532
533	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
534	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
535	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
536
537	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
538	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
539	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
540
541	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
542	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
543	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
544
545	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
546	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
547	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
548
549	eor3	$acc_hb, $acc_hb, $t1.16b,$t2.16b			@ GHASH block 8k+2, 8k+3 - high
550	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
551	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
552
553	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
554	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
555	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
556
557	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
558	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
559	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
560
561	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
562	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
563	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
564
565	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
566	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
567
568	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
569	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
570	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
571
572	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
573	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
574	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
575
576	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
577	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
578	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
579
580	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
581	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
582	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
583
584	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
585	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
586	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
587	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
588
589	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
590	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
591	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
592
593	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
594	ext     $h1.16b, $h1.16b, $h1.16b, #8
595	ldr	$h2q, [$current_tag, #64]				@ load h1l | h1h
596	ext     $h2.16b, $h2.16b, $h2.16b, #8
597	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
598	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
599
600	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
601	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
602
603	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
604	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
605
606	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
607	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
608
609	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
610	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
611
612	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
613	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
614	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
615
616	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
617	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
618	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
619
620	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
621	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
622	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
623
624	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
625	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
626	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
627
628	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
629	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
630	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
631
632	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
633	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
634	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
635
636	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
637	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
638	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
639
640	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
641	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
642
643	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
644	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
645	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
646
647	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
648	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
649	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
650
651	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
652	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
653	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
654
655	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
656	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
657
658	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
659	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
660	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
661
662	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
663	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
664	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
665
666	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
667	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
668	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
669
670	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
671	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
672	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
673
674	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
675	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
676	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
677
678	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
679	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
680	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
681
682	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
683	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
684	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
685
686	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
687	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
688	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
689
690	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
691	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
692
693	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
694	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load plaintext
695	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
696
697	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
698	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
699	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
700
701	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
702	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
703	ldr	$rk10q, [$cc, #160]					@ load rk10
704
705	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
706	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
707	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
708	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
709
710	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
711	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
712	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
713
714	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
715	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
716	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
717
718	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load plaintext
719	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
720	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
721
722	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
723	eor3	$res4b, $ctr_t4b, $ctr4b, $rk10				@ AES block 8k+12 - result
724	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
725
726	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
727	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
728
729	eor3	$res2b, $ctr_t2b, $ctr2b, $rk10				@ AES block 8k+10 - result
730
731	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
732	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
733
734	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
735	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
736
737	eor3	$res7b, $ctr_t7b, $ctr7b, $rk10				@ AES block 8k+15 - result
738	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
739	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
740
741	eor3	$res1b, $ctr_t1b, $ctr1b, $rk10				@ AES block 8k+9 - result
742	eor3	$res3b, $ctr_t3b, $ctr3b, $rk10				@ AES block 8k+11 - result
743	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
744
745	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
746	eor3	$res5b, $ctr_t5b, $ctr5b, $rk10				@ AES block 8k+13 - result
747	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
748
749	eor3	$res0b, $ctr_t0b, $ctr0b, $rk10				@ AES block 8k+8 - result
750	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
751	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
752
753	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
754	eor3	$res6b, $ctr_t6b, $ctr6b, $rk10				@ AES block 8k+14 - result
755
756	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
757	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
758
759	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
760	b.lt	.L128_enc_main_loop
761
762.L128_enc_prepretail:							@ PREPRETAIL
763	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
764	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
765	ext     $h7.16b, $h7.16b, $h7.16b, #8
766	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
767	ext     $h8.16b, $h8.16b, $h8.16b, #8
768	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
769
770	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
771	ext     $h5.16b, $h5.16b, $h5.16b, #8
772	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
773	ext     $h6.16b, $h6.16b, $h6.16b, #8
774	rev64	$res0b, $res0b						@ GHASH block 8k
775	rev64	$res1b, $res1b						@ GHASH block 8k+1
776
777	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
778	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
779	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
780	rev64	$res3b, $res3b						@ GHASH block 8k+3
781
782	rev64	$res2b, $res2b						@ GHASH block 8k+2
783	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
784
785	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
786
787	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
788	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
789	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
790
791	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
792	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
793
794	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
795	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
796	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
797
798	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
799	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
800
801	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
802	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
803
804	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
805	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
806
807	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
808	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
809
810	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
811
812	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
813
814	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
815
816	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
817
818	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
819	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
820
821	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
822	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
823
824	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
825	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
826
827	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
828	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
829	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
830
831	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
832	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
833
834	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
835	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
836	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
837
838	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
839	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
840
841	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
842	ext     $h3.16b, $h3.16b, $h3.16b, #8
843	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
844	ext     $h4.16b, $h4.16b, $h4.16b, #8
845
846	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
847	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
848	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
849
850	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
851	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
852
853	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
854	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
855
856	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
857	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
858	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
859	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
860
861	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
862	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
863
864	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
865	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
866	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
867
868	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
869	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
870	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
871
872	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
873	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
874
875	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
876	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
877	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
878
879	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
880	ext     $h1.16b, $h1.16b, $h1.16b, #8
881	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
882	ext     $h2.16b, $h2.16b, $h2.16b, #8
883	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
884	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
885
886	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
887	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
888	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
889
890	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
891	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
892	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
893
894	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
895	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
896
897	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
898	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
899	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
900
901	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
902	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
903	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
904
905	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
906	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
907	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
908
909	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
910	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
911	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
912
913	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
914	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
915	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
916
917	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
918	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
919	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
920
921	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
922	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
923
924	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
925	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
926	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
927
928	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
929	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
930	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
931
932	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
933	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
934	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
935
936	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
937	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
938	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
939
940	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
941	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
942
943	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
944	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
945	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
946
947	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
948	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
949
950	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
951	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
952	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
953
954	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
955	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
956	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
957
958	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
959	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
960	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
961
962	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
963	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
964	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
965
966	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
967	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
968	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
969	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
970
971	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
972	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
973	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
974
975	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
976	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
977
978	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
979	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
980
981	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
982	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
983	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
984	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
985
986	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
987	eor3	$acc_lb, $acc_lb, $acc_hb, $acc_mb		 	@ MODULO - fold into low
988	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
989
990	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
991	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
992	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
993
994	ldr	$rk10q, [$cc, #160]					@ load rk10
995	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
996	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
997
998	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
999	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
1000
1001	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
1002	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
1003
1004	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
1005	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
1006.L128_enc_tail:								@ TAIL
1007
1008	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process
1009	ldr	$ctr_t0q, [$input_ptr], #16				@ AES block 8k+8 - load plaintext
1010
1011	mov	$t1.16b, $rk10
1012	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h and h6k | h5k
1013	ext     $h5.16b, $h5.16b, $h5.16b, #8
1014
1015	eor3	$res1b, $ctr_t0b, $ctr0b, $t1.16b			@ AES block 8k+8 - result
1016	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
1017	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h and h7l | h7h
1018	ext     $h6.16b, $h6.16b, $h6.16b, #8
1019	ext     $h7.16b, $h7.16b, $h7.16b, #8
1020
1021	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k and h8l | h8h
1022	ext     $h8.16b, $h8.16b, $h8.16b, #8
1023	cmp	$main_end_input_ptr, #112
1024	b.gt	.L128_enc_blocks_more_than_7
1025
1026	mov	$ctr7b, $ctr6b
1027	mov	$ctr6b, $ctr5b
1028	movi	$acc_h.8b, #0
1029
1030	cmp	$main_end_input_ptr, #96
1031	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1032	mov	$ctr5b, $ctr4b
1033
1034	mov	$ctr4b, $ctr3b
1035	mov	$ctr3b, $ctr2b
1036	mov	$ctr2b, $ctr1b
1037
1038	movi	$acc_l.8b, #0
1039	movi	$acc_m.8b, #0
1040	b.gt	.L128_enc_blocks_more_than_6
1041
1042	mov	$ctr7b, $ctr6b
1043	cmp	$main_end_input_ptr, #80
1044
1045	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1046	mov	$ctr6b, $ctr5b
1047	mov	$ctr5b, $ctr4b
1048
1049	mov	$ctr4b, $ctr3b
1050	mov	$ctr3b, $ctr1b
1051	b.gt	.L128_enc_blocks_more_than_5
1052
1053	cmp	$main_end_input_ptr, #64
1054	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1055
1056	mov	$ctr7b, $ctr6b
1057	mov	$ctr6b, $ctr5b
1058
1059	mov	$ctr5b, $ctr4b
1060	mov	$ctr4b, $ctr1b
1061	b.gt	.L128_enc_blocks_more_than_4
1062
1063	mov	$ctr7b, $ctr6b
1064	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1065	mov	$ctr6b, $ctr5b
1066
1067	mov	$ctr5b, $ctr1b
1068	cmp	$main_end_input_ptr, #48
1069	b.gt	.L128_enc_blocks_more_than_3
1070
1071	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1072	mov	$ctr7b, $ctr6b
1073	mov	$ctr6b, $ctr1b
1074
1075	cmp	$main_end_input_ptr, #32
1076	ldr	$h34kq, [$current_tag, #96]					@ load h4k | h3k
1077	b.gt	.L128_enc_blocks_more_than_2
1078
1079	cmp	$main_end_input_ptr, #16
1080
1081	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1082	mov	$ctr7b, $ctr1b
1083	b.gt	.L128_enc_blocks_more_than_1
1084
1085	ldr	$h12kq, [$current_tag, #48]					@ load h2k | h1k
1086	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1087	b	 .L128_enc_blocks_less_than_1
1088.L128_enc_blocks_more_than_7:						@ blocks left >  7
1089	st1	{ $res1b}, [$output_ptr], #16				@ AES final-7 block  - store result
1090
1091	rev64	$res0b, $res1b						@ GHASH final-7 block
1092	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-6 block - load plaintext
1093
1094	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1095
1096	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
1097
1098	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
1099
1100	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
1101
1102	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
1103	movi	$t0.8b, #0						@ suppress further partial tag feed in
1104
1105	eor3	$res1b, $ctr_t1b, $ctr1b, $t1.16b			@ AES final-6 block - result
1106
1107	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d				@ GHASH final-7 block - mid
1108	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
1109.L128_enc_blocks_more_than_6:						@ blocks left >  6
1110
1111	st1	{ $res1b}, [$output_ptr], #16				@ AES final-6 block - store result
1112
1113	rev64	$res0b, $res1b						@ GHASH final-6 block
1114	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-5 block - load plaintext
1115
1116	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1117
1118	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
1119
1120	eor3	$res1b, $ctr_t1b, $ctr2b, $t1.16b			@ AES final-5 block - result
1121	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
1122
1123	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
1124	movi	$t0.8b, #0						@ suppress further partial tag feed in
1125
1126	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
1127	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
1128
1129	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
1130
1131	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
1132	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
1133.L128_enc_blocks_more_than_5:						@ blocks left >  5
1134
1135	st1	{ $res1b}, [$output_ptr], #16				@ AES final-5 block - store result
1136
1137	rev64	$res0b, $res1b						@ GHASH final-5 block
1138
1139	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1140
1141	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
1142	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-4 block - load plaintext
1143	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
1144
1145	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
1146
1147	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
1148
1149	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
1150
1151	eor3	$res1b, $ctr_t1b, $ctr3b, $t1.16b			@ AES final-4 block - result
1152	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
1153	movi	$t0.8b, #0						@ suppress further partial tag feed in
1154
1155	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
1156	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
1157
1158	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
1159.L128_enc_blocks_more_than_4:						@ blocks left >  4
1160
1161	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-4 block - store result
1162
1163	rev64	$res0b, $res1b						@ GHASH final-4 block
1164
1165	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-3 block - load plaintext
1166
1167	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1168
1169	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
1170	movi	$t0.8b, #0						@ suppress further partial tag feed in
1171	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
1172
1173	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
1174
1175	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
1176
1177	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
1178	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
1179
1180	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
1181
1182	eor3	$res1b, $ctr_t1b, $ctr4b, $t1.16b			@ AES final-3 block - result
1183	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
1184.L128_enc_blocks_more_than_3:						@ blocks left >  3
1185
1186	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-3 block - store result
1187
1188	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
1189	ext     $h4.16b, $h4.16b, $h4.16b, #8
1190
1191	rev64	$res0b, $res1b						@ GHASH final-3 block
1192
1193	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1194	movi	$t0.8b, #0						@ suppress further partial tag feed in
1195
1196	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
1197	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
1198	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
1199
1200	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-2 block - load plaintext
1201
1202	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
1203
1204	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
1205	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
1206
1207	eor3	$res1b, $ctr_t1b, $ctr5b, $t1.16b			@ AES final-2 block - result
1208
1209	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
1210	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
1211
1212	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
1213	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
1214.L128_enc_blocks_more_than_2:						@ blocks left >  2
1215
1216	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-2 block - store result
1217
1218	rev64	$res0b, $res1b						@ GHASH final-2 block
1219
1220	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1221
1222	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-1 block - load plaintext
1223
1224	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
1225	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
1226	ext     $h3.16b, $h3.16b, $h3.16b, #8
1227	movi	$t0.8b, #0						@ suppress further partial tag feed in
1228
1229	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
1230	eor3	$res1b, $ctr_t1b, $ctr6b, $t1.16b			@ AES final-1 block - result
1231
1232	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
1233
1234	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
1235	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
1236
1237	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
1238
1239	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
1240	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
1241.L128_enc_blocks_more_than_1:						@ blocks left >  1
1242
1243	st1	{ $res1b}, [$output_ptr], #16			  	@ AES final-1 block - store result
1244
1245	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
1246	ext     $h2.16b, $h2.16b, $h2.16b, #8
1247	rev64	$res0b, $res1b						@ GHASH final-1 block
1248	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final block - load plaintext
1249
1250	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1251
1252	movi	$t0.8b, #0						@ suppress further partial tag feed in
1253	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
1254	eor3	$res1b, $ctr_t1b, $ctr7b, $t1.16b			@ AES final block - result
1255
1256	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
1257
1258	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
1259
1260	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
1261
1262	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
1263
1264	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
1265	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
1266
1267	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
1268
1269	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
1270	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
1271.L128_enc_blocks_less_than_1:						@ blocks left <= 1
1272
1273	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
1274	str	$rtmp_ctrq, [$counter]					@ store the updated counter
1275	and	$bit_length, $bit_length, #127			 	@ bit_length %= 128
1276
1277	sub	$bit_length, $bit_length, #128			 	@ bit_length -= 128
1278
1279	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
1280
1281	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
1282	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
1283	and	$bit_length, $bit_length, #127			 	@ bit_length %= 128
1284
1285	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
1286	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
1287	cmp	$bit_length, #64
1288
1289	csel	$temp2_x, $temp1_x, $temp0_x, lt
1290	csel	$temp3_x, $temp0_x, xzr, lt
1291
1292	mov	$ctr0.d[1], $temp3_x
1293	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
1294
1295	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
1296
1297	rev64	$res0b, $res1b						@ GHASH final block
1298
1299	bif	$res1b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
1300	st1	{ $res1b}, [$output_ptr]				@ store all 16B
1301
1302	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
1303
1304	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
1305
1306	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
1307	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
1308	ext	$h1.16b, $h1.16b, $h1.16b, #8
1309
1310	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
1311
1312	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
1313	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
1314	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
1315
1316	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
1317
1318	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
1319
1320	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
1321
1322	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
1323	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		  	@ MODULO - top 64b align with mid
1324
1325	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		  	@ MODULO - karatsuba tidy up
1326
1327	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b		 	@ MODULO - fold into mid
1328
1329	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
1330	ext	$t11.16b, $acc_mb, $acc_mb, #8			  	@ MODULO - other mid alignment
1331
1332	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		  	@ MODULO - fold into low
1333	ext	$acc_lb, $acc_lb, $acc_lb, #8
1334	rev64	$acc_lb, $acc_lb
1335	st1	{ $acc_l.16b }, [$current_tag]
1336	mov	x0, $byte_length
1337
1338	ldp	d10, d11, [sp, #16]
1339	ldp	d12, d13, [sp, #32]
1340	ldp	d14, d15, [sp, #48]
1341	ldp	d8, d9, [sp], #80
1342	ret
1343
1344.L128_enc_ret:
1345	mov w0, #0x0
1346	ret
1347.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1348___
1349
1350#########################################################################################
1351# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const uint8_t * ciphertext,
1352#                                            uint64_t plaintext_length,
1353#                                            uint8_t * plaintext,
1354#                                            uint64_t *Xi,
1355#                                            unsigned char ivec[16],
1356#                                            const void *key);
1357#
1358$code.=<<___;
1359.global unroll8_eor3_aes_gcm_dec_128_kernel
1360.type   unroll8_eor3_aes_gcm_dec_128_kernel,%function
1361.align  4
1362unroll8_eor3_aes_gcm_dec_128_kernel:
1363	AARCH64_VALID_CALL_TARGET
1364	cbz	x1, .L128_dec_ret
1365	stp	d8, d9, [sp, #-80]!
1366	lsr	$byte_length, $bit_length, #3
1367	mov	$counter, x4
1368	mov	$cc, x5
1369	stp	d10, d11, [sp, #16]
1370	stp	d12, d13, [sp, #32]
1371	stp	d14, d15, [sp, #48]
1372	mov	x5, #0xc200000000000000
1373	stp	x5, xzr, [sp, #64]
1374	add	$modulo_constant, sp, #64
1375
1376	mov	$main_end_input_ptr, $byte_length
1377	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
1378
1379	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
1380	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
1381
1382	mov	$constant_temp, #0x100000000				@ set up counter increment
1383	movi	$rctr_inc.16b, #0x0
1384	mov	$rctr_inc.d[1], $constant_temp
1385	ld1	{ $acc_lb}, [$current_tag]
1386	  ext	$acc_lb, $acc_lb, $acc_lb, #8
1387	rev64	$acc_lb, $acc_lb
1388
1389	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
1390
1391	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
1392
1393	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
1394
1395	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
1396	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
1397
1398	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1399
1400	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
1401	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
1402	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
1403
1404	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
1405	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
1406
1407	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
1408	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
1409
1410	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
1411	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
1412
1413	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
1414	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
1415
1416	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
1417
1418	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
1419	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
1420	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
1421
1422	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
1423	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
1424
1425	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
1426
1427	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
1428	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
1429
1430	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
1431
1432	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
1433
1434	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
1435	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
1436
1437	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
1438	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
1439
1440	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
1441	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
1442	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
1443
1444	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
1445	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
1446	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
1447
1448	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
1449	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
1450	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
1451
1452	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
1453	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
1454
1455	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
1456	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
1457
1458	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
1459	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
1460
1461	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
1462	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
1463
1464	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
1465	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
1466	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
1467
1468	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
1469	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
1470	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
1471
1472	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
1473	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
1474	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
1475
1476	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
1477	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
1478	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
1479
1480	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
1481	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
1482
1483	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
1484	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
1485
1486	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
1487
1488	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
1489	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
1490	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
1491
1492	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
1493	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
1494	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
1495
1496	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
1497	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
1498	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
1499
1500	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
1501	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
1502	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
1503
1504	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
1505	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
1506	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
1507
1508	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
1509	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
1510	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
1511
1512	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
1513	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
1514
1515	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
1516	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
1517
1518	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
1519	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
1520	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
1521
1522	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
1523	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
1524	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
1525
1526	aese	$ctr0b, $rk9						@ AES block 0 - round 9
1527	aese	$ctr1b, $rk9						@ AES block 1 - round 9
1528	aese	$ctr6b, $rk9						@ AES block 6 - round 9
1529
1530	ldr	$rk10q, [$cc, #160]					@ load rk10
1531	aese	$ctr4b, $rk9						@ AES block 4 - round 9
1532	aese	$ctr3b, $rk9						@ AES block 3 - round 9
1533
1534	aese	$ctr2b, $rk9						@ AES block 2 - round 9
1535	aese	$ctr5b, $rk9						@ AES block 5 - round 9
1536	aese	$ctr7b, $rk9						@ AES block 7 - round 9
1537
1538	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
1539	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
1540	b.ge	.L128_dec_tail						@ handle tail
1541
1542	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 0, 1 - load ciphertext
1543
1544	eor3	$ctr0b, $res0b, $ctr0b, $rk10				@ AES block 0 - result
1545	eor3	$ctr1b, $res1b, $ctr1b, $rk10				@ AES block 1 - result
1546	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 0, 1 - store result
1547
1548	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
1549	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
1550	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 2, 3 - load ciphertext
1551
1552	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 4, 5 - load ciphertext
1553
1554	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
1555	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
1556	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 6, 7 - load ciphertext
1557
1558	eor3	$ctr3b, $res3b, $ctr3b, $rk10				@ AES block 3 - result
1559	eor3	$ctr2b, $res2b, $ctr2b, $rk10				@ AES block 2 - result
1560	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 2, 3 - store result
1561
1562	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
1563	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
1564
1565	eor3	$ctr6b, $res6b, $ctr6b, $rk10				@ AES block 6 - result
1566
1567	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
1568	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
1569
1570	eor3	$ctr4b, $res4b, $ctr4b, $rk10				@ AES block 4 - result
1571	eor3	$ctr5b, $res5b, $ctr5b, $rk10				@ AES block 5 - result
1572	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 4, 5 - store result
1573
1574	eor3	$ctr7b, $res7b, $ctr7b, $rk10				@ AES block 7 - result
1575	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 6, 7 - store result
1576	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
1577
1578	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
1579	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
1580	b.ge	.L128_dec_prepretail					@ do prepretail
1581
1582.L128_dec_main_loop:							@ main loop start
1583	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
1584	ext     $h7.16b, $h7.16b, $h7.16b, #8
1585	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
1586	ext     $h8.16b, $h8.16b, $h8.16b, #8
1587
1588	rev64	$res1b, $res1b						@ GHASH block 8k+1
1589	rev64	$res0b, $res0b						@ GHASH block 8k
1590	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
1591
1592	rev64	$res6b, $res6b						@ GHASH block 8k+6
1593	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
1594	ext     $h5.16b, $h5.16b, $h5.16b, #8
1595	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
1596	ext     $h6.16b, $h6.16b, $h6.16b, #8
1597
1598	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
1599	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
1600	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
1601
1602	rev64	$res2b, $res2b						@ GHASH block 8k+2
1603	rev64	$res4b, $res4b						@ GHASH block 8k+4
1604	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
1605
1606	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
1607	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
1608	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
1609	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
1610
1611	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
1612	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
1613	rev64	$res3b, $res3b						@ GHASH block 8k+3
1614
1615	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
1616	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1617	rev64	$res5b, $res5b						@ GHASH block 8k+5
1618
1619	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
1620	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
1621	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1622
1623	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
1624	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
1625	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
1626
1627	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
1628	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
1629	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
1630
1631	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
1632	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
1633	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
1634
1635	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
1636	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
1637	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
1638
1639	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
1640	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
1641	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
1642
1643	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
1644	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1645	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
1646
1647	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
1648	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1649	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
1650
1651	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
1652	ext     $h3.16b, $h3.16b, $h3.16b, #8
1653	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
1654	ext     $h4.16b, $h4.16b, $h4.16b, #8
1655	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
1656	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
1657
1658	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
1659	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
1660	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
1661
1662	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
1663	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
1664	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
1665
1666	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
1667	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
1668	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
1669
1670	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
1671	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
1672	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
1673	ext     $h1.16b, $h1.16b, $h1.16b, #8
1674	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
1675	ext     $h2.16b, $h2.16b, $h2.16b, #8
1676
1677	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
1678	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
1679	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
1680
1681	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1682	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
1683	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
1684
1685	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
1686	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
1687	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
1688
1689	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
1690	rev64	$res7b, $res7b						@ GHASH block 8k+7
1691	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
1692
1693	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
1694	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
1695	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
1696
1697	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
1698	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
1699	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
1700	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1701
1702	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
1703	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
1704	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
1705
1706	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
1707	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
1708	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
1709
1710	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
1711	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
1712	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
1713
1714	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
1715	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
1716	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
1717
1718	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
1719	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1720	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
1721
1722	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
1723	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
1724	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
1725
1726	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
1727	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
1728	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1729
1730	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
1731	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
1732	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
1733
1734	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
1735	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
1736	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
1737
1738	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
1739	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
1740	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
1741
1742	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
1743	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
1744	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
1745
1746	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
1747	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b 			@ GHASH block 8k+4, 8k+5 - mid
1748	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
1749
1750	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
1751	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
1752	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
1753
1754	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
1755	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
1756	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
1757
1758	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
1759	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
1760	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
1761
1762	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
1763	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
1764	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
1765
1766	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
1767	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
1768	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
1769
1770	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
1771	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
1772	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
1773
1774	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
1775	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
1776	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
1777
1778	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
1779	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
1780	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
1781
1782	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
1783	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
1784	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
1785
1786	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
1787	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
1788	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
1789
1790	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
1791	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
1792	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
1793
1794	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
1795	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
1796	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load ciphertext
1797
1798	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load ciphertext
1799	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
1800	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
1801
1802	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load ciphertext
1803	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
1804	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
1805
1806	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load ciphertext
1807	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
1808	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
1809
1810	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
1811	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
1812	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
1813
1814	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
1815	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
1816	ldr	$rk10q, [$cc, #160]					@ load rk10
1817
1818	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
1819	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
1820	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
1821
1822	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
1823	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
1824	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
1825
1826	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
1827	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
1828
1829	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
1830	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
1831	eor3	$ctr1b, $res1b, $ctr1b, $rk10				@ AES block 8k+9 - result
1832
1833	eor3	$ctr0b, $res0b, $ctr0b, $rk10				@ AES block 8k+8 - result
1834	eor3	$ctr7b, $res7b, $ctr7b, $rk10				@ AES block 8k+15 - result
1835	eor3	$ctr6b, $res6b, $ctr6b, $rk10				@ AES block 8k+14 - result
1836
1837	eor3	$ctr2b, $res2b, $ctr2b, $rk10				@ AES block 8k+10 - result
1838	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
1839	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
1840
1841	eor3	$ctr4b, $res4b, $ctr4b, $rk10				@ AES block 8k+12 - result
1842	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
1843	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
1844
1845	eor3	$ctr3b, $res3b, $ctr3b, $rk10				@ AES block 8k+11 - result
1846	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
1847	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
1848
1849	eor3	$ctr5b, $res5b, $ctr5b, $rk10				@ AES block 8k+13 - result
1850	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
1851
1852	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
1853	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
1854	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
1855
1856	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
1857	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
1858	b.lt	.L128_dec_main_loop
1859
1860.L128_dec_prepretail:							@ PREPRETAIL
1861	rev64	$res3b, $res3b						@ GHASH block 8k+3
1862	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
1863	rev64	$res0b, $res0b						@ GHASH block 8k
1864
1865	rev64	$res2b, $res2b						@ GHASH block 8k+2
1866	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
1867	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
1868
1869	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
1870	ext     $h7.16b, $h7.16b, $h7.16b, #8
1871	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
1872	ext     $h8.16b, $h8.16b, $h8.16b, #8
1873	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
1874	rev64	$res1b, $res1b						@ GHASH block 8k+1
1875
1876	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
1877	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
1878	ext     $h5.16b, $h5.16b, $h5.16b, #8
1879	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
1880	ext     $h6.16b, $h6.16b, $h6.16b, #8
1881	rev64	$res5b, $res5b						@ GHASH block 8k+5
1882
1883	rev64	$res4b, $res4b						@ GHASH block 8k+4
1884
1885	rev64	$res6b, $res6b						@ GHASH block 8k+6
1886
1887	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
1888	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
1889	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
1890	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
1891
1892	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
1893	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
1894	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
1895
1896	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1897	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
1898	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
1899
1900	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
1901	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
1902	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
1903
1904	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
1905	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
1906	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
1907
1908	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
1909	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
1910	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
1911
1912	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
1913	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1914	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
1915
1916	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
1917	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
1918	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
1919
1920	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k - mid
1921	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
1922	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
1923
1924	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
1925	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
1926	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
1927
1928	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
1929	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
1930	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
1931
1932	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
1933	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
1934	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
1935
1936	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
1937	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
1938	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
1939
1940	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
1941	ext     $h3.16b, $h3.16b, $h3.16b, #8
1942	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
1943	ext     $h4.16b, $h4.16b, $h4.16b, #8
1944	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
1945	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
1946
1947	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
1948	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
1949	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
1950
1951	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
1952	ext     $h1.16b, $h1.16b, $h1.16b, #8
1953	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
1954	ext     $h2.16b, $h2.16b, $h2.16b, #8
1955	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
1956
1957	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
1958	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
1959	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
1960
1961	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
1962	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1963	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
1964
1965	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
1966	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
1967	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
1968
1969	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
1970	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
1971	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
1972
1973	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
1974	rev64	$res7b, $res7b						@ GHASH block 8k+7
1975	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
1976
1977	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
1978	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
1979	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
1980	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
1981
1982	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
1983	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
1984	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1985
1986	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
1987	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
1988	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
1989
1990	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
1991	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
1992	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
1993
1994	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
1995	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
1996	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
1997
1998	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
1999	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
2000	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
2001
2002	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
2003	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
2004	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
2005
2006	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
2007	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
2008	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
2009
2010	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
2011	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
2012	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
2013
2014	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
2015	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
2016	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
2017
2018	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
2019	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
2020	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
2021
2022	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
2023	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
2024	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
2025
2026	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
2027	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
2028	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
2029
2030	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
2031	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
2032	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
2033
2034	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
2035	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
2036	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
2037
2038	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
2039	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
2040	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
2041
2042	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
2043	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
2044	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
2045
2046	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
2047	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
2048	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
2049
2050	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
2051	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
2052	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
2053
2054	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
2055	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
2056	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
2057
2058	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
2059	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
2060	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
2061
2062	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
2063	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
2064	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
2065
2066	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
2067	ldr	$rk10q, [$cc, #160]					@ load rk10
2068
2069	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
2070	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
2071
2072	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
2073	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
2074	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
2075
2076	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
2077	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
2078	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
2079
2080	aese	$ctr6b, $rk9						@ AES block 8k+14 - round 9
2081	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
2082	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
2083
2084	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
2085	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
2086	aese	$ctr2b, $rk9						@ AES block 8k+10 - round 9
2087
2088	aese	$ctr3b, $rk9						@ AES block 8k+11 - round 9
2089	aese	$ctr5b, $rk9						@ AES block 8k+13 - round 9
2090	aese	$ctr0b, $rk9						@ AES block 8k+8 - round 9
2091
2092	aese	$ctr4b, $rk9						@ AES block 8k+12 - round 9
2093	aese	$ctr1b, $rk9						@ AES block 8k+9 - round 9
2094	aese	$ctr7b, $rk9						@ AES block 8k+15 - round 9
2095
.L128_dec_tail:								@ TAIL
	@ Entry-point selection for the remaining blocks: the byte count left is compared against
	@ 112, 96, 80, 64, 48, 32 and 16 and each b.gt enters the matching final-N section below.

	mov	$t1.16b, $rk10
	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process

	cmp	$main_end_input_ptr, #112				@ flags are consumed by the b.gt after the loads below

	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h78k (h8k | h7k) and h8
	ext     $h8.16b, $h8.16b, $h8.16b, #8
	ldr	$res1q, [$input_ptr], #16				@ AES block 8k+8 - load ciphertext

	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5 (h5l | h5h) and h56k
	ext     $h5.16b, $h5.16b, $h5.16b, #8
	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag

	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6 (h6l | h6h) and h7
	ext     $h6.16b, $h6.16b, $h6.16b, #8
	ext     $h7.16b, $h7.16b, $h7.16b, #8

	eor3	$res4b, $res1b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
	b.gt	.L128_dec_blocks_more_than_7

	@ 7 blocks or fewer: the final-7 section (which writes the accumulators directly) is skipped,
	@ so the accumulators are zeroed here. Each step below shifts the pending keystream blocks up
	@ by one register and steps the counter back by one increment before testing the next bound.
	cmp	$main_end_input_ptr, #96
	mov	$ctr7b, $ctr6b
	movi	$acc_l.8b, #0

	movi	$acc_h.8b, #0
	mov	$ctr6b, $ctr5b
	mov	$ctr5b, $ctr4b

	mov	$ctr4b, $ctr3b
	mov	$ctr3b, $ctr2b
	mov	$ctr2b, $ctr1b

	movi	$acc_m.8b, #0
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	b.gt	.L128_dec_blocks_more_than_6

	@ 6 blocks or fewer
	cmp	$main_end_input_ptr, #80
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s

	mov	$ctr7b, $ctr6b
	mov	$ctr6b, $ctr5b
	mov	$ctr5b, $ctr4b

	mov	$ctr4b, $ctr3b
	mov	$ctr3b, $ctr1b
	b.gt	.L128_dec_blocks_more_than_5

	@ 5 blocks or fewer
	cmp	$main_end_input_ptr, #64

	mov	$ctr7b, $ctr6b
	mov	$ctr6b, $ctr5b
	mov	$ctr5b, $ctr4b

	mov	$ctr4b, $ctr1b
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	b.gt	.L128_dec_blocks_more_than_4

	@ 4 blocks or fewer
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	mov	$ctr7b, $ctr6b
	mov	$ctr6b, $ctr5b

	mov	$ctr5b, $ctr1b
	cmp	$main_end_input_ptr, #48
	b.gt	.L128_dec_blocks_more_than_3

	@ 3 blocks or fewer: h34k has to be loaded here because the final-3 section that loads it is skipped
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	mov	$ctr7b, $ctr6b
	cmp	$main_end_input_ptr, #32

	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
	mov	$ctr6b, $ctr1b
	b.gt	.L128_dec_blocks_more_than_2

	@ 2 blocks or fewer
	cmp	$main_end_input_ptr, #16

	mov	$ctr7b, $ctr1b
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	b.gt	.L128_dec_blocks_more_than_1

	@ a single (possibly partial) block: h12k has to be loaded here because the final-1 section is skipped
	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
	b	 .L128_dec_blocks_less_than_1
2180.L128_dec_blocks_more_than_7:						@ blocks left >  7
2181	rev64	$res0b, $res1b						@ GHASH final-7 block
2182
2183	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2184
2185	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
2186
2187	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
2188	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
2189
2190	movi	$t0.8b, #0						@ suppress further partial tag feed in
2191	ldr	$res1q, [$input_ptr], #16				@ AES final-6 block - load ciphertext
2192
2193	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
2194
2195	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
2196	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-7 block  - store result
2197	eor3	$res4b, $res1b, $ctr1b, $t1.16b				@ AES final-6 block - result
2198
2199	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
2200.L128_dec_blocks_more_than_6:						@ blocks left >  6
2201
2202	rev64	$res0b, $res1b						@ GHASH final-6 block
2203
2204	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2205
2206	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
2207
2208	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
2209
2210	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
2211	ldr	$res1q, [$input_ptr], #16				@ AES final-5 block - load ciphertext
2212	movi	$t0.8b, #0						@ suppress further partial tag feed in
2213
2214	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
2215	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-6 block - store result
2216	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
2217
2218	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
2219	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
2220
2221	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
2222	eor3	$res4b, $res1b, $ctr2b, $t1.16b				@ AES final-5 block - result
2223.L128_dec_blocks_more_than_5:						@ blocks left >  5
2224
2225	rev64	$res0b, $res1b						@ GHASH final-5 block
2226
2227	ldr	$res1q, [$input_ptr], #16				@ AES final-4 block - load ciphertext
2228	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-5 block - store result
2229
2230	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2231
2232	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
2233
2234	eor3	$res4b, $res1b, $ctr3b, $t1.16b				@ AES final-4 block - result
2235
2236	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
2237
2238	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
2239	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
2240	movi	$t0.8b, #0						@ suppress further partial tag feed in
2241
2242	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
2243	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
2244	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
2245
2246	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
2247	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
2248.L128_dec_blocks_more_than_4:						@ blocks left >  4
2249
2250	rev64	$res0b, $res1b						@ GHASH final-4 block
2251
2252	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2253	ldr	$res1q, [$input_ptr], #16				@ AES final-3 block - load ciphertext
2254
2255	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
2256	movi	$t0.8b, #0						@ suppress further partial tag feed in
2257	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
2258
2259	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
2260
2261	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
2262
2263	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-4 block - store result
2264	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
2265
2266	eor3	$res4b, $res1b, $ctr4b, $t1.16b				@ AES final-3 block - result
2267	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
2268
2269	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
2270
2271	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
2272.L128_dec_blocks_more_than_3:						@ blocks left >  3
2273
2274	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-3 block - store result
2275	rev64	$res0b, $res1b						@ GHASH final-3 block
2276
2277	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2278
2279	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
2280
2281	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
2282	ext     $h4.16b, $h4.16b, $h4.16b, #8
2283	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
2284
2285	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
2286
2287	ldr	$res1q, [$input_ptr], #16				@ AES final-2 block - load ciphertext
2288
2289	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
2290	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
2291	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
2292
2293	movi	$t0.8b, #0						@ suppress further partial tag feed in
2294	eor3	$res4b, $res1b, $ctr5b, $t1.16b				@ AES final-2 block - result
2295	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
2296
2297	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
2298
2299	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
2300	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
2301.L128_dec_blocks_more_than_2:						@ blocks left >  2
2302
2303	rev64	$res0b, $res1b						@ GHASH final-2 block
2304
2305	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-2 block - store result
2306
2307	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2308	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
2309	ext     $h3.16b, $h3.16b, $h3.16b, #8
2310	movi	$t0.8b, #0						@ suppress further partial tag feed in
2311
2312	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
2313
2314	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
2315
2316	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
2317
2318	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
2319	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
2320	ldr	$res1q, [$input_ptr], #16				@ AES final-1 block - load ciphertext
2321
2322	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
2323
2324	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
2325
2326	eor3	$res4b, $res1b, $ctr6b, $t1.16b				@ AES final-1 block - result
2327	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
2328.L128_dec_blocks_more_than_1:						@ blocks left >  1
2329
2330	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-1 block - store result
2331	rev64	$res0b, $res1b						@ GHASH final-1 block
2332
2333	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
2334	ext     $h2.16b, $h2.16b, $h2.16b, #8
2335
2336	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2337
2338	movi	$t0.8b, #0						@ suppress further partial tag feed in
2339
2340	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
2341
2342	ldr	$res1q, [$input_ptr], #16				@ AES final block - load ciphertext
2343	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
2344
2345	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
2346	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
2347	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
2348
2349	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
2350	eor3	$res4b, $res1b, $ctr7b, $t1.16b				@ AES final block - result
2351
2352	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
2353
2354	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
2355
2356	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
2357
2358	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
2359.L128_dec_blocks_less_than_1:						@ blocks left <= 1
2360
2361	and	$bit_length, $bit_length, #127				@ bit_length %= 128
2362
2363	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
2364
2365	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
2366
2367	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
2368	and	$bit_length, $bit_length, #127				@ bit_length %= 128
2369
2370	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
2371	cmp	$bit_length, #64
2372	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
2373
2374	csel	$temp2_x, $temp1_x, $temp0_x, lt
2375	csel	$temp3_x, $temp0_x, xzr, lt
2376
2377	mov	$ctr0.d[1], $temp3_x
2378	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
2379
2380	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
2381	ext     $h1.16b, $h1.16b, $h1.16b, #8
2382	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
2383
2384	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
2385
2386	rev64	$res0b, $res1b						@ GHASH final block
2387
2388	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
2389
2390	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
2391	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
2392
2393	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
2394	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
2395
2396	bif	$res4b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
2397
2398	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
2399	st1	{ $res4b}, [$output_ptr]				@ store all 16B
2400
2401	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
2402
2403	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
2404	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
2405
2406	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
2407
2408	eor	$t10.16b, $acc_hb, $acc_lb				@ MODULO - karatsuba tidy up
2409
2410	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
2411	ext	$acc_hb, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
2412
2413	eor	$acc_mb, $acc_mb, $t10.16b				@ MODULO - karatsuba tidy up
2414
2415	eor3	$acc_mb, $acc_mb, $acc_hb, $t11.16b			@ MODULO - fold into mid
2416
2417	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
2418	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
2419
2420	eor3	$acc_lb, $acc_lb, $acc_mb, $acc_hb			@ MODULO - fold into low
2421	ext	$acc_lb, $acc_lb, $acc_lb, #8
2422	rev64	$acc_lb, $acc_lb
2423	st1	{ $acc_l.16b }, [$current_tag]
2424	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
2425
2426	str	$rtmp_ctrq, [$counter]					@ store the updated counter
2427
2428	mov	x0, $byte_length
2429
2430	ldp	d10, d11, [sp, #16]
2431	ldp	d12, d13, [sp, #32]
2432	ldp	d14, d15, [sp, #48]
2433	ldp	d8, d9, [sp], #80
2434	ret
2435.L128_dec_ret:
2436	mov w0, #0x0
2437	ret
2438.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2439___
2440}
2441
2442{
# Register names used by the assembly templates that follow in this scope,
# starting with unroll8_eor3_aes_gcm_enc_192_kernel. Every variable is just
# the textual name of an AArch64 register that gets interpolated into $code.

# General-purpose (x) scratch registers.
my ($end_input_ptr, $main_end_input_ptr, $temp0_x, $temp1_x) = map { "x$_" } 4 .. 7;
my ($temp2_x, $temp3_x) = map { "x$_" } 13 .. 14;

# v0-v7 are the counter blocks and v8-v15 the result blocks; the same
# registers are named once per operand form (.16b, bare vN, dN, qN).
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $ctr4b, $ctr5b, $ctr6b, $ctr7b,
    $res0b, $res1b, $res2b, $res3b, $res4b, $res5b, $res6b, $res7b)
    = map { "v$_.16b" } 0 .. 15;
my ($ctr0, $ctr1, $ctr2, $ctr3, $ctr4, $ctr5, $ctr6, $ctr7,
    $res0, $res1, $res2, $res3, $res4, $res5, $res6, $res7)
    = map { "v$_" } 0 .. 15;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $ctr4d, $ctr5d, $ctr6d, $ctr7d) = map { "d$_" } 0 .. 7;
my ($ctr0q, $ctr1q, $ctr2q, $ctr3q, $ctr4q, $ctr5q, $ctr6q, $ctr7q) = map { "q$_" } 0 .. 7;
my ($res0q, $res1q, $res2q, $res3q, $res4q, $res5q, $res6q, $res7q) = map { "q$_" } 8 .. 15;

# ctr_t0..ctr_t7 are second names for v8-v15, i.e. the same registers as res0..res7.
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3, $ctr_t4, $ctr_t5, $ctr_t6, $ctr_t7)
    = map { "v$_" } 8 .. 15;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b, $ctr_t4b, $ctr_t5b, $ctr_t6b, $ctr_t7b)
    = map { "v$_.16b" } 8 .. 15;
my ($ctr_t0q, $ctr_t1q, $ctr_t2q, $ctr_t3q, $ctr_t4q, $ctr_t5q, $ctr_t6q, $ctr_t7q)
    = map { "q$_" } 8 .. 15;

# GHASH accumulators (high, mid, low) live in v17-v19.
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 17 .. 19;
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 17 .. 19;

# Hash-key registers: the h1..h4 names and the h5..h8 names both map onto
# v20-v25, so only one of the two sets can be live at any point.
my ($h1, $h12k, $h2, $h3, $h34k, $h4) = map { "v$_" } 20 .. 25;
my ($h5, $h56k, $h6, $h7, $h78k, $h8) = map { "v$_" } 20 .. 25;
my ($h1q, $h12kq, $h2q, $h3q, $h34kq, $h4q) = map { "q$_" } 20 .. 25;
my ($h5q, $h56kq, $h6q, $h7q, $h78kq, $h8q) = map { "q$_" } 20 .. 25;

# Temporaries. Only t0 (v16), t1 (v29) and t11 (v21) name registers of their
# own choosing; the others are aliases of result registers or of t0/t1.
my ($t0, $t0d) = ("v16", "d16");
my $t1 = "v29";
my ($t2, $t3) = ($res1, $t1);
my ($t4, $t5, $t6) = ($res0, $res2, $t0);
my ($t7, $t8, $t9) = ($res3, $res4, $res5);
my $t10 = $res6;
my $t11 = "v21";
my $t12 = $t1;

# Running counter and its per-block increment.
my ($rtmp_ctr, $rtmp_ctrq) = ("v30", "q30");
my ($rctr_inc, $rctr_incd) = ("v31", "d31");

# The modulo constant is kept in t0.
my ($mod_constantd, $mod_constant) = ($t0d, $t0);

# Round keys: rk0..rk14 cycle through the same three registers v26-v28
# (rk0, rk3, rk6, ... -> v26; rk1, rk4, ... -> v27; rk2, rk5, ... -> v28).
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8,
    $rk9, $rk10, $rk11, $rk12, $rk13, $rk14)
    = (map { "v$_.16b" } 26 .. 28) x 5;
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q,
    $rk9q, $rk10q, $rk11q, $rk12q, $rk13q, $rk14q)
    = (map { "q$_" } 26 .. 28) x 5;

# Extra operand forms of the round-key registers (.1q arrangement, bare name).
my $rk2q1 = "v28.1q";
my $rk3q1 = "v26.1q";
my $rk4v = "v27";
2503
2504#########################################################################################
2505# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const uint8_t * plaintext,
2506#                                            uint64_t plaintext_length,
2507#                                            uint8_t * ciphertext,
2508#                                            uint64_t *Xi,
2509#                                            unsigned char ivec[16],
2510#                                            const void *key);
2511#
2512$code.=<<___;
2513.global unroll8_eor3_aes_gcm_enc_192_kernel
2514.type   unroll8_eor3_aes_gcm_enc_192_kernel,%function
2515.align  4
2516unroll8_eor3_aes_gcm_enc_192_kernel:
2517	AARCH64_VALID_CALL_TARGET
2518	cbz	x1, .L192_enc_ret
2519	stp	d8, d9, [sp, #-80]!
2520	lsr	$byte_length, $bit_length, #3
2521	mov	$counter, x4
2522	mov	$cc, x5
2523	stp	d10, d11, [sp, #16]
2524	stp	d12, d13, [sp, #32]
2525	stp	d14, d15, [sp, #48]
2526	mov	x5, #0xc200000000000000
2527	stp	x5, xzr, [sp, #64]
2528	add	$modulo_constant, sp, #64
2529
2530	mov	$main_end_input_ptr, $byte_length
2531	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
2532
2533	mov	$constant_temp, #0x100000000				@ set up counter increment
2534	movi	$rctr_inc.16b, #0x0
2535	mov	$rctr_inc.d[1], $constant_temp
2536
2537	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
2538
2539	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
2540
2541	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
2542	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
2543
2544	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
2545	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
2546
2547	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
2548	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
2549
2550	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
2551	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
2552	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
2553
2554	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2555
2556	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
2557	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
2558	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
2559
2560	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
2561
2562	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
2563	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
2564
2565	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
2566
2567	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
2568	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
2569	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
2570
2571	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
2572	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
2573	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
2574
2575	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
2576	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
2577	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
2578
2579	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
2580	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
2581
2582	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
2583	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
2584	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
2585
2586	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
2587	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
2588	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
2589
2590	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
2591	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
2592	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
2593
2594	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
2595	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
2596
2597	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
2598	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
2599	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
2600
2601	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
2602	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
2603
2604	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
2605	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
2606	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
2607
2608	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
2609
2610	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
2611
2612	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
2613
2614	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
2615	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
2616	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
2617
2618	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
2619	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
2620	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
2621
2622	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
2623	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
2624	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
2625
2626	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
2627	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
2628	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
2629
2630	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
2631	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
2632	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
2633
2634	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
2635	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
2636	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
2637
2638	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
2639
2640	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
2641	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
2642	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
2643
2644	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
2645	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
2646	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
2647
2648	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
2649	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
2650	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
2651
2652	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
2653	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
2654
2655	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
2656	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
2657
2658	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
2659	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
2660
2661	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
2662	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
2663
2664	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
2665	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
2666
2667	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
2668	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
2669	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
2670
2671	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
2672	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
2673	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
2674
2675	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
2676	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
2677	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
2678
2679        ld1     { $acc_lb}, [$current_tag]
2680	ext     $acc_lb, $acc_lb, $acc_lb, #8
2681	rev64   $acc_lb, $acc_lb
2682	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
2683
2684	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
2685	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
2686
2687	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
2688	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
2689
2690	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
2691	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
2692
2693	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 14 - round 10
2694	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
2695	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 11 - round 10
2696
2697	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 9 - round 10
2698	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 13 - round 10
2699	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 12 - round 10
2700
2701	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8 - round 10
2702	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 10 - round 10
2703	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 15 - round 10
2704
2705	aese	$ctr6b, $rk11						@ AES block 14 - round 11
2706	aese	$ctr3b, $rk11						@ AES block 11 - round 11
2707
2708	aese	$ctr4b, $rk11						@ AES block 12 - round 11
2709	aese	$ctr7b, $rk11						@ AES block 15 - round 11
2710	ldr	$rk12q, [$cc, #192]					@ load rk12
2711
2712	aese	$ctr1b, $rk11						@ AES block 9 - round 11
2713	aese	$ctr5b, $rk11						@ AES block 13 - round 11
2714
2715	aese	$ctr2b, $rk11						@ AES block 10 - round 11
2716	aese	$ctr0b, $rk11						@ AES block 8 - round 11
2717	b.ge	.L192_enc_tail						@ handle tail
2718
2719	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
2720
2721	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
2722
2723	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
2724
2725	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
2726
2727	eor3	$res0b, $ctr_t0b, $ctr0b, $rk12				@ AES block 0 - result
2728	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
2729	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
2730
2731	eor3	$res3b, $ctr_t3b, $ctr3b, $rk12				@ AES block 3 - result
2732	eor3	$res1b, $ctr_t1b, $ctr1b, $rk12				@ AES block 1 - result
2733
2734	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
2735	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
2736	eor3	$res4b, $ctr_t4b, $ctr4b, $rk12				@ AES block 4 - result
2737
2738	eor3	$res5b, $ctr_t5b, $ctr5b, $rk12				@ AES block 5 - result
2739	eor3	$res7b, $ctr_t7b, $ctr7b, $rk12				@ AES block 7 - result
2740	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
2741
2742	eor3	$res2b, $ctr_t2b, $ctr2b, $rk12				@ AES block 2 - result
2743	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
2744	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
2745
2746	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
2747	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
2748
2749	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
2750	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
2751	eor3	$res6b, $ctr_t6b, $ctr6b, $rk12				@ AES block 6 - result
2752
2753	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
2754
2755	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
2756	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
2757	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
2758
2759	b.ge	.L192_enc_prepretail					@ do prepretail
2760
2761.L192_enc_main_loop:							@ main loop start
2762	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
2763	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
2764	rev64	$res2b, $res2b						@ GHASH block 8k+2
2765
2766	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
2767	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
2768	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
2769	ext     $h7.16b, $h7.16b, $h7.16b, #8
2770	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
2771	ext     $h8.16b, $h8.16b, $h8.16b, #8
2772
2773	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
2774	rev64	$res0b, $res0b						@ GHASH block 8k
2775	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
2776	ext     $h5.16b, $h5.16b, $h5.16b, #8
2777	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
2778	ext     $h6.16b, $h6.16b, $h6.16b, #8
2779
2780	rev64	$res1b, $res1b						@ GHASH block 8k+1
2781	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
2782	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
2783
2784	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
2785	rev64	$res3b, $res3b						@ GHASH block 8k+3
2786	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
2787
2788	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
2789	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
2790	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
2791
2792	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
2793	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
2794	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
2795
2796	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
2797	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
2798	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
2799
2800	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
2801	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
2802	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
2803
2804	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
2805	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
2806	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
2807
2808	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
2809	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
2810	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
2811	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
2812
2813	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
2814	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
2815	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
2816
2817	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
2818	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
2819	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
2820
2821	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
2822	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
2823	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
2824
2825	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
2826	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
2827	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
2828
2829	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
2830	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
2831	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
2832
2833	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
2834	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
2835	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
2836
2837	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
2838	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
2839	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
2840
2841	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
2842	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
2843	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
2844
2845	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
2846	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
2847	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
2848
2849	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
2850	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
2851	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
2852	ext     $h3.16b, $h3.16b, $h3.16b, #8
2853	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
2854	ext     $h4.16b, $h4.16b, $h4.16b, #8
2855
2856	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k - mid
2857	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
2858	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
2859
2860	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
2861	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
2862	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
2863
2864	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
2865	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
2866	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
2867
2868	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
2869	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
2870	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
2871
2872	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
2873	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
2874	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
2875
2876	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
2877	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
2878	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
2879
2880	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
2881	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
2882	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
2883
2884	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
2885	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
2886	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
2887	ext     $h1.16b, $h1.16b, $h1.16b, #8
2888	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
2889	ext     $h2.16b, $h2.16b, $h2.16b, #8
2890
2891	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
2892	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
2893	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
2894
2895	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
2896	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
2897	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
2898
2899	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
2900	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
2901
2902	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
2903	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
2904	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
2905
2906	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
2907	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
2908	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
2909
2910	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
2911	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
2912	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
2913
2914	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
2915	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
2916	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
2917
2918	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
2919	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
2920	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
2921
2922	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
2923	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
2924	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
2925
2926	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
2927	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
2928
2929	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
2930	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
2931	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
2932
2933	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
2934	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
2935
2936	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
2937	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
2938	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
2939
2940	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
2941	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
2942	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
2943
2944	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
2945	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
2946	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
2947
2948	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
2949	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
2950	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
2951
2952	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
2953	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
2954	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
2955
2956	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
2957	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
2958	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
2959
2960	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
2961	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
2962	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
2963
2964	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
2965	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
2966	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
2967
2968	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
2969	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
2970	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
2971
2972	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
2973	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
2974	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
2975
2976	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
2977	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
2978	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
2979
2980	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
2981	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
2982	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
2983
2984	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
2985	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
2986	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
2987
2988	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
2989	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
2990	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
2991
2992	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
2993	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
2994	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
2995
2996	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
2997	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
2998	ldr	$rk12q, [$cc, #192]					@ load rk12
2999	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
3000
3001	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
3002	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
3003	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
3004
3005	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
3006	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
3007	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load plaintext
3008
3009	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load plaintext
3010	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
3011	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
3012
3013	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
3014	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
3015
3016	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
3017	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
3018
3019	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
3020	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
3021	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
3022
3023	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
3024	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
3025	eor3	$res4b, $ctr_t4b, $ctr4b, $rk12				@ AES block 8k+12 - result
3026
3027	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
3028	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
3029	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
3030
3031	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
3032	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
3033	eor3	$res7b, $ctr_t7b, $ctr7b, $rk12				@ AES block 8k+15 - result
3034
3035	eor3	$res2b, $ctr_t2b, $ctr2b, $rk12				@ AES block 8k+10 - result
3036	eor3	$res0b, $ctr_t0b, $ctr0b, $rk12				@ AES block 8k+8 - result
3037	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
3038
3039	eor3	$res1b, $ctr_t1b, $ctr1b, $rk12				@ AES block 8k+9 - result
3040	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
3041	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
3042	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
3043
3044	eor3	$res6b, $ctr_t6b, $ctr6b, $rk12				@ AES block 8k+14 - result
3045	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
3046	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
3047
3048	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
3049	eor3	$res5b, $ctr_t5b, $ctr5b, $rk12				@ AES block 8k+13 - result
3050	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
3051
3052	eor3	$res3b, $ctr_t3b, $ctr3b, $rk12				@ AES block 8k+11 - result
3053	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
3054
3055	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
3056
3057	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
3058
3059	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
3060	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
3061	b.lt	.L192_enc_main_loop
3062
3063.L192_enc_prepretail:							@ PREPRETAIL
3064	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
3065	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
3066	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
3067
3068	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
3069	ext     $h7.16b, $h7.16b, $h7.16b, #8
3070	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
3071	ext     $h8.16b, $h8.16b, $h8.16b, #8
3072	rev64	$res0b, $res0b						@ GHASH block 8k
3073	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
3074
3075	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
3076	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
3077	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
3078	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
3079
3080	rev64	$res3b, $res3b						@ GHASH block 8k+3
3081	rev64	$res2b, $res2b						@ GHASH block 8k+2
3082	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
3083	ext     $h5.16b, $h5.16b, $h5.16b, #8
3084	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
3085	ext     $h6.16b, $h6.16b, $h6.16b, #8
3086
3087	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
3088	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
3089	rev64	$res1b, $res1b						@ GHASH block 8k+1
3090
3091	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
3092	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
3093	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
3094
3095	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
3096	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
3097	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
3098
3099	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
3100	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
3101	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
3102
3103	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
3104	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
3105	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3106
3107	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3108	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
3109	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
3110
3111	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
3112	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
3113	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
3114
3115	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
3116	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
3117	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
3118
3119	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
3120	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
3121	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
3122
3123	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
3124	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
3125	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
3126
3127	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
3128	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
3129	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
3130
3131	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
3132	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
3133	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
3134
3135	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
3136	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
3137	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
3138
3139	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
3140	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
3141	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
3142
3143	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
3144	rev64	$res5b, $res5b						@ GHASH block 8k+5 (t0, t1, t2 and t3 free)
3145	rev64	$res6b, $res6b						@ GHASH block 8k+6 (t0, t1, and t2 free)
3146
3147	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
3148	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
3149	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
3150
3151	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
3152	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
3153	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
3154
3155	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
3156	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
3157	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
3158
3159	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
3160	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
3161	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
3162
3163	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
3164	ext     $h3.16b, $h3.16b, $h3.16b, #8
3165	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
3166	ext     $h4.16b, $h4.16b, $h4.16b, #8
3167	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
3168	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
3169
3170	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
3171	ext     $h1.16b, $h1.16b, $h1.16b, #8
3172	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
3173	ext     $h2.16b, $h2.16b, $h2.16b, #8
3174	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
3175	rev64	$res4b, $res4b						@ GHASH block 8k+4 (t0, t1, and t2 free)
3176
3177	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
3178	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
3179	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
3180
3181	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
3182	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
3183	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
3184
3185	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
3186	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
3187	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
3188
3189	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
3190	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
3191	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
3192
3193	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
3194	rev64	$res7b, $res7b						@ GHASH block 8k+7 (t0, t1, t2 and t3 free)
3195	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
3196	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
3197
3198	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
3199	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
3200	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
3201
3202	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
3203	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
3204	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
3205
3206	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
3207	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
3208
3209	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
3210	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
3211	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
3212
3213	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
3214	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
3215	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
3216
3217	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
3218	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
3219	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
3220
3221	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
3222	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
3223	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
3224
3225	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
3226	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
3227
3228	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
3229	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
3230	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
3231
3232	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
3233	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
3234	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
3235
3236	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
3237	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
3238	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
3239
3240	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
3241	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
3242	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
3243
3244	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
3245	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
3246	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
3247
3248	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
3249	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
3250	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
3251
3252	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
3253	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
3254	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
3255
3256	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
3257	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
3258
3259	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
3260	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
3261
3262	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
3263	ext	$t12.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
3264	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
3265	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
3266
3267	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
3268	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
3269
3270	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
3271	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
3272	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
3273
3274	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
3275	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
3276	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
3277
3278	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
3279	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
3280	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
3281
3282	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
3283	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
3284	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
3285
3286	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
3287	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
3288
3289	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
3290	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
3291	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
3292	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
3293
3294	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
3295	ldr	$rk12q, [$cc, #192]					@ load rk12
3296
3297	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
3298	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
3299	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
3300
3301	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
3302	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
3303	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
3304
3305	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
3306	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
3307
3308	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
3309	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
3310
3311	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
3312	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
3313
3314	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
3315	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
3316	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
3317
3318	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
3319	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
3320	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
3321
3322.L192_enc_tail:								@ TAIL
3323
3324	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h, h6k | h5k
3325        ext     $h5.16b, $h5.16b, $h5.16b, #8
3326	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process
3327
3328	ldr	$ctr_t0q, [$input_ptr], #16				@ AES block 8k+8 - load plaintext
3329
3330	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k
3331        ext     $h8.16b, $h8.16b, $h8.16b, #8
3332
3333	mov	$t1.16b, $rk12
3334
3335	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h
3336        ext     $h6.16b, $h6.16b, $h6.16b, #8
3337	ext     $h7.16b, $h7.16b, $h7.16b, #8
3338	cmp	$main_end_input_ptr, #112
3339
3340	eor3	$res1b, $ctr_t0b, $ctr0b, $t1.16b			@ AES block 8k+8 - result
3341	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
3342	b.gt	.L192_enc_blocks_more_than_7
3343
3344	cmp	$main_end_input_ptr, #96
3345	mov	$ctr7b, $ctr6b
3346	movi	$acc_h.8b, #0
3347
3348	mov	$ctr6b, $ctr5b
3349	movi	$acc_l.8b, #0
3350	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3351
3352	mov	$ctr5b, $ctr4b
3353	mov	$ctr4b, $ctr3b
3354	mov	$ctr3b, $ctr2b
3355
3356	mov	$ctr2b, $ctr1b
3357	movi	$acc_m.8b, #0
3358	b.gt	.L192_enc_blocks_more_than_6
3359
3360	mov	$ctr7b, $ctr6b
3361	cmp	$main_end_input_ptr, #80
3362
3363	mov	$ctr6b, $ctr5b
3364	mov	$ctr5b, $ctr4b
3365	mov	$ctr4b, $ctr3b
3366
3367	mov	$ctr3b, $ctr1b
3368	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3369	b.gt	.L192_enc_blocks_more_than_5
3370
3371	cmp	$main_end_input_ptr, #64
3372	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3373
3374	mov	$ctr7b, $ctr6b
3375	mov	$ctr6b, $ctr5b
3376	mov	$ctr5b, $ctr4b
3377
3378	mov	$ctr4b, $ctr1b
3379	b.gt	.L192_enc_blocks_more_than_4
3380
3381	mov	$ctr7b, $ctr6b
3382	mov	$ctr6b, $ctr5b
3383	mov	$ctr5b, $ctr1b
3384
3385	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3386	cmp	$main_end_input_ptr, #48
3387	b.gt	.L192_enc_blocks_more_than_3
3388
3389	mov	$ctr7b, $ctr6b
3390	mov	$ctr6b, $ctr1b
3391	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3392
3393	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
3394	cmp	$main_end_input_ptr, #32
3395	b.gt	.L192_enc_blocks_more_than_2
3396
3397	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3398
3399	cmp	$main_end_input_ptr, #16
3400	mov	$ctr7b, $ctr1b
3401	b.gt	.L192_enc_blocks_more_than_1
3402
3403	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3404	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
3405	b	 .L192_enc_blocks_less_than_1
3406.L192_enc_blocks_more_than_7:						@ blocks left >  7
3407	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-7 block  - store result
3408
3409	rev64	$res0b, $res1b						@ GHASH final-7 block
3410	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
3411
3412	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3413
3414	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
3415
3416	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-6 block - load plaintext
3417
3418	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
3419	movi	$t0.8b, #0						@ suppress further partial tag feed in
3420	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
3421
3422	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
3423
3424	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
3425	eor3	$res1b, $ctr_t1b, $ctr1b, $t1.16b			@ AES final-6 block - result
3426.L192_enc_blocks_more_than_6:						@ blocks left >  6
3427
3428	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-6 block - store result
3429
3430	rev64	$res0b, $res1b						@ GHASH final-6 block
3431
3432	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-5 block - load plaintext
3433
3434	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3435
3436	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
3437
3438	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
3439	eor3	$res1b, $ctr_t1b, $ctr2b, $t1.16b			@ AES final-5 block - result
3440
3441	movi	$t0.8b, #0						@ suppress further partial tag feed in
3442	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
3443	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
3444
3445	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
3446
3447	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
3448	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
3449
3450	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
3451.L192_enc_blocks_more_than_5:						@ blocks left >  5
3452
3453	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-5 block - store result
3454
3455	rev64	$res0b, $res1b						@ GHASH final-5 block
3456
3457	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3458
3459	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
3460
3461	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-4 block - load plaintext
3462	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
3463
3464	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
3465	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
3466
3467	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
3468	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
3469
3470	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
3471	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
3472
3473	eor3	$res1b, $ctr_t1b, $ctr3b, $t1.16b			@ AES final-4 block - result
3474	movi	$t0.8b, #0						@ suppress further partial tag feed in
3475
3476	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
3477.L192_enc_blocks_more_than_4:						@ blocks left >  4
3478
3479	st1	{ $res1b}, [$output_ptr], #16				@ AES final-4 block - store result
3480
3481	rev64	$res0b, $res1b						@ GHASH final-4 block
3482
3483	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3484
3485	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-3 block - load plaintext
3486	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
3487	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
3488
3489	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
3490	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
3491
3492	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
3493
3494	movi	$t0.8b, #0						@ suppress further partial tag feed in
3495	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
3496
3497	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
3498
3499	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
3500	eor3	$res1b, $ctr_t1b, $ctr4b, $t1.16b			@ AES final-3 block - result
3501.L192_enc_blocks_more_than_3:						@ blocks left >  3
3502
3503	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
3504	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-3 block - store result
3505
3506	rev64	$res0b, $res1b						@ GHASH final-3 block
3507
3508	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3509	movi	$t0.8b, #0						@ suppress further partial tag feed in
3510
3511	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-2 block - load plaintext
3512	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
3513	ext     $h4.16b, $h4.16b, $h4.16b, #8
3514
3515	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
3516
3517	eor3	$res1b, $ctr_t1b, $ctr5b, $t1.16b			@ AES final-2 block - result
3518	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
3519
3520	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
3521	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
3522
3523	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
3524	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
3525
3526	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
3527
3528	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
3529	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
3530.L192_enc_blocks_more_than_2:						@ blocks left >  2
3531
3532	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-2 block - store result
3533
3534	rev64	$res0b, $res1b						@ GHASH final-2 block
3535	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
3536	ext     $h3.16b, $h3.16b, $h3.16b, #8
3537
3538	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3539
3540	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-1 block - load plaintext
3541	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
3542
3543	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
3544
3545	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
3546	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
3547	movi	$t0.8b, #0						@ suppress further partial tag feed in
3548
3549	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
3550
3551	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
3552	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
3553
3554	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
3555	eor3	$res1b, $ctr_t1b, $ctr6b, $t1.16b			@ AES final-1 block - result
3556.L192_enc_blocks_more_than_1:						@ blocks left >  1
3557
3558	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
3559	ext     $h2.16b, $h2.16b, $h2.16b, #8
3560	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-1 block - store result
3561
3562	rev64	$res0b, $res1b						@ GHASH final-1 block
3563
3564	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3565
3566	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
3567	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
3568
3569	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
3570	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
3571	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
3572
3573	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final block - load plaintext
3574	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
3575
3576	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
3577
3578	eor3	$res1b, $ctr_t1b, $ctr7b, $t1.16b			@ AES final block - result
3579	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
3580
3581	movi	$t0.8b, #0						@ suppress further partial tag feed in
3582
3583	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
3584	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
3585.L192_enc_blocks_less_than_1:						@ blocks left <= 1
3586
3587	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
3588	and	$bit_length, $bit_length, #127				@ bit_length %= 128
3589
3590	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
3591
3592	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
3593
3594	and	$bit_length, $bit_length, #127				@ bit_length %= 128
3595
3596	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
3597	cmp	$bit_length, #64
3598	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
3599
3600	csel	$temp2_x, $temp1_x, $temp0_x, lt
3601	csel	$temp3_x, $temp0_x, xzr, lt
3602
3603	mov	$ctr0.d[1], $temp3_x
3604	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
3605	ext     $h1.16b, $h1.16b, $h1.16b, #8
3606
3607	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
3608	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
3609
3610	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
3611
3612	rev64	$res0b, $res1b						@ GHASH final block
3613	bif	$res1b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
3614
3615	st1	{ $res1b}, [$output_ptr]				@ store all 16B
3616
3617	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
3618
3619	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
3620	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
3621
3622	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
3623	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
3624
3625	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
3626
3627	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
3628
3629	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
3630	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
3631
3632	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
3633	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
3634
3635	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
3636
3637	str	$rtmp_ctrq, [$counter]					@ store the updated counter
3638	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
3639
3640	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
3641
3642	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
3643
3644	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
3645	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
3646
3647	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
3648		ext	$acc_lb, $acc_lb, $acc_lb, #8
3649	rev64	$acc_lb, $acc_lb
3650	st1	{ $acc_l.16b }, [$current_tag]
3651
3652	mov	x0, $byte_length					@ return sizes
3653
3654	ldp	d10, d11, [sp, #16]
3655	ldp	d12, d13, [sp, #32]
3656	ldp	d14, d15, [sp, #48]
3657	ldp	d8, d9, [sp], #80
3658	ret
3659
3660.L192_enc_ret:
3661	mov w0, #0x0
3662	ret
3663.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
3664___
3665
3666#########################################################################################
3667# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const uint8_t * ciphertext,
3668#                                            uint64_t plaintext_length, /* in bits; shifted right by 3 for the byte count */
3669#                                            uint8_t * plaintext,
3670#                                            uint64_t *Xi,
3671#                                            unsigned char ivec[16],
3672#                                            const void *key);
3673#
3674$code.=<<___;
3675.global unroll8_eor3_aes_gcm_dec_192_kernel
3676.type   unroll8_eor3_aes_gcm_dec_192_kernel,%function
3677.align  4
3678unroll8_eor3_aes_gcm_dec_192_kernel:
3679	AARCH64_VALID_CALL_TARGET
3680	cbz	x1, .L192_dec_ret
3681	stp	d8, d9, [sp, #-80]!
3682	lsr	$byte_length, $bit_length, #3
3683	mov	$counter, x4
3684	mov	$cc, x5
3685	stp	d10, d11, [sp, #16]
3686	stp	d12, d13, [sp, #32]
3687	stp	d14, d15, [sp, #48]
3688        mov     x5, #0xc200000000000000
3689	stp     x5, xzr, [sp, #64]
3690	add     $modulo_constant, sp, #64
3691
3692	mov	$main_end_input_ptr, $byte_length
3693	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
3694	ld1	{ $acc_lb}, [$current_tag]
3695
3696		mov	$constant_temp, #0x100000000			@ set up counter increment
3697	movi	$rctr_inc.16b, #0x0
3698	mov	$rctr_inc.d[1], $constant_temp
3699
3700	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
3701
3702	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
3703
3704	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
3705	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
3706
3707	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
3708	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
3709
3710	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
3711	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
3712
3713	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
3714	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
3715
3716	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
3717	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
3718	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
3719
3720	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
3721	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
3722
3723	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
3724
3725	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
3726	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
3727	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
3728
3729	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
3730	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
3731	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
3732
3733	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
3734	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
3735	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
3736
3737	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
3738
3739	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
3740
3741	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
3742	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
3743	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
3744
3745	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
3746	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
3747
3748	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
3749	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
3750	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
3751
3752	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
3753	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
3754	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
3755
3756	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
3757	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
3758	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
3759
3760	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
3761
3762	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
3763	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
3764	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
3765
3766	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
3767	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
3768
3769	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
3770	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
3771	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
3772
3773	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
3774	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
3775	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
3776
3777	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
3778	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
3779	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
3780
3781	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
3782	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
3783	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
3784
3785	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
3786	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
3787
3788	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
3789	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
3790	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
3791
3792	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
3793	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
3794	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
3795
3796	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
3797
3798	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
3799	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
3800	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
3801
3802	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
3803	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
3804	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
3805
3806	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
3807	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
3808	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
3809
3810	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
3811
3812	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
3813	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
3814
3815	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
3816	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
3817	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
3818
3819	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
3820	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
3821	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
3822
3823	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
3824	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
3825	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3826
3827	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
3828	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
3829	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
3830
3831	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
3832	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
3833	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
3834
3835	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
3836	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
3837
3838	ld1	{ $acc_lb}, [$current_tag]
3839	ext	$acc_lb, $acc_lb, $acc_lb, #8
3840	rev64	$acc_lb, $acc_lb
3841
3842	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
3843
3844	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
3845	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
3846
3847	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
3848	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
3849	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
3850
3851	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
3852	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
3853
3854	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
3855	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
3856
3857	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 10
3858	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 10
3859	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 10
3860
3861	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 10
3862	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 10
3863	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 10
3864
3865	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 10
3866	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 10
3867	ldr	$rk12q, [$cc, #192]					@ load rk12
3868
3869	aese	$ctr0b, $rk11						@ AES block 0 - round 11
3870	aese	$ctr1b, $rk11						@ AES block 1 - round 11
3871	aese	$ctr4b, $rk11						@ AES block 4 - round 11
3872
3873	aese	$ctr6b, $rk11						@ AES block 6 - round 11
3874	aese	$ctr5b, $rk11						@ AES block 5 - round 11
3875	aese	$ctr7b, $rk11						@ AES block 7 - round 11
3876
3877	aese	$ctr2b, $rk11						@ AES block 2 - round 11
3878	aese	$ctr3b, $rk11						@ AES block 3 - round 11
3879	b.ge	.L192_dec_tail						@ handle tail
3880
3881	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 0, 1 - load ciphertext
3882
3883	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 2, 3 - load ciphertext
3884
3885	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 4, 5 - load ciphertext
3886
3887	eor3	$ctr1b, $res1b, $ctr1b, $rk12				@ AES block 1 - result
3888	eor3	$ctr0b, $res0b, $ctr0b, $rk12				@ AES block 0 - result
3889	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 0, 1 - store result
3890
3891	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
3892	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
3893
3894	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
3895	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
3896	eor3	$ctr3b, $res3b, $ctr3b, $rk12				@ AES block 3 - result
3897
3898	eor3	$ctr2b, $res2b, $ctr2b, $rk12				@ AES block 2 - result
3899	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 2, 3 - store result
3900	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 6, 7 - load ciphertext
3901
3902	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
3903	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
3904
3905	eor3	$ctr4b, $res4b, $ctr4b, $rk12				@ AES block 4 - result
3906
3907	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
3908	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
3909
3910	eor3	$ctr5b, $res5b, $ctr5b, $rk12				@ AES block 5 - result
3911	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 4, 5 - store result
3912	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
3913
3914	eor3	$ctr6b, $res6b, $ctr6b, $rk12				@ AES block 6 - result
3915	eor3	$ctr7b, $res7b, $ctr7b, $rk12				@ AES block 7 - result
3916	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
3917
3918	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
3919	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 6, 7 - store result
3920	b.ge	.L192_dec_prepretail					@ do prepretail
3921
3922.L192_dec_main_loop:							@ main loop start
3923	rev64	$res1b, $res1b						@ GHASH block 8k+1
3924	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
3925	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
3926
3927	rev64	$res0b, $res0b						@ GHASH block 8k
3928	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
3929	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
3930
3931	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
3932	ext     $h7.16b, $h7.16b, $h7.16b, #8
3933	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
3934	ext     $h8.16b, $h8.16b, $h8.16b, #8
3935	rev64	$res4b, $res4b						@ GHASH block 8k+4
3936	rev64	$res3b, $res3b						@ GHASH block 8k+3
3937
3938	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
3939	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
3940	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
3941
3942	rev64	$res5b, $res5b						@ GHASH block 8k+5
3943
3944	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
3945	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
3946	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
3947
3948	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
3949	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
3950	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
3951
3952	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
3953	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
3954	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
3955
3956	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
3957	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
3958	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
3959
3960	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
3961	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
3962	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
3963	ext     $h5.16b, $h5.16b, $h5.16b, #8
3964	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
3965	ext     $h6.16b, $h6.16b, $h6.16b, #8
3966
3967	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
3968	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
3969	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
3970
3971	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
3972	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
3973	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
3974
3975	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3976	rev64	$res2b, $res2b						@ GHASH block 8k+2
3977	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
3978
3979	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
3980	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
3981	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
3982	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
3983
3984	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
3985	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
3986	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
3987
3988	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
3989	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
3990	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
3991
3992	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
3993	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
3994	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
3995
3996	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
3997	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
3998	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
3999
4000	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
4001	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
4002	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
4003
4004	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
4005	ext     $h3.16b, $h3.16b, $h3.16b, #8
4006	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
4007	ext     $h4.16b, $h4.16b, $h4.16b, #8
4008	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
4009	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
4010
4011	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
4012	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
4013	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
4014
4015	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
4016	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
4017
4018	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
4019	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
4020	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
4021
4022	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
4023	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
4024	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
4025
4026	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
4027	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
4028
4029	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
4030	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
4031	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
4032
4033	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
4034	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
4035	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
4036
4037	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
4038	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
4039	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
4040
4041	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
4042	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
4043	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
4044
4045	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
4046	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
4047	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
4048
4049	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
4050	ext     $h1.16b, $h1.16b, $h1.16b, #8
4051	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
4052	ext     $h2.16b, $h2.16b, $h2.16b, #8
4053	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
4054	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
4055
4056	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
4057	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
4058	rev64	$res7b, $res7b						@ GHASH block 8k+7
4059
4060	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
4061	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
4062	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
4063
4064	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
4065	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
4066	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
4067
4068	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
4069	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
4070	rev64	$res6b, $res6b						@ GHASH block 8k+6
4071
4072	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
4073	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
4074	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
4075	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
4076
4077	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
4078	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
4079	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
4080
4081	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
4082	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
4083	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
4084
4085	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
4086	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
4087	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
4088
4089	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
4090	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
4091	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
4092
4093	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
4094	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
4095	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
4096
4097	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
4098	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
4099	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
4100
4101	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
4102	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
4103	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
4104
4105	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
4106	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
4107	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
4108
4109	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
4110	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
4111	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
4112
4113	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
4114	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
4115	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
4116
4117	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
4118	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
4119	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
4120
4121	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
4122	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
4123	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
4124
4125	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
4126	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
4127	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
4128
4129	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
4130	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
4131	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
4132
4133	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
4134	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
4135	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
4136
4137	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
4138	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
4139	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
4140
4141	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
4142	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
4143	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
4144
4145	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
4146	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load ciphertext
4147
4148	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
4149	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
4150	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load ciphertext
4151
4152	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
4153	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
4154	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
4155
4156	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
4157	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
4158	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
4159
4160	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
4161	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
4162	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load ciphertext
4163
4164	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
4165	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
4166	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
4167
4168	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
4169	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
4170	ldr	$rk12q, [$cc, #192]					@ load rk12
4171
4172	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load ciphertext
4173	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
4174	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
4175
4176	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11
4177	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
4178	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
4179
4180	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
4181	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
4182	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
4183
4184	eor3	$ctr0b, $res0b, $ctr0b, $rk12				@ AES block 8k+8 - result
4185	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
4186	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
4187
4188	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
4189	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
4190	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
4191
4192	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
4193	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
4194	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
4195
4196	eor3	$ctr1b, $res1b, $ctr1b, $rk12				@ AES block 8k+9 - result
4197	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
4198	eor3	$ctr3b, $res3b, $ctr3b, $rk12				@ AES block 8k+11 - result
4199
4200	eor3	$ctr2b, $res2b, $ctr2b, $rk12				@ AES block 8k+10 - result
4201	eor3	$ctr7b, $res7b, $ctr7b, $rk12				@ AES block 8k+15 - result
4202	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
4203
4204	eor3	$ctr5b, $res5b, $ctr5b, $rk12				@ AES block 8k+13 - result
4205	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
4206	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
4207
4208	eor3	$ctr4b, $res4b, $ctr4b, $rk12				@ AES block 8k+12 - result
4209	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
4210	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
4211
4212	eor3	$ctr6b, $res6b, $ctr6b, $rk12				@ AES block 8k+14 - result
4213	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
4214	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
4215
4216	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
4217	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
4218
4219	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
4220	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
4221	b.lt	.L192_dec_main_loop
4222
4223.L192_dec_prepretail:							@ PREPRETAIL - AES rounds 0-11 on counter blocks 8k+8..8k+15, interleaved with GHASH and modulo of blocks 8k..8k+7; no ciphertext is loaded or stored in this section
4224	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
4225	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
4226	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
4227
4228	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
4229	ext     $h7.16b, $h7.16b, $h7.16b, #8			@ swap the two 64b halves of h7
4230	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
4231	ext     $h8.16b, $h8.16b, $h8.16b, #8			@ swap the two 64b halves of h8
4232	rev64	$res0b, $res0b						@ GHASH block 8k
4233	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0 - swap the two 64b halves of the partial tag
4234
4235	rev64	$res3b, $res3b						@ GHASH block 8k+3
4236	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
4237	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
4238
4239	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1 - feed the partial tag into block 8k
4240	rev64	$res2b, $res2b						@ GHASH block 8k+2
4241	rev64	$res1b, $res1b						@ GHASH block 8k+1
4242
4243	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
4244	ext     $h5.16b, $h5.16b, $h5.16b, #8			@ swap the two 64b halves of h5
4245	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
4246	ext     $h6.16b, $h6.16b, $h6.16b, #8			@ swap the two 64b halves of h6
4247	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
4248
4249	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
4250	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
4251	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
4252
4253	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
4254	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
4255	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
4256
4257	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
4258	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
4259	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
4260
4261	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
4262	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
4263	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
4264
4265	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
4266	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
4267	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
4268
4269	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
4270	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
4271	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
4272
4273	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
4274	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
4275	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
4276
4277	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
4278	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
4279	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
4280
4281	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
4282	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
4283	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
4284
4285	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
4286	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
4287	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
4288	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
4289
4290	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
4291	rev64	$res5b, $res5b						@ GHASH block 8k+5
4292	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
4293
4294	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
4295	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
4296	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
4297
4298	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
4299	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
4300	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
4301
4302	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
4303	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
4304	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
4305
4306	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k - mid
4307	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
4308	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
4309
4310	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
4311	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
4312	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
4313
4314	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
4315	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
4316	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
4317
4318	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
4319	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
4320	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
4321
4322	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
4323	ext     $h3.16b, $h3.16b, $h3.16b, #8			@ swap the two 64b halves of h3
4324	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
4325	ext     $h4.16b, $h4.16b, $h4.16b, #8			@ swap the two 64b halves of h4
4326	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
4327	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
4328
4329	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
4330	ext     $h1.16b, $h1.16b, $h1.16b, #8			@ swap the two 64b halves of h1
4331	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
4332	ext     $h2.16b, $h2.16b, $h2.16b, #8			@ swap the two 64b halves of h2
4333	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
4334	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
4335
4336	rev64	$res7b, $res7b						@ GHASH block 8k+7
4337
4338	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
4339	rev64	$res4b, $res4b						@ GHASH block 8k+4
4340
4341	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
4342	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
4343	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
4344
4345	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
4346	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
4347	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
4348
4349	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
4350	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
4351	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
4352
4353	rev64	$res6b, $res6b						@ GHASH block 8k+6
4354	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
4355	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
4356	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
4357
4358	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
4359	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
4360	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
4361
4362	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
4363	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
4364	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
4365
4366	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
4367	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
4368	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
4369
4370	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
4371
4372	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
4373	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
4374	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
4375
4376	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
4377	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
4378	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
4379
4380	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
4381	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
4382	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
4383
4384	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
4385	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
4386
4387	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
4388	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
4389	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
4390
4391	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
4392	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
4393	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
4394
4395	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
4396	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
4397	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
4398
4399	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
4400	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
4401	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
4402
4403	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
4404	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
4405	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
4406
4407	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
4408	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
4409	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
4410
4411	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
4412	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
4413	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
4414
4415	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
4416	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
4417	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
4418
4419	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
4420	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
4421	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
4422
4423	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
4424	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
4425
4426	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
4427	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
4428	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
4429
4430	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
4431	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
4432	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
4433
4434	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
4435	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
4436	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
4437
4438	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
4439	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
4440	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
4441
4442	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
4443	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
4444	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
4445
4446	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
4447	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
4448	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
4449
4450	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
4451	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
4452	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
4453
4454	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
4455	ldr	$rk12q, [$cc, #192]					@ load rk12
4456	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
4457
4458	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
4459	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
4460	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
4461
4462	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
4463	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
4464	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
4465
4466	aese	$ctr0b, $rk11						@ AES block 8k+8 - round 11 (no aesmc after this aese)
4467	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
4468	aese	$ctr5b, $rk11						@ AES block 8k+13 - round 11
4469
4470	aese	$ctr2b, $rk11						@ AES block 8k+10 - round 11
4471	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
4472	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
4473
4474	aese	$ctr6b, $rk11						@ AES block 8k+14 - round 11
4475	aese	$ctr4b, $rk11						@ AES block 8k+12 - round 11
4476	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
4477
4478	aese	$ctr3b, $rk11						@ AES block 8k+11 - round 11
4479	aese	$ctr1b, $rk11						@ AES block 8k+9 - round 11
4480	aese	$ctr7b, $rk11						@ AES block 8k+15 - round 11
4481
4482.L192_dec_tail:								@ TAIL - decrypt the first remaining block, then dispatch on the number of bytes left
4483
4484	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr 	@ main_end_input_ptr is number of bytes left to process
4485
4486	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h and h6k | h5k
4487        ext     $h5.16b, $h5.16b, $h5.16b, #8			@ swap the two 64b halves of h5
4488	ldr	$res1q, [$input_ptr], #16				@ AES block 8k+8 - load ciphertext
4489
4490	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k and h8l | h8h
4491        ext     $h8.16b, $h8.16b, $h8.16b, #8			@ swap the two 64b halves of h8
4492
4493	mov	$t1.16b, $rk12						@ copy rk12 into t1; every tail result below xors with t1
4494
4495	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h and h7l | h7h
4496        ext     $h6.16b, $h6.16b, $h6.16b, #8			@ swap the two 64b halves of h6
4497        ext     $h7.16b, $h7.16b, $h7.16b, #8			@ swap the two 64b halves of h7
4498	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
4499
4500	eor3	$res4b, $res1b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
4501	cmp	$main_end_input_ptr, #112				@ more than 7 blocks (112 bytes) left?
4502	b.gt	.L192_dec_blocks_more_than_7
4503
4504	mov	$ctr7b, $ctr6b						@ at most 7 blocks left - move ctr1..ctr6 up one slot so the later labels still find their counter block
4505	movi	$acc_h.8b, #0						@ zero the high accumulator (final-7 GHASH is skipped)
4506	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4507
4508	mov	$ctr6b, $ctr5b
4509	mov	$ctr5b, $ctr4b
4510	mov	$ctr4b, $ctr3b
4511
4512	cmp	$main_end_input_ptr, #96				@ more than 6 blocks (96 bytes) left?
4513	movi	$acc_l.8b, #0						@ zero the low accumulator
4514	mov	$ctr3b, $ctr2b
4515
4516	mov	$ctr2b, $ctr1b
4517	movi	$acc_m.8b, #0						@ zero the mid accumulator
4518	b.gt	.L192_dec_blocks_more_than_6
4519
4520	mov	$ctr7b, $ctr6b						@ at most 6 blocks left - shift up one more slot
4521	mov	$ctr6b, $ctr5b
4522	mov	$ctr5b, $ctr4b
4523
4524	mov	$ctr4b, $ctr3b
4525	mov	$ctr3b, $ctr1b
4526
4527	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4528	cmp	$main_end_input_ptr, #80				@ more than 5 blocks (80 bytes) left?
4529	b.gt	.L192_dec_blocks_more_than_5
4530
4531	mov	$ctr7b, $ctr6b						@ at most 5 blocks left - shift up one more slot
4532	mov	$ctr6b, $ctr5b
4533
4534	mov	$ctr5b, $ctr4b
4535	mov	$ctr4b, $ctr1b
4536	cmp	$main_end_input_ptr, #64				@ more than 4 blocks (64 bytes) left?
4537
4538	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4539	b.gt	.L192_dec_blocks_more_than_4
4540
4541	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4542	mov	$ctr7b, $ctr6b						@ at most 4 blocks left - shift up one more slot
4543	mov	$ctr6b, $ctr5b
4544
4545	mov	$ctr5b, $ctr1b
4546	cmp	$main_end_input_ptr, #48				@ more than 3 blocks (48 bytes) left?
4547	b.gt	.L192_dec_blocks_more_than_3
4548
4549	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4550	mov	$ctr7b, $ctr6b						@ at most 3 blocks left - shift up one more slot
4551	cmp	$main_end_input_ptr, #32				@ more than 2 blocks (32 bytes) left?
4552
4553	mov	$ctr6b, $ctr1b
4554	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k (the blocks_more_than_2 code uses it without loading it)
4555	b.gt	.L192_dec_blocks_more_than_2
4556
4557	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4558
4559	mov	$ctr7b, $ctr1b						@ at most 2 blocks left
4560	cmp	$main_end_input_ptr, #16				@ more than 1 block (16 bytes) left?
4561	b.gt	.L192_dec_blocks_more_than_1
4562
4563	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the counter back by one increment
4564	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k (this path skips blocks_more_than_1, which also loads it)
4565	b	 .L192_dec_blocks_less_than_1
4566.L192_dec_blocks_more_than_7:						@ blocks left >  7 - GHASH the final-7 block and decrypt the final-6 block
4567	rev64	$res0b, $res1b						@ GHASH final-7 block - byte-reverse each 64b half of the ciphertext
4568
4569	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid - take the upper 64b lane of the h8k | h7k pair
4570	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4571
4572	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
4573	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid - upper 64b of the block into a temporary
4574	ldr	$res1q, [$input_ptr], #16				@ AES final-6 block - load ciphertext
4575
4576	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
4577
4578	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid - xor in the lower 64b of the block
4579	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-7 block  - store result
4580
4581	eor3	$res4b, $res1b, $ctr1b, $t1.16b				@ AES final-6 block - result
4582
4583	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid - this first product initialises the mid accumulator
4584	movi	$t0.8b, #0						@ suppress further partial tag feed in
4585.L192_dec_blocks_more_than_6:						@ blocks left >  6 - GHASH the final-6 block and decrypt the final-5 block
4586
4587	rev64	$res0b, $res1b						@ GHASH final-6 block - byte-reverse each 64b half of the ciphertext
4588
4589	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4590
4591	ldr	$res1q, [$input_ptr], #16				@ AES final-5 block - load ciphertext
4592	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid - upper 64b of the block into a temporary
4593
4594	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid - xor in the lower 64b of the block
4595	movi	$t0.8b, #0						@ suppress further partial tag feed in
4596	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
4597
4598	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-6 block - store result
4599	eor3	$res4b, $res1b, $ctr2b, $t1.16b				@ AES final-5 block - result
4600
4601	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high - accumulate
4602	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid - multiply by the lower 64b lane of the h8k | h7k pair
4603	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
4604
4605	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid - accumulate
4606	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low - accumulate
4607.L192_dec_blocks_more_than_5:						@ blocks left >  5 - GHASH the final-5 block and decrypt the final-4 block
4608
4609	rev64	$res0b, $res1b						@ GHASH final-5 block - byte-reverse each 64b half of the ciphertext
4610
4611	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4612
4613	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid - upper 64b of the block into a temporary
4614
4615	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid - xor in the lower 64b of the block
4616
4617	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid - duplicate into the upper lane for the pmull2 below
4618	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
4619
4620	ldr	$res1q, [$input_ptr], #16				@ AES final-4 block - load ciphertext
4621
4622	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high - accumulate
4623	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
4624
4625	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid - multiply by the upper 64b lane of the h6k | h5k pair
4626
4627	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low - accumulate
4628	movi	$t0.8b, #0						@ suppress further partial tag feed in
4629	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-5 block - store result
4630
4631	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid - accumulate
4632	eor3	$res4b, $res1b, $ctr3b, $t1.16b				@ AES final-4 block - result
4633.L192_dec_blocks_more_than_4:						@ blocks left >  4 - GHASH the final-4 block and decrypt the final-3 block
4634
4635	rev64	$res0b, $res1b						@ GHASH final-4 block - byte-reverse each 64b half of the ciphertext
4636
4637	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4638	movi	$t0.8b, #0						@ suppress further partial tag feed in
4639
4640	ldr	$res1q, [$input_ptr], #16				@ AES final-3 block - load ciphertext
4641	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid - upper 64b of the block into a temporary
4642	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
4643
4644	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid - xor in the lower 64b of the block
4645
4646	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low - accumulate
4647
4648	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid - multiply by the lower 64b lane of the h6k | h5k pair
4649	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-4 block - store result
4650	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
4651
4652	eor3	$res4b, $res1b, $ctr4b, $t1.16b				@ AES final-3 block - result
4653
4654	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid - accumulate
4655	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high - accumulate
4656.L192_dec_blocks_more_than_3:						@ blocks left >  3 - GHASH the final-3 block and decrypt the final-2 block
4657
4658	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
4659	ext     $h4.16b, $h4.16b, $h4.16b, #8			@ swap the two 64b halves of h4
4660	rev64	$res0b, $res1b						@ GHASH final-3 block - byte-reverse each 64b half of the ciphertext
4661	ldr	$res1q, [$input_ptr], #16				@ AES final-2 block - load ciphertext
4662
4663	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4664
4665	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid - upper 64b of the block into a temporary
4666	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
4667
4668	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high - accumulate
4669	movi	$t0.8b, #0						@ suppress further partial tag feed in
4670	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
4671
4672	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-3 block - store result
4673	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid - xor in the lower 64b of the block
4674	eor3	$res4b, $res1b, $ctr5b, $t1.16b				@ AES final-2 block - result
4675
4676	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low - accumulate
4677	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k (also used by the final-2 block that follows)
4678
4679	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid - duplicate into the upper lane for the pmull2 below
4680
4681	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid - multiply by the upper 64b lane of the h4k | h3k pair
4682
4683	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid - accumulate
4684.L192_dec_blocks_more_than_2:						@ blocks left >  2 - GHASH the final-2 block and decrypt the final-1 block
4685
4686	rev64	$res0b, $res1b						@ GHASH final-2 block - byte-reverse each 64b half of the ciphertext
4687	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
4688	ext     $h3.16b, $h3.16b, $h3.16b, #8			@ swap the two 64b halves of h3
4689
4690	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4691
4692	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid - upper 64b of the block into a temporary
4693	ldr	$res1q, [$input_ptr], #16				@ AES final-1 block - load ciphertext
4694
4695	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
4696
4697	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid - xor in the lower 64b of the block
4698
4699	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high - accumulate
4700	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
4701
4702	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid - multiply by the lower 64b lane of the h4k | h3k pair (loaded before this label is reached)
4703	movi	$t0.8b, #0						@ suppress further partial tag feed in
4704
4705	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low - accumulate
4706	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-2 block - store result
4707
4708	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid - accumulate
4709	eor3	$res4b, $res1b, $ctr6b, $t1.16b				@ AES final-1 block - result
4710.L192_dec_blocks_more_than_1:						@ blocks left >  1
4711
4712	rev64	$res0b, $res1b						@ GHASH final-1 block
4713	ldr	$res1q, [$input_ptr], #16				@ AES final block - load ciphertext
	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
4715	ext     $h2.16b, $h2.16b, $h2.16b, #8
4716
4717	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4718	movi	$t0.8b, #0						@ suppress further partial tag feed in
4719	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
4720
4721	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
4722	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
4723	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-1 block - store result
4724
4725	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
4726
4727	eor3	$res4b, $res1b, $ctr7b, $t1.16b				@ AES final block - result
4728
4729	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
4730
4731	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
4732
4733	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
4734
4735	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
4736
4737	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
4738	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
4739.L192_dec_blocks_less_than_1:						@ blocks left <= 1
4740
4741	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
4742	and	$bit_length, $bit_length, #127				@ bit_length %= 128
4743
4744	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
4745	str	$rtmp_ctrq, [$counter]					@ store the updated counter
4746
4747	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
4748	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
4749
4750	and	$bit_length, $bit_length, #127				@ bit_length %= 128
4751
4752	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
4753	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
4754	cmp	$bit_length, #64
4755
4756	csel	$temp2_x, $temp1_x, $temp0_x, lt
4757	csel	$temp3_x, $temp0_x, xzr, lt
4758	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
4759	ext     $h1.16b, $h1.16b, $h1.16b, #8
4760
4761	mov	$ctr0.d[1], $temp3_x
4762	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
4763
4764	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
4765
4766	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
4767	bif	$res4b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
4768
4769	rev64	$res0b, $res1b						@ GHASH final block
4770
4771	st1	{ $res4b}, [$output_ptr]				@ store all 16B
4772
4773	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
4774
4775	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
4776	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
4777
4778	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
4779	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
4780	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
4781
4782	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
4783	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
4784
4785	eor	$t10.16b, $acc_hb, $acc_lb				@ MODULO - karatsuba tidy up
4786	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
4787	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
4788
4789	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
4790	ext	$acc_hb, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
4791
4792	eor	$acc_mb, $acc_mb, $t10.16b				@ MODULO - karatsuba tidy up
4793
4794	eor3	$acc_mb, $acc_mb, $acc_hb, $t11.16b			@ MODULO - fold into mid
4795
4796	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
4797	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
4798
4799	eor3	$acc_lb, $acc_lb, $acc_mb, $acc_hb			@ MODULO - fold into low
4800	ext	$acc_lb, $acc_lb, $acc_lb, #8
4801	rev64	$acc_lb, $acc_lb
4802	st1	{ $acc_l.16b }, [$current_tag]
4803
4804	mov	x0, $byte_length
4805
4806	ldp	d10, d11, [sp, #16]
4807	ldp	d12, d13, [sp, #32]
4808	ldp	d14, d15, [sp, #48]
4809	ldp	d8, d9, [sp], #80
4810	ret
4811
4812.L192_dec_ret:
4813	mov w0, #0x0
4814	ret
4815.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
4816___
4817}
4818
4819{
4820
# Symbolic register names interpolated into the assembly heredoc(s) that
# follow in this scope.  Many names intentionally resolve to the same
# physical register (see the notes on each group), so which name is "live"
# depends on where in the instruction stream it is used.

# General-purpose registers: x4-x7 and x13-x14.
my ($end_input_ptr, $main_end_input_ptr, $temp0_x, $temp1_x) = map { "x$_" } 4 .. 7;
my ($temp2_x, $temp3_x) = map { "x$_" } 13 .. 14;

# v0-v7 carry the $ctrN names, v8-v15 the $resN names; each is exposed in
# bare (vN), byte-vector (vN.16b), and scalar (dN / qN) spellings.
my ($ctr0, $ctr1, $ctr2, $ctr3, $ctr4, $ctr5, $ctr6, $ctr7,
    $res0, $res1, $res2, $res3, $res4, $res5, $res6, $res7)
    = map { "v$_" } 0 .. 15;
my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $ctr4b, $ctr5b, $ctr6b, $ctr7b,
    $res0b, $res1b, $res2b, $res3b, $res4b, $res5b, $res6b, $res7b)
    = map { "v$_.16b" } 0 .. 15;
my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $ctr4d, $ctr5d, $ctr6d, $ctr7d)
    = map { "d$_" } 0 .. 7;
my ($ctr0q, $ctr1q, $ctr2q, $ctr3q, $ctr4q, $ctr5q, $ctr6q, $ctr7q,
    $res0q, $res1q, $res2q, $res3q, $res4q, $res5q, $res6q, $res7q)
    = map { "q$_" } 0 .. 15;

# The $ctr_tN names are a second view of v8-v15, i.e. they overlap $resN.
my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3, $ctr_t4, $ctr_t5, $ctr_t6, $ctr_t7)
    = map { "v$_" } 8 .. 15;
my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b, $ctr_t4b, $ctr_t5b, $ctr_t6b, $ctr_t7b)
    = map { "v$_.16b" } 8 .. 15;
my ($ctr_t0q, $ctr_t1q, $ctr_t2q, $ctr_t3q, $ctr_t4q, $ctr_t5q, $ctr_t6q, $ctr_t7q)
    = map { "q$_" } 8 .. 15;

# Accumulator names (high / mid / low) live in v17-v19.
my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 17 .. 19;
my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 17 .. 19;

# Two sets of $h* names are laid over the same six registers v20-v25:
# $h1..$h4 (with $h12k/$h34k) and $h5..$h8 (with $h56k/$h78k).
my ($h1, $h12k, $h2, $h3, $h34k, $h4,
    $h5, $h56k, $h6, $h7, $h78k, $h8)
    = (map { "v$_" } 20 .. 25) x 2;
my ($h1q, $h12kq, $h2q, $h3q, $h34kq, $h4q,
    $h5q, $h56kq, $h6q, $h7q, $h78kq, $h8q)
    = (map { "q$_" } 20 .. 25) x 2;

# Temporaries.  Only v16 ($t0), v29 ($t1) and v21 ($t11) are named directly;
# the remaining $tN are aliases of $resN / $t0 / $t1.  Note that $t11 (v21)
# is the same register as $h12k / $h56k.
my ($t0, $t0d) = ("v16", "d16");
my $t1  = "v29";
my $t11 = "v21";
my ($t2, $t3)        = ($res1, $t1);
my ($t4, $t5, $t6)   = ($res0, $res2, $t0);
my ($t7, $t8, $t9)   = ($res3, $res4, $res5);
my ($t10, $t12)      = ($res6, $t1);

# Counter bookkeeping registers: v30 / v31.
my ($rtmp_ctr, $rtmp_ctrq) = ("v30", "q30");
my ($rctr_inc, $rctr_incd) = ("v31", "d31");

# The modulo constant is held in the same register as $t0 (v16 / d16).
my ($mod_constant, $mod_constantd) = ($t0, $t0d);

# All fifteen $rkN names rotate through just three registers, v26-v28:
# $rkN resolves to v(26 + N % 3).
my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7,
    $rk8, $rk9, $rk10, $rk11, $rk12, $rk13, $rk14)
    = (map { "v$_.16b" } 26 .. 28) x 5;
my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q,
    $rk8q, $rk9q, $rk10q, $rk11q, $rk12q, $rk13q, $rk14q)
    = (map { "q$_" } 26 .. 28) x 5;

# Extra spellings of v26-v28 with different arrangement suffixes.
my $rk2q1 = "v28.1q";
my $rk3q1 = "v26.1q";
my $rk4v  = "v27";
4881#########################################################################################
4882# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const uint8_t * plaintext,
4883#                                            uint64_t plaintext_length,
4884#                                            uint8_t * ciphertext,
4885#                                            uint64_t *Xi,
4886#                                            unsigned char ivec[16],
4887#                                            const void *key);
4888#
4889$code.=<<___;
4890.global unroll8_eor3_aes_gcm_enc_256_kernel
4891.type   unroll8_eor3_aes_gcm_enc_256_kernel,%function
4892.align  4
4893unroll8_eor3_aes_gcm_enc_256_kernel:
4894	AARCH64_VALID_CALL_TARGET
4895	cbz	x1, .L256_enc_ret
4896	stp	d8, d9, [sp, #-80]!
4897	lsr	$byte_length, $bit_length, #3
4898	mov	$counter, x4
4899	mov	$cc, x5
4900	stp	d10, d11, [sp, #16]
4901	stp	d12, d13, [sp, #32]
4902	stp	d14, d15, [sp, #48]
4903	mov	x5, #0xc200000000000000
4904	stp	x5, xzr, [sp, #64]
4905	add	$modulo_constant, sp, #64
4906
4907	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
4908
4909	mov	$main_end_input_ptr, $byte_length
4910
4911	mov	$constant_temp, #0x100000000			@ set up counter increment
4912	movi	$rctr_inc.16b, #0x0
4913	mov	$rctr_inc.d[1], $constant_temp
4914	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
4915
4916	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80	@ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4917
4918	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
4919
4920	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
4921
4922	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
4923
4924	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
4925	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
4926
4927	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
4928	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
4929
4930	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
4931	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
4932
4933	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
4934	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
4935
4936	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
4937	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
4938	ldp	$rk0q, $rk1q, [$cc, #0]				 	@ load rk0, rk1
4939
4940	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
4941	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
4942
4943	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
4944
4945	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
4946	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
4947	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
4948
4949	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
4950	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
4951	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 0
4952
4953	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
4954	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 0
4955	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
4956
4957	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 1
4958	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
4959	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
4960
4961	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 1
4962	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
4963
4964	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
4965
4966	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
4967
4968	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
4969	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
4970	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 1
4971
4972	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
4973	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
4974	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
4975
4976	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
4977	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
4978	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
4979
4980	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
4981	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
4982	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
4983
4984	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
4985
4986	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
4987	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
4988	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
4989
4990	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
4991	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
4992
4993	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
4994	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
4995	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
4996
4997	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
4998	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
4999
5000	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
5001	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
5002	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
5003
5004	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
5005	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
5006	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
5007
5008	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
5009	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
5010	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
5011
5012	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
5013	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
5014	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
5015
5016	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
5017	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
5018	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
5019
5020	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
5021	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
5022	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
5023
5024	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
5025	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
5026	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
5027
5028	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
5029	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
5030
5031	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
5032	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
5033	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
5034
5035	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
5036	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
5037
5038	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
5039
5040	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
5041	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
5042
5043	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
5044	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
5045	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
5046
5047	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
5048	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
5049	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
5050
5051	ld1	{ $acc_lb}, [$current_tag]
5052	ext	$acc_lb, $acc_lb, $acc_lb, #8
5053	rev64	$acc_lb, $acc_lb
5054	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
5055
5056	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
5057	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
5058	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
5059
5060	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
5061	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
5062	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
5063
5064	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
5065
5066	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 10
5067	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 10
5068	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
5069
5070	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 10
5071	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 10
5072	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 10
5073
5074	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 10
5075	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 10
5076	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 10
5077
5078	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 11
5079	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
5080	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 11
5081
5082	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 11
5083	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 11
5084	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 11
5085
5086	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 11
5087	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 11
5088	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 11
5089
5090	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 7
5091	ldr	$rk14q, [$cc, #224]					@ load rk14
5092
5093	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 12
5094	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 12
5095	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 12
5096
5097	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 12
5098	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 12
5099	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 12
5100
5101	aese	$ctr2b, $rk13						@ AES block 2 - round 13
5102	aese	$ctr1b, $rk13						@ AES block 1 - round 13
5103	aese	$ctr4b, $rk13						@ AES block 4 - round 13
5104
5105	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 12
5106	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 12
5107
5108	aese	$ctr0b, $rk13						@ AES block 0 - round 13
5109	aese	$ctr5b, $rk13						@ AES block 5 - round 13
5110
5111	aese	$ctr6b, $rk13						@ AES block 6 - round 13
5112	aese	$ctr7b, $rk13						@ AES block 7 - round 13
5113	aese	$ctr3b, $rk13						@ AES block 3 - round 13
5114
5115	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3		@ end_input_ptr
5116	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
5117	b.ge	.L256_enc_tail						@ handle tail
5118
5119	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 0, 1 - load plaintext
5120
5121	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 2, 3 - load plaintext
5122
5123	eor3	$res0b, $ctr_t0b, $ctr0b, $rk14				@ AES block 0 - result
5124	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
5125	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
5126
5127	eor3	$res1b, $ctr_t1b, $ctr1b, $rk14				@ AES block 1 - result
5128	eor3	$res3b, $ctr_t3b, $ctr3b, $rk14				@ AES block 3 - result
5129
5130	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
5131	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
5132	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
5133
5134	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
5135	eor3	$res2b, $ctr_t2b, $ctr2b, $rk14				@ AES block 2 - result
5136	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
5137
5138	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
5139	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
5140	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 0, 1 - store result
5141
5142	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 2, 3 - store result
5143
5144	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
5145	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
5146
5147	eor3	$res4b, $ctr_t4b, $ctr4b, $rk14				@ AES block 4 - result
5148
5149	eor3	$res7b, $ctr_t7b, $ctr7b, $rk14				@ AES block 7 - result
5150	eor3	$res6b, $ctr_t6b, $ctr6b, $rk14				@ AES block 6 - result
5151	eor3	$res5b, $ctr_t5b, $ctr5b, $rk14				@ AES block 5 - result
5152
5153	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
5154	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
5155
5156	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
5157	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
5158	b.ge	.L256_enc_prepretail					@ do prepretail
5159
5160.L256_enc_main_loop:							@ main loop start
5161	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
5162
5163	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
5164	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
5165	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
5166	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
5167
5168	rev64	$res3b, $res3b						@ GHASH block 8k+3
5169	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
5170	ext     $h5.16b, $h5.16b, $h5.16b, #8
5171	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
5172	ext     $h6.16b, $h6.16b, $h6.16b, #8
5173	rev64	$res1b, $res1b						@ GHASH block 8k+1
5174
5175	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
5176	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
5177	rev64	$res0b, $res0b						@ GHASH block 8k
5178
5179	rev64	$res4b, $res4b						@ GHASH block 8k+4
5180	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
5181	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
5182	ext     $h7.16b, $h7.16b, $h7.16b, #8
5183	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
5184	ext     $h8.16b, $h8.16b, $h8.16b, #8
5185
5186	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
5187	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
5188	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
5189
5190	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
5191	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
5192	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
5193
5194	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
5195	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
5196	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
5197
5198	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
5199	eor	$res0b, $res0b, $acc_lb				 	@ PRE 1
5200	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
5201
5202	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
5203	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
5204	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
5205
5206	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
5207	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
5208	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
5209
5210	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
5211	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
5212	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
5213
5214	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5215	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5216	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
5217
5218	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
5219	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
5220	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
5221
5222	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
5223	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
5224	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
5225
5226	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
5227	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
5228	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
5229
5230	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
5231	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
5232	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
5233
5234	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
5235	rev64	$res6b, $res6b						@ GHASH block 8k+6
5236	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
5237
5238	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
5239	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
5240	rev64	$res2b, $res2b						@ GHASH block 8k+2
5241
5242	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
5243	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
5244	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
5245
5246	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
5247	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
5248	rev64	$res5b, $res5b						@ GHASH block 8k+5
5249
5250	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
5251	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
5252	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
5253	ext     $h3.16b, $h3.16b, $h3.16b, #8
5254	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
5255	ext     $h4.16b, $h4.16b, $h4.16b, #8
5256
5257	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5258	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
5259	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
5260
5261	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
5262	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
5263	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
5264
5265	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
5266	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
5267	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
5268
5269	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5270	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
5271	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
5272
5273	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5274	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
5275	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
5276
5277	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
5278	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
5279	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
5280
5281	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
5282	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
5283	rev64	$res7b, $res7b						@ GHASH block 8k+7
5284
5285	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
5286	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
5287	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
5288
5289	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
5290	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
5291	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
5292
5293	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
5294	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
5295	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
5296
5297	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
5298	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
5299	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
5300
5301	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
5302	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
5303	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
5304
5305	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
5306	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
5307	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
5308
5309	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
5310	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
5311	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
5312
5313	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
5314	ext     $h1.16b, $h1.16b, $h1.16b, #8
5315	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
5316	ext     $h2.16b, $h2.16b, $h2.16b, #8
5317	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
5318	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
5319
5320	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
5321	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5322	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
5323	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
5324
5325	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
5326	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
5327	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
5328
5329	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5330	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
5331	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
5332
5333	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
5334	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
5335	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
5336
5337	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
5338	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5339	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
5340
5341	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
5342	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
5343	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
5344
5345	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
5346	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
5347	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
5348
5349	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
5350	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
5351	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
5352
5353	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
5354	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5355	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
5356
5357	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
5358	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
5359	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
5360
5361	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
5362	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
5363	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
5364
5365	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
5366	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
5367	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
5368
5369	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
5370	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
5371	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
5372
5373	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
5374	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
5375	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
5376
5377	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
5378
5379	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
5380	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
5381	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
5382
5383	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
5384	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
5385	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
5386
5387	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
5388	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
5389	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
5390
5391	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
5392	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
5393	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
5394
5395	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
5396
5397	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
5398	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
5399
5400	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
5401	ldp	$ctr_t0q, $ctr_t1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load plaintext
5402	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
5403
5404	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
5405	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
5406	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
5407
5408	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
5409	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
5410
5411	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
5412	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
5413
5414	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
5415	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
5416
5417	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
5418	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
5419	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
5420
5421	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
5422	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
5423	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
5424
5425	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
5426	ldr	$rk14q, [$cc, #224]					@ load rk14
5427	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
5428
5429	ldp	$ctr_t2q, $ctr_t3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load plaintext
5430	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
5431	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
5432
5433	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
5434	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 12
5435	ldp	$ctr_t4q, $ctr_t5q, [$input_ptr], #32			@ AES block 4, 5 - load plaintext
5436
5437	ldp	$ctr_t6q, $ctr_t7q, [$input_ptr], #32			@ AES block 6, 7 - load plaintext
5438	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
5439	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
5440
5441	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
5442	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
5443	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
5444
5445	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
5446	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
5447	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
5448
5449	eor3	$res2b, $ctr_t2b, $ctr2b, $rk14				@ AES block 8k+10 - result
5450	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
5451	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
5452
5453	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
5454	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
5455	eor3	$res5b, $ctr_t5b, $ctr5b, $rk14				@ AES block 5 - result
5456
5457	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
5458	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
5459	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
5460
5461	eor3	$res4b, $ctr_t4b, $ctr4b, $rk14				@ AES block 4 - result
5462	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
5463	eor3	$res3b, $ctr_t3b, $ctr3b, $rk14				@ AES block 8k+11 - result
5464
5465	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
5466	eor3	$res1b, $ctr_t1b, $ctr1b, $rk14				@ AES block 8k+9 - result
5467	eor3	$res0b, $ctr_t0b, $ctr0b, $rk14				@ AES block 8k+8 - result
5468
5469	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
5470	stp	$res0q, $res1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
5471	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
5472
5473	eor3	$res7b, $ctr_t7b, $ctr7b, $rk14				@ AES block 7 - result
5474	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
5475	stp	$res2q, $res3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
5476
5477	eor3	$res6b, $ctr_t6b, $ctr6b, $rk14				@ AES block 6 - result
5478	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
5479	stp	$res4q, $res5q, [$output_ptr], #32			@ AES block 4, 5 - store result
5480
5481	stp	$res6q, $res7q, [$output_ptr], #32			@ AES block 6, 7 - store result
5482	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
5483	b.lt	.L256_enc_main_loop
5484
5485.L256_enc_prepretail:							@ PREPRETAIL - GHASH res0-res7 while running AES rounds 0-13 on the next 8 counter blocks; no input is loaded or stored here
5486	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
5487	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
5488	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
5489
5490	rev64	$res2b, $res2b						@ GHASH block 8k+2
5491
5492	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
5493	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
5494
5495	rev64	$res5b, $res5b						@ GHASH block 8k+5
5496	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
5497	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
5498
5499	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
5500
5501	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
5502	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
5503	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
5504
5505	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
5506	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
5507
5508	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
5509	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
5510	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
5511
5512	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0 - swap the 64-bit halves of the accumulated hash
5513	rev64	$res0b, $res0b						@ GHASH block 8k
5514	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
5515
5516	rev64	$res1b, $res1b						@ GHASH block 8k+1
5517	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
5518	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
5519
5520	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
5521	ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
5522	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
5523	ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8
5524	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
5525
5526	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
5527	ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
5528	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
5529	ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
5530	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
5531	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
5532
5533	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
5534	eor	$res0b, $res0b, $acc_lb					@ PRE 1 - fold the accumulated hash into block 8k
5535
5536	rev64	$res3b, $res3b						@ GHASH block 8k+3
5537	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
5538
5539	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
5540	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
5541	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
5542
5543	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
5544	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
5545	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
5546
5547	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
5548	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
5549	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
5550
5551	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
5552	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5553	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
5554
5555	rev64	$res6b, $res6b						@ GHASH block 8k+6
5556	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
5557	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
5558
5559	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
5560	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
5561	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
5562
5563	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
5564	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
5565
5566	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
5567	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
5568	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
5569
5570	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
5571	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
5572	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
5573
5574	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
5575	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
5576	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
5577
5578	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
5579	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
5580	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
5581
5582	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
5583	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
5584	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
5585
5586	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
5587	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
5588	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
5589
5590	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
5591	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5592	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
5593
5594	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
5595	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
5596	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
5597
5598	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
5599	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
5600	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
5601
5602	rev64	$res4b, $res4b						@ GHASH block 8k+4
5603	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
5604	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
5605
5606	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
5607	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
5608	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
5609
5610	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
5611	ext     $h3.16b, $h3.16b, $h3.16b, #8				@ swap the 64-bit halves of h3
5612	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
5613	ext     $h4.16b, $h4.16b, $h4.16b, #8				@ swap the 64-bit halves of h4
5614	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
5615	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
5616
5617	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
5618	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
5619
5620	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
5621	rev64	$res7b, $res7b						@ GHASH block 8k+7
5622	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5623
5624	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
5625	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
5626	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
5627
5628	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
5629	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
5630	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
5631
5632	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
5633	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5634	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
5635	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
5636
5637	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
5638	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
5639	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
5640
5641	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
5642	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
5643	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
5644	ext     $h1.16b, $h1.16b, $h1.16b, #8				@ swap the 64-bit halves of h1
5645	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
5646	ext     $h2.16b, $h2.16b, $h2.16b, #8				@ swap the 64-bit halves of h2
5647
5648	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
5649	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
5650	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
5651
5652	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
5653	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
5654
5655	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
5656	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
5657	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
5658
5659	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
5660	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
5661	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
5662
5663	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
5664	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
5665	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
5666
5667	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5668	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
5669	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
5670
5671	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
5672	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
5673	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
5674
5675	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
5676	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
5677	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
5678
5679	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
5680	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
5681	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
5682
5683	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
5684	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
5685	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
5686
5687	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
5688	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
5689	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
5690
5691	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
5692	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
5693	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
5694
5695	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
5696	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
5697	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
5698
5699	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
5700	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
5701	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
5702
5703	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
5704
5705	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
5706	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
5707	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
5708
5709	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
5710	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
5711
5712	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
5713	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
5714	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
5715
5716	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
5717	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
5718	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
5719
5720	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
5721	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
5722	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
5723
5724	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
5725	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
5726	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
5727
5728	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
5729	ext	$t11.16b, $acc_hb, $acc_hb, #8			 	@ MODULO - other top alignment
5730	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
5731
5732	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
5733	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
5734	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
5735
5736	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
5737	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
5738	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
5739
5740	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
5741	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
5742	ldr	$rk14q, [$cc, #224]					@ load rk14
5743
5744	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 12
5745	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
5746	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
5747
5748	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
5749	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
5750	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
5751
5752	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
5753	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
5754
5755	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
5756	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
5757	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13 (last round has no mix-columns step; rk14 is xored in by the tail)
5758
5759	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
5760	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
5761	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
5762
5763	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
5764	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
5765	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
5766
5767	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
5768	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
5769.L256_enc_tail:								@ TAIL - encrypt and hash the remaining blocks, dispatching on the number of bytes left
5770
5771	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k, then h8l | h8h
5772        ext     $h8.16b, $h8.16b, $h8.16b, #8				@ swap the 64-bit halves of h8
5773	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr		@ main_end_input_ptr is number of bytes left to process
5774
5775	ldr	$ctr_t0q, [$input_ptr], #16				@ AES block 8k+8 - load plaintext
5776
5777	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h, then h6k | h5k
5778        ext     $h5.16b, $h5.16b, $h5.16b, #8				@ swap the 64-bit halves of h5
5779
5780	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
5781	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h, then h7l | h7h
5782        ext     $h6.16b, $h6.16b, $h6.16b, #8				@ swap the 64-bit halves of h6
5783        ext     $h7.16b, $h7.16b, $h7.16b, #8				@ swap the 64-bit halves of h7
5784	mov	$t1.16b, $rk14						@ copy the last round key; every tail result below is formed with this copy
5785
5786	cmp	$main_end_input_ptr, #112				@ more than 7 blocks (112 bytes) left?
5787	eor3	$res1b, $ctr_t0b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
5788	b.gt	.L256_enc_blocks_more_than_7
5789
5790	movi	$acc_l.8b, #0						@ 7 or fewer blocks - the tail accumulators start from zero
5791	mov	$ctr7b, $ctr6b						@ shift ctr1..ctr6 up one register so the entry label taken finds its block in the register it reads
5792	movi	$acc_h.8b, #0
5793
5794	mov	$ctr6b, $ctr5b
5795	mov	$ctr5b, $ctr4b
5796	mov	$ctr4b, $ctr3b
5797
5798	mov	$ctr3b, $ctr2b
5799	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5800	mov	$ctr2b, $ctr1b
5801
5802	movi	$acc_m.8b, #0
5803	cmp	$main_end_input_ptr, #96				@ more than 6 blocks (96 bytes) left?
5804	b.gt	.L256_enc_blocks_more_than_6
5805
5806	mov	$ctr7b, $ctr6b						@ shift up again; ctr1 still holds the next block to use
5807	mov	$ctr6b, $ctr5b
5808	cmp	$main_end_input_ptr, #80				@ more than 5 blocks (80 bytes) left?
5809
5810	mov	$ctr5b, $ctr4b
5811	mov	$ctr4b, $ctr3b
5812	mov	$ctr3b, $ctr1b
5813
5814	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5815	b.gt	.L256_enc_blocks_more_than_5
5816
5817	mov	$ctr7b, $ctr6b
5818	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5819
5820	mov	$ctr6b, $ctr5b
5821	mov	$ctr5b, $ctr4b
5822
5823	cmp	$main_end_input_ptr, #64				@ more than 4 blocks (64 bytes) left?
5824	mov	$ctr4b, $ctr1b
5825	b.gt	.L256_enc_blocks_more_than_4
5826
5827	cmp	$main_end_input_ptr, #48				@ more than 3 blocks (48 bytes) left?
5828	mov	$ctr7b, $ctr6b
5829	mov	$ctr6b, $ctr5b
5830
5831	mov	$ctr5b, $ctr1b
5832	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5833	b.gt	.L256_enc_blocks_more_than_3
5834
5835	cmp	$main_end_input_ptr, #32				@ more than 2 blocks (32 bytes) left?
5836	mov	$ctr7b, $ctr6b
5837	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
5838
5839	mov	$ctr6b, $ctr1b
5840	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5841	b.gt	.L256_enc_blocks_more_than_2
5842
5843	mov	$ctr7b, $ctr1b
5844
5845	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5846	cmp	$main_end_input_ptr, #16				@ more than 1 block (16 bytes) left?
5847	b.gt	.L256_enc_blocks_more_than_1
5848
5849	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ step the running counter back by one increment
5850	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
5851	b	 .L256_enc_blocks_less_than_1
5852.L256_enc_blocks_more_than_7:						@ blocks left >  7 - store and hash the final-7 block, then form the final-6 result
5853	st1	{ $res1b}, [$output_ptr], #16				@ AES final-7 block  - store result
5854
5855	rev64	$res0b, $res1b						@ GHASH final-7 block
5856
5857	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5858
5859	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-6 block - load plaintext
5860
5861	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high (written directly; this path does not zero the accumulators first)
5862	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
5863	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid (upper lane of h78k pairs with h8)
5864
5865	movi	$t0.8b, #0						@ suppress further partial tag feed in
5866
5867	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid (high half ^ low half)
5868	eor3	$res1b, $ctr_t1b, $ctr1b, $t1.16b			@ AES final-6 block - result
5869
5870	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d				@ GHASH final-7 block - mid
5871	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
5872.L256_enc_blocks_more_than_6:						@ blocks left >  6 - store and hash the final-6 block, then form the final-5 result
5873
5874	st1	{ $res1b}, [$output_ptr], #16				@ AES final-6 block - store result
5875
5876	rev64	$res0b, $res1b						@ GHASH final-6 block
5877
5878	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5879
5880	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low (NOTE(review): the rk2/rk3/rk4-named registers act as GHASH scratch in the tail - presumably free round-key registers, confirm against the register map)
5881	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
5882	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
5883
5884	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-5 block - load plaintext
5885
5886	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
5887
5888	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid (high half ^ low half)
5889
5890	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid (lower lane of h78k pairs with h7)
5891	eor3	$res1b, $ctr_t1b, $ctr2b, $t1.16b			@ AES final-5 block - result
5892
5893	movi	$t0.8b, #0						@ suppress further partial tag feed in
5894
5895	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
5896	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
5897.L256_enc_blocks_more_than_5:						@ blocks left >  5 - store and hash the final-5 block, then form the final-4 result
5898
5899	st1	{ $res1b}, [$output_ptr], #16				@ AES final-5 block - store result
5900
5901	rev64	$res0b, $res1b						@ GHASH final-5 block
5902
5903	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5904
5905	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
5906
5907	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
5908
5909	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
5910	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid (high half ^ low half)
5911
5912	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid (copy into the upper lane for the upper-lane multiply below)
5913
5914	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-4 block - load plaintext
5915	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
5916
5917	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid (upper lane of h56k pairs with h6)
5918	movi	$t0.8b, #0						@ suppress further partial tag feed in
5919	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
5920
5921	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
5922	eor3	$res1b, $ctr_t1b, $ctr3b, $t1.16b			@ AES final-4 block - result
5923.L256_enc_blocks_more_than_4:						@ blocks left >  4 - store and hash the final-4 block, then form the final-3 result
5924
5925	st1	{ $res1b}, [$output_ptr], #16				@ AES final-4 block - store result
5926
5927	rev64	$res0b, $res1b						@ GHASH final-4 block
5928
5929	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-3 block - load plaintext
5930
5931	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5932
5933	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
5934	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
5935
5936	eor3	$res1b, $ctr_t1b, $ctr4b, $t1.16b			@ AES final-3 block - result
5937	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
5938
5939	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid (high half ^ low half)
5940	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
5941
5942	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid (lower lane of h56k pairs with h5)
5943
5944	movi	$t0.8b, #0						@ suppress further partial tag feed in
5945
5946	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
5947	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
5948.L256_enc_blocks_more_than_3:						@ blocks left >  3 - store and hash the final-3 block, then form the final-2 result
5949
5950	st1	{ $res1b}, [$output_ptr], #16				@ AES final-3 block - store result
5951
5952	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
5953	ext     $h4.16b, $h4.16b, $h4.16b, #8				@ swap the 64-bit halves of h4
5954	rev64	$res0b, $res1b						@ GHASH final-3 block
5955
5956	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5957
5958	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
5959	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
5960
5961	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
5962	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid (high half ^ low half)
5963	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k (the tail dispatch loads it only when 3 or fewer blocks are left)
5964
5965	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid (copy into the upper lane for the upper-lane multiply below)
5966	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-2 block - load plaintext
5967
5968	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid (upper lane of h34k pairs with h4)
5969	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
5970
5971	eor3	$res1b, $ctr_t1b, $ctr5b, $t1.16b			@ AES final-2 block - result
5972	movi	$t0.8b, #0						@ suppress further partial tag feed in
5973
5974	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
5975	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
5976.L256_enc_blocks_more_than_2:						@ blocks left >  2 - store and hash the final-2 block, then form the final-1 result
5977
5978	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
5979	ext     $h3.16b, $h3.16b, $h3.16b, #8				@ swap the 64-bit halves of h3
5980
5981	st1	{ $res1b}, [$output_ptr], #16			 	@ AES final-2 block - store result
5982
5983	rev64	$res0b, $res1b						@ GHASH final-2 block
5984	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final-1 block - load plaintext
5985
5986	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
5987
5988	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
5989
5990	movi	$t0.8b, #0						@ suppress further partial tag feed in
5991
5992	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
5993	eor3	$res1b, $ctr_t1b, $ctr6b, $t1.16b			@ AES final-1 block - result
5994
5995	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid (high half ^ low half)
5996
5997	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
5998
5999	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid (lower lane of h34k pairs with h3)
6000	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
6001
6002	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
6003	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
6004.L256_enc_blocks_more_than_1:						@ blocks left >  1 - store and hash the final-1 block, then form the final block result
6005
6006	st1	{ $res1b}, [$output_ptr], #16				@ AES final-1 block - store result
6007
6008	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
6009	ext     $h2.16b, $h2.16b, $h2.16b, #8				@ swap the 64-bit halves of h2
6010	rev64	$res0b, $res1b						@ GHASH final-1 block
6011	ldr	$ctr_t1q, [$input_ptr], #16				@ AES final block - load plaintext
6012
6013	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
6014	movi	$t0.8b, #0						@ suppress further partial tag feed in
6015
6016	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
6017	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
6018
6019	eor3	$res1b, $ctr_t1b, $ctr7b, $t1.16b			@ AES final block - result
6020	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
6021
6022	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
6023	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid (high half ^ low half)
6024
6025	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k (the tail dispatch loads it only when 1 or fewer blocks are left)
6026
6027	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
6028	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid (copy into the upper lane for the upper-lane multiply below)
6029
6030	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid (upper lane of h12k pairs with h2)
6031
6032	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
6033.L256_enc_blocks_less_than_1:						@ blocks left <= 1
6034
6035	and	$bit_length, $bit_length, #127				@ bit_length %= 128
6036
6037	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
6038
6039	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
6040
6041	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
6042	and	$bit_length, $bit_length, #127				@ bit_length %= 128
6043
6044	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
6045	cmp	$bit_length, #64
6046	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
6047
6048	csel	$temp3_x, $temp0_x, xzr, lt
6049	csel	$temp2_x, $temp1_x, $temp0_x, lt
6050
6051	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
6052	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
6053	ext     $h1.16b, $h1.16b, $h1.16b, #8
6054
6055	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
6056	mov	$ctr0.d[1], $temp3_x
6057
6058	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
6059
6060	rev64	$res0b, $res1b						@ GHASH final block
6061
6062	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
6063	bif	$res1b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
6064	str	$rtmp_ctrq, [$counter]					@ store the updated counter
6065
6066	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
6067	st1	{ $res1b}, [$output_ptr]				@ store all 16B
6068
6069	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
6070	pmull2	$rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
6071	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
6072
6073	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
6074	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
6075
6076	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
6077
6078	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
6079
6080	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
6081	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
6082
6083	ext	$t11.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
6084
6085	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
6086	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
6087
6088	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
6089
6090	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
6091	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
6092
6093	eor3	$acc_lb, $acc_lb, $acc_hb, $t11.16b		 	@ MODULO - fold into low
6094		ext	$acc_lb, $acc_lb, $acc_lb, #8
6095	rev64	$acc_lb, $acc_lb
6096	st1	{ $acc_l.16b }, [$current_tag]
6097	mov	x0, $byte_length					@ return sizes
6098
6099        ldp     d10, d11, [sp, #16]
6100	ldp     d12, d13, [sp, #32]
6101	ldp     d14, d15, [sp, #48]
6102	ldp     d8, d9, [sp], #80
6103	ret
6104
6105.L256_enc_ret:
6106	mov w0, #0x0
6107	ret
6108.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6109___
6110
6111{
6112#########################################################################################
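# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const uint8_t * ciphertext,
#                                            uint64_t plaintext_length,
#                                            uint8_t * plaintext,
#                                            uint64_t *Xi,
#                                            unsigned char ivec[16],
#                                            const void *key);
#
# Notes on the arguments as used by the code below:
#  - The kernel derives its byte count as (bit_length >> 3), so the length
#    argument appears to be expressed in bits rather than bytes, despite the
#    name plaintext_length (confirm against the caller).
#  - ivec (x4) is the counter block: CTR block 0 is loaded from it.
#  - key (x5) is the base address from which round keys rk0..rk14 are loaded.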
6119#
6120$code.=<<___;
6121.global unroll8_eor3_aes_gcm_dec_256_kernel
6122.type   unroll8_eor3_aes_gcm_dec_256_kernel,%function
6123.align  4
6124unroll8_eor3_aes_gcm_dec_256_kernel:
6125	AARCH64_VALID_CALL_TARGET
6126	cbz	x1, .L256_dec_ret
6127	stp	d8, d9, [sp, #-80]!
6128	lsr	$byte_length, $bit_length, #3
6129	mov	$counter, x4
6130	mov	$cc, x5
6131	stp	d10, d11, [sp, #16]
6132	stp	d12, d13, [sp, #32]
6133	stp	d14, d15, [sp, #48]
6134	mov	x5, #0xc200000000000000
6135	stp	x5, xzr, [sp, #64]
6136	add	$modulo_constant, sp, #64
6137
6138	ld1	{ $ctr0b}, [$counter]					@ CTR block 0
6139
6140	mov	$constant_temp, #0x100000000			@ set up counter increment
6141	movi	$rctr_inc.16b, #0x0
6142	mov	$rctr_inc.d[1], $constant_temp
6143	mov	$main_end_input_ptr, $byte_length
6144
6145	sub	$main_end_input_ptr, $main_end_input_ptr, #1		@ byte_len - 1
6146
6147	rev32	$rtmp_ctr.16b, $ctr0.16b				@ set up reversed counter
6148
6149	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 0
6150
6151	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 1
6152	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 1
6153
6154	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 2
6155	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 2
6156	ldp	$rk0q, $rk1q, [$cc, #0]				  	@ load rk0, rk1
6157
6158	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 3
6159	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 3
6160
6161	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 4
6162	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 4
6163
6164	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 0
6165
6166	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 5
6167	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 5
6168
6169	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 0
6170	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 0
6171
6172	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 6
6173	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 6
6174
6175	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 7
6176	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 0
6177
6178	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b		        @ AES block 6 - round 0
6179	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 0
6180
6181	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 0
6182	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b		        @ AES block 7 - round 0
6183	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
6184
6185	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b		        @ AES block 6 - round 1
6186	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b		        @ AES block 4 - round 1
6187	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b		        @ AES block 0 - round 1
6188
6189	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 1
6190	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 1
6191	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 1
6192
6193	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 1
6194	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 1
6195
6196	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 2
6197	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 2
6198	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 2
6199
6200	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 2
6201	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 2
6202	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 2
6203
6204	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 2
6205	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 2
6206	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
6207
6208	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 3
6209	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 3
6210
6211	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 3
6212	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 3
6213
6214	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 3
6215	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 3
6216	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 3
6217
6218	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 3
6219
6220	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 4
6221	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 4
6222
6223	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 4
6224	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 4
6225	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 4
6226
6227	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 4
6228	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 4
6229	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 4
6230
6231	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 5
6232	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 5
6233
6234	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
6235	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 5
6236	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 5
6237
6238	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 5
6239
6240	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 5
6241	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 5
6242
6243	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 5
6244
6245	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 6
6246	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 6
6247	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 6
6248
6249	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 6
6250	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 6
6251	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 6
6252
6253	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 6
6254	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 6
6255	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
6256
6257	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 7
6258	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 7
6259
6260	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 7
6261	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 7
6262	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 7
6263
6264	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 7
6265	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 7
6266	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 7
6267
6268	and	$main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
6269	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 8
6270	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 8
6271
6272	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 8
6273	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 8
6274	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 8
6275
6276	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 8
6277	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 8
6278	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 8
6279
6280	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 9
6281
6282	ld1	{ $acc_lb}, [$current_tag]
6283	ext	$acc_lb, $acc_lb, $acc_lb, #8
6284	rev64	$acc_lb, $acc_lb
6285	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
6286	add	$end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
6287	add	$main_end_input_ptr, $main_end_input_ptr, $input_ptr
6288
6289	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 9
6290	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 9
6291
6292	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 9
6293	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 9
6294
6295	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 9
6296
6297	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 9
6298	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 9
6299
6300	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 10
6301	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 10
6302	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 10
6303
6304	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 10
6305	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 10
6306	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 10
6307
6308	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 10
6309	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 10
6310	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
6311
6312	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 11
6313	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
6314
6315	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 11
6316	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 11
6317	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 11
6318
6319	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 11
6320	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 11
6321	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 11
6322
6323	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 11
6324	ldr	$rk14q, [$cc, #224]					@ load rk14
6325
6326	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 1 - round 12
6327	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 4 - round 12
6328	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 5 - round 12
6329
6330	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
6331	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 3 - round 12
6332	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 2 - round 12
6333
6334	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 6 - round 12
6335	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 0 - round 12
6336	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 7 - round 12
6337
6338	aese	$ctr5b, $rk13						@ AES block 5 - round 13
6339	aese	$ctr1b, $rk13						@ AES block 1 - round 13
6340	aese	$ctr2b, $rk13						@ AES block 2 - round 13
6341
6342	aese	$ctr0b, $rk13						@ AES block 0 - round 13
6343	aese	$ctr4b, $rk13						@ AES block 4 - round 13
6344	aese	$ctr6b, $rk13						@ AES block 6 - round 13
6345
6346	aese	$ctr3b, $rk13						@ AES block 3 - round 13
6347	aese	$ctr7b, $rk13						@ AES block 7 - round 13
6348	b.ge	.L256_dec_tail						@ handle tail
6349
6350	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 0, 1 - load ciphertext
6351
6352	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 2, 3 - load ciphertext
6353
6354	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 4, 5 - load ciphertext
6355
6356	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 6, 7 - load ciphertext
6357	cmp	$input_ptr, $main_end_input_ptr				@ check if we have <= 8 blocks
6358
6359	eor3	$ctr1b, $res1b, $ctr1b, $rk14				@ AES block 1 - result
6360	eor3	$ctr0b, $res0b, $ctr0b, $rk14				@ AES block 0 - result
6361	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 0, 1 - store result
6362
6363	rev32	$ctr0.16b, $rtmp_ctr.16b				@ CTR block 8
6364	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8
6365	eor3	$ctr3b, $res3b, $ctr3b, $rk14				@ AES block 3 - result
6366
6367	eor3	$ctr5b, $res5b, $ctr5b, $rk14				@ AES block 5 - result
6368
6369	eor3	$ctr4b, $res4b, $ctr4b, $rk14				@ AES block 4 - result
6370	rev32	$ctr1.16b, $rtmp_ctr.16b				@ CTR block 9
6371	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 9
6372
6373	eor3	$ctr2b, $res2b, $ctr2b, $rk14				@ AES block 2 - result
6374	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 2, 3 - store result
6375
6376	rev32	$ctr2.16b, $rtmp_ctr.16b				@ CTR block 10
6377	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 10
6378
6379	eor3	$ctr6b, $res6b, $ctr6b, $rk14				@ AES block 6 - result
6380
6381	rev32	$ctr3.16b, $rtmp_ctr.16b				@ CTR block 11
6382	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 11
6383	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 4, 5 - store result
6384
6385	eor3	$ctr7b, $res7b, $ctr7b, $rk14				@ AES block 7 - result
6386	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 6, 7 - store result
6387
6388	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 12
6389	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 12
6390	b.ge	.L256_dec_prepretail					@ do prepretail
6391
6392.L256_dec_main_loop:							@ main loop start
6393	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
6394	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
6395	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
6396
6397	rev64	$res1b, $res1b						@ GHASH block 8k+1
6398	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
6399	ext     $h7.16b, $h7.16b, $h7.16b, #8
6400	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
6401	ext     $h8.16b, $h8.16b, $h8.16b, #8
6402
6403	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
6404	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
6405	rev64	$res0b, $res0b						@ GHASH block 8k
6406
6407	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
6408	rev64	$res4b, $res4b						@ GHASH block 8k+4
6409	rev64	$res3b, $res3b						@ GHASH block 8k+3
6410
6411	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
6412	rev64	$res7b, $res7b						@ GHASH block 8k+7
6413
6414	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
6415	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
6416	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
6417
6418	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
6419	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
6420	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
6421
6422	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
6423	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
6424	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
6425
6426	eor	$res0b, $res0b, $acc_lb					@ PRE 1
6427	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
6428	ext     $h5.16b, $h5.16b, $h5.16b, #8
6429	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
6430	ext     $h6.16b, $h6.16b, $h6.16b, #8
6431	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
6432
6433	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
6434	rev64	$res2b, $res2b						@ GHASH block 8k+2
6435	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
6436
6437	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
6438	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
6439	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
6440
6441	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6442	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
6443	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
6444
6445	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
6446	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
6447	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
6448
6449	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
6450	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
6451	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
6452
6453	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
6454	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
6455	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
6456
6457	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
6458	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
6459	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
6460
6461	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
6462	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
6463	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
6464
6465	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
6466	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
6467	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
6468
6469	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
6470	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
6471	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6472
6473	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
6474	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
6475	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
6476
6477	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
6478	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
6479	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
6480
6481	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
6482	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
6483	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
6484
6485	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
6486	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
6487	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
6488
6489	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
6490	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
6491	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
6492	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
6493
6494	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
6495	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
6496	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
6497
6498	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
6499	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
6500	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
6501
6502	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
6503	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
6504	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
6505
6506	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
6507	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6508	rev64	$res5b, $res5b						@ GHASH block 8k+5
6509
6510	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
6511	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
6512	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6513
6514	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
6515	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
6516	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
6517
6518	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6519	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
6520	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
6521
6522	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
6523	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
6524	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
6525
6526	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
6527	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
6528	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
6529
6530	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
6531	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
6532	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
6533
6534	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
6535	ext     $h3.16b, $h3.16b, $h3.16b, #8
6536	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
6537	ext     $h4.16b, $h4.16b, $h4.16b, #8
6538	rev64	$res6b, $res6b						@ GHASH block 8k+6
6539	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
6540
6541	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
6542	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
6543	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
6544
6545	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
6546	ext     $h1.16b, $h1.16b, $h1.16b, #8
6547	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
6548	ext     $h2.16b, $h2.16b, $h2.16b, #8
6549	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
6550	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
6551
6552	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
6553	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
6554	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
6555
6556	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
6557	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
6558	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
6559	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
6560
6561	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
6562	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
6563	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6564
6565	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
6566	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
6567	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
6568
6569	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
6570	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
6571	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
6572
6573	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
6574	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
6575	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
6576
6577	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6578	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
6579	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
6580
6581	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
6582	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
6583	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6584
6585	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
6586	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
6587	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
6588
6589	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
6590	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
6591	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
6592
6593	ldp	$res0q, $res1q, [$input_ptr], #32			@ AES block 8k+8, 8k+9 - load ciphertext
6594	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
6595	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
6596
6597	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
6598	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
6599	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
6600
6601	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
6602	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
6603	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
6604
6605	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
6606	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
6607	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
6608
6609	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
6610	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
6611	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
6612
6613	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
6614	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
6615	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
6616
6617	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
6618	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
6619	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
6620
6621	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
6622	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
6623	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
6624
6625	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
6626	rev32	$h1.16b, $rtmp_ctr.16b					@ CTR block 8k+16
6627	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
6628
6629	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+16
6630	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
6631	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
6632
6633	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
6634	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
6635
6636	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
6637	rev32	$h2.16b, $rtmp_ctr.16b					@ CTR block 8k+17
6638	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
6639
6640	ldp	$res2q, $res3q, [$input_ptr], #32			@ AES block 8k+10, 8k+11 - load ciphertext
6641	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
6642	ext	$t11.16b, $acc_hb, $acc_hb, #8				 @ MODULO - other top alignment
6643
6644	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
6645	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+17
6646	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
6647
6648	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
6649	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
6650	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
6651
6652	rev32	$h3.16b, $rtmp_ctr.16b					@ CTR block 8k+18
6653	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+18
6654	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
6655
6656	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
6657	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 12
6658	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
6659
6660	ldr	$rk14q, [$cc, #224]					@ load rk14
6661	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
6662	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
6663
6664	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
6665	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
6666	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
6667
6668	ldp	$res4q, $res5q, [$input_ptr], #32			@ AES block 8k+12, 8k+13 - load ciphertext
6669	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
6670	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
6671
6672	ldp	$res6q, $res7q, [$input_ptr], #32			@ AES block 8k+14, 8k+15 - load ciphertext
6673	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
6674	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
6675
6676	rev32	$h4.16b, $rtmp_ctr.16b					@ CTR block 8k+19
6677	eor3	$ctr2b, $res2b, $ctr2b, $rk14				@ AES block 8k+10 - result
6678	eor3	$ctr1b, $res1b, $ctr1b, $rk14				@ AES block 8k+9 - result
6679
6680	ext	$t11.16b, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
6681	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
6682
6683	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+19
6684	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
6685	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
6686
6687	eor3	$ctr5b, $res5b, $ctr5b, $rk14				@ AES block 8k+13 - result
6688	eor3	$ctr0b, $res0b, $ctr0b, $rk14				@ AES block 8k+8 - result
6689	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
6690
6691	stp	$ctr0q, $ctr1q, [$output_ptr], #32			@ AES block 8k+8, 8k+9 - store result
6692	mov	$ctr0.16b, $h1.16b					@ CTR block 8k+16
6693	eor3	$ctr4b, $res4b, $ctr4b, $rk14				@ AES block 8k+12 - result
6694
6695	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
6696	eor3	$ctr3b, $res3b, $ctr3b, $rk14				@ AES block 8k+11 - result
6697	stp	$ctr2q, $ctr3q, [$output_ptr], #32			@ AES block 8k+10, 8k+11 - store result
6698
6699	mov	$ctr3.16b, $h4.16b					@ CTR block 8k+19
6700	mov	$ctr2.16b, $h3.16b					@ CTR block 8k+18
6701	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
6702
6703	mov	$ctr1.16b, $h2.16b					@ CTR block 8k+17
6704	stp	$ctr4q, $ctr5q, [$output_ptr], #32			@ AES block 8k+12, 8k+13 - store result
6705	eor3	$ctr7b, $res7b, $ctr7b, $rk14				@ AES block 8k+15 - result
6706
6707	eor3	$ctr6b, $res6b, $ctr6b, $rk14				@ AES block 8k+14 - result
6708	rev32	$ctr4.16b, $rtmp_ctr.16b				@ CTR block 8k+20
6709	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+20
6710
6711	cmp	$input_ptr, $main_end_input_ptr				@ LOOP CONTROL
6712	stp	$ctr6q, $ctr7q, [$output_ptr], #32			@ AES block 8k+14, 8k+15 - store result
6713	b.lt	.L256_dec_main_loop
6714
6715.L256_dec_prepretail:							@ PREPRETAIL
6716	ldp	$rk0q, $rk1q, [$cc, #0]					@ load rk0, rk1
6717	rev32	$ctr5.16b, $rtmp_ctr.16b				@ CTR block 8k+13
6718	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+13
6719
6720	rev64	$res4b, $res4b						@ GHASH block 8k+4
6721	ldr	$h56kq, [$current_tag, #144]				@ load h6k | h5k
6722	ldr	$h78kq, [$current_tag, #192]				@ load h8k | h7k
6723
6724	rev32	$ctr6.16b, $rtmp_ctr.16b				@ CTR block 8k+14
6725	rev64	$res0b, $res0b						@ GHASH block 8k
6726	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+14
6727
6728	ext	$acc_lb, $acc_lb, $acc_lb, #8				@ PRE 0
6729	ldr	$h7q, [$current_tag, #176]				@ load h7l | h7h
6730	ext     $h7.16b, $h7.16b, $h7.16b, #8
6731	ldr	$h8q, [$current_tag, #208]				@ load h8l | h8h
6732	ext     $h8.16b, $h8.16b, $h8.16b, #8
6733	rev64	$res1b, $res1b						@ GHASH block 8k+1
6734
6735	rev32	$ctr7.16b, $rtmp_ctr.16b				@ CTR block 8k+15
6736	rev64	$res2b, $res2b						@ GHASH block 8k+2
6737	ldr	$h5q, [$current_tag, #128]				@ load h5l | h5h
6738	ext     $h5.16b, $h5.16b, $h5.16b, #8
6739	ldr	$h6q, [$current_tag, #160]				@ load h6l | h6h
6740	ext     $h6.16b, $h6.16b, $h6.16b, #8
6741
6742	aese	$ctr0b, $rk0  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 0
6743	aese	$ctr1b, $rk0  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 0
6744	aese	$ctr4b, $rk0  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 0
6745
6746	aese	$ctr3b, $rk0  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 0
6747	aese	$ctr5b, $rk0  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 0
6748	aese	$ctr6b, $rk0  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 0
6749
6750	aese	$ctr4b, $rk1  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 1
6751	aese	$ctr7b, $rk0  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 0
6752	aese	$ctr2b, $rk0  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 0
6753
6754	ldp	$rk2q, $rk3q, [$cc, #32]				@ load rk2, rk3
6755	aese	$ctr0b, $rk1  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 1
6756	eor	$res0b, $res0b, $acc_lb					@ PRE 1
6757
6758	aese	$ctr7b, $rk1  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 1
6759	aese	$ctr6b, $rk1  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 1
6760	aese	$ctr2b, $rk1  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 1
6761
6762	aese	$ctr3b, $rk1  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 1
6763	aese	$ctr1b, $rk1  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 1
6764	aese	$ctr5b, $rk1  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 1
6765
6766	pmull2  $t0.1q, $res1.2d, $h7.2d				@ GHASH block 8k+1 - high
6767	trn1	$acc_m.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6768	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH block 8k - low
6769
6770	rev64	$res3b, $res3b						@ GHASH block 8k+3
6771	pmull	$h7.1q, $res1.1d, $h7.1d				@ GHASH block 8k+1 - low
6772
6773	aese	$ctr5b, $rk2  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 2
6774	aese	$ctr7b, $rk2  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 2
6775	aese	$ctr1b, $rk2  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 2
6776
6777	aese	$ctr3b, $rk2  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 2
6778	aese	$ctr6b, $rk2  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 2
6779	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH block 8k - high
6780
6781	aese	$ctr0b, $rk2  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 2
6782	aese	$ctr7b, $rk3  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 3
6783
6784	aese	$ctr5b, $rk3  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 3
6785	rev64	$res6b, $res6b						@ GHASH block 8k+6
6786
6787	aese	$ctr0b, $rk3  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 3
6788	aese	$ctr2b, $rk2  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 2
6789	aese	$ctr6b, $rk3  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 3
6790
6791	pmull2  $t1.1q, $res2.2d, $h6.2d				@ GHASH block 8k+2 - high
6792	trn2	$res0.2d, $res1.2d, $res0.2d				@ GHASH block 8k, 8k+1 - mid
6793	aese	$ctr4b, $rk2  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 2
6794
6795	ldp	$rk4q, $rk5q, [$cc, #64]				@ load rk4, rk5
6796	aese	$ctr1b, $rk3  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 3
6797	pmull2  $t2.1q, $res3.2d, $h5.2d				@ GHASH block 8k+3 - high
6798
6799	aese	$ctr2b, $rk3  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 3
6800	eor	$acc_hb, $acc_hb, $t0.16b				@ GHASH block 8k+1 - high
6801	eor	$res0.16b, $res0.16b, $acc_m.16b			@ GHASH block 8k, 8k+1 - mid
6802
6803	aese	$ctr4b, $rk3  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 3
6804	pmull	$h6.1q, $res2.1d, $h6.1d				@ GHASH block 8k+2 - low
6805	aese	$ctr3b, $rk3  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 3
6806
6807	eor3	$acc_hb, $acc_hb, $t1.16b, $t2.16b			@ GHASH block 8k+2, 8k+3 - high
6808	trn1	$t3.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6809	trn2	$res2.2d, $res3.2d, $res2.2d				@ GHASH block 8k+2, 8k+3 - mid
6810
6811	pmull2  $acc_m.1q, $res0.2d, $h78k.2d				@ GHASH block 8k	- mid
6812	pmull	$h5.1q, $res3.1d, $h5.1d				@ GHASH block 8k+3 - low
6813	eor	$acc_lb, $acc_lb, $h7.16b				@ GHASH block 8k+1 - low
6814
6815	pmull	$h78k.1q, $res0.1d, $h78k.1d				@ GHASH block 8k+1 - mid
6816	aese	$ctr5b, $rk4  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 4
6817	aese	$ctr0b, $rk4  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 4
6818
6819	eor3	$acc_lb, $acc_lb, $h6.16b, $h5.16b			@ GHASH block 8k+2, 8k+3 - low
6820	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
6821	ext     $h1.16b, $h1.16b, $h1.16b, #8
6822	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
6823	ext     $h2.16b, $h2.16b, $h2.16b, #8
6824	aese	$ctr7b, $rk4  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 4
6825
6826	aese	$ctr2b, $rk4  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 4
6827	aese	$ctr6b, $rk4  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 4
6828	eor	$acc_mb, $acc_mb, $h78k.16b				@ GHASH block 8k+1 - mid
6829
6830	eor	$res2.16b, $res2.16b, $t3.16b				@ GHASH block 8k+2, 8k+3 - mid
6831	aese	$ctr7b, $rk5  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 5
6832	aese	$ctr1b, $rk4  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 4
6833
6834	aese	$ctr2b, $rk5  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 5
6835	aese	$ctr3b, $rk4  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 4
6836	aese	$ctr4b, $rk4  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 4
6837
6838	aese	$ctr1b, $rk5  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 5
6839	pmull2  $t3.1q, $res2.2d, $h56k.2d				@ GHASH block 8k+2 - mid
6840	aese	$ctr6b, $rk5  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 5
6841
6842	aese	$ctr4b, $rk5  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 5
6843	aese	$ctr3b, $rk5  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 5
6844	pmull	$h56k.1q, $res2.1d, $h56k.1d				@ GHASH block 8k+3 - mid
6845
6846	aese	$ctr0b, $rk5  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 5
6847	aese	$ctr5b, $rk5  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 5
6848	ldp	$rk6q, $rk7q, [$cc, #96]				@ load rk6, rk7
6849
6850	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
6851	ext     $h3.16b, $h3.16b, $h3.16b, #8
6852	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
6853	ext     $h4.16b, $h4.16b, $h4.16b, #8
6854	rev64	$res7b, $res7b						@ GHASH block 8k+7
6855	rev64	$res5b, $res5b						@ GHASH block 8k+5
6856
6857	eor3	$acc_mb, $acc_mb, $h56k.16b, $t3.16b			@ GHASH block 8k+2, 8k+3 - mid
6858
6859	trn1	$t6.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6860
6861	aese	$ctr0b, $rk6  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 6
6862	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
6863	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
6864	aese	$ctr6b, $rk6  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 6
6865
6866	aese	$ctr5b, $rk6  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 6
6867	aese	$ctr7b, $rk6  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 6
6868
6869	pmull2  $t4.1q, $res4.2d, $h4.2d				@ GHASH block 8k+4 - high
6870	pmull2  $t5.1q, $res5.2d, $h3.2d				@ GHASH block 8k+5 - high
6871	pmull	$h4.1q, $res4.1d, $h4.1d				@ GHASH block 8k+4 - low
6872
6873	trn2	$res4.2d, $res5.2d, $res4.2d				@ GHASH block 8k+4, 8k+5 - mid
6874	pmull	$h3.1q, $res5.1d, $h3.1d				@ GHASH block 8k+5 - low
6875	trn1	$t9.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6876
6877	aese	$ctr7b, $rk7  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 7
6878	pmull2  $t7.1q, $res6.2d, $h2.2d				@ GHASH block 8k+6 - high
6879	aese	$ctr1b, $rk6  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 6
6880
6881	aese	$ctr2b, $rk6  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 6
6882	aese	$ctr3b, $rk6  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 6
6883	aese	$ctr4b, $rk6  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 6
6884
6885	ldp	$rk8q, $rk9q, [$cc, #128]				@ load rk8, rk9
6886	pmull	$h2.1q, $res6.1d, $h2.1d				@ GHASH block 8k+6 - low
6887	aese	$ctr5b, $rk7  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 7
6888
6889	aese	$ctr1b, $rk7  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 7
6890	aese	$ctr4b, $rk7  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 7
6891
6892	aese	$ctr6b, $rk7  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 7
6893	aese	$ctr2b, $rk7  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 7
6894	eor3	$acc_hb, $acc_hb, $t4.16b, $t5.16b			@ GHASH block 8k+4, 8k+5 - high
6895
6896	aese	$ctr0b, $rk7  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 7
6897	trn2	$res6.2d, $res7.2d, $res6.2d				@ GHASH block 8k+6, 8k+7 - mid
6898	aese	$ctr3b, $rk7  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 7
6899
6900	aese	$ctr0b, $rk8  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 8
6901	aese	$ctr7b, $rk8  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 8
6902	aese	$ctr4b, $rk8  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 8
6903
6904	aese	$ctr1b, $rk8  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 8
6905	aese	$ctr5b, $rk8  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 8
6906	aese	$ctr6b, $rk8  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 8
6907
6908	aese	$ctr3b, $rk8  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 8
6909	aese	$ctr4b, $rk9  \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 9
6910	eor	$res4.16b, $res4.16b, $t6.16b				@ GHASH block 8k+4, 8k+5 - mid
6911
6912	aese	$ctr0b, $rk9  \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 9
6913	aese	$ctr1b, $rk9  \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 9
6914	eor	$res6.16b, $res6.16b, $t9.16b				@ GHASH block 8k+6, 8k+7 - mid
6915
6916	aese	$ctr6b, $rk9  \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 9
6917	aese	$ctr7b, $rk9  \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 9
6918	pmull2  $t6.1q, $res4.2d, $h34k.2d				@ GHASH block 8k+4 - mid
6919
6920	aese	$ctr2b, $rk8  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 8
6921	pmull	$h34k.1q, $res4.1d, $h34k.1d				@ GHASH block 8k+5 - mid
6922	pmull2  $t8.1q, $res7.2d, $h1.2d				@ GHASH block 8k+7 - high
6923
6924	pmull2  $t9.1q, $res6.2d, $h12k.2d				@ GHASH block 8k+6 - mid
6925	pmull	$h12k.1q, $res6.1d, $h12k.1d				@ GHASH block 8k+7 - mid
6926	pmull	$h1.1q, $res7.1d, $h1.1d				@ GHASH block 8k+7 - low
6927
6928	ldp	$rk10q, $rk11q, [$cc, #160]				@ load rk10, rk11
6929	eor3	$acc_lb, $acc_lb, $h4.16b, $h3.16b			@ GHASH block 8k+4, 8k+5 - low
6930	eor3	$acc_mb, $acc_mb, $h34k.16b, $t6.16b			@ GHASH block 8k+4, 8k+5 - mid
6931
6932	aese	$ctr2b, $rk9  \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 9
6933	aese	$ctr3b, $rk9  \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 9
6934	aese	$ctr5b, $rk9  \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 9
6935
6936	eor3	$acc_hb, $acc_hb, $t7.16b, $t8.16b			@ GHASH block 8k+6, 8k+7 - high
6937	eor3	$acc_lb, $acc_lb, $h2.16b, $h1.16b			@ GHASH block 8k+6, 8k+7 - low
6938	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
6939
6940	eor3	$acc_mb, $acc_mb, $h12k.16b, $t9.16b			@ GHASH block 8k+6, 8k+7 - mid
6941
6942	aese	$ctr4b, $rk10 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 10
6943	aese	$ctr6b, $rk10 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 10
6944	aese	$ctr5b, $rk10 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 10
6945
6946	aese	$ctr0b, $rk10 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 10
6947	aese	$ctr2b, $rk10 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 10
6948	aese	$ctr3b, $rk10 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 10
6949
6950	eor3	$acc_mb, $acc_mb, $acc_hb, $acc_lb		 	@ MODULO - karatsuba tidy up
6951
6952	aese	$ctr7b, $rk10 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 10
6953	aese	$ctr1b, $rk10 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 10
6954	ldp	$rk12q, $rk13q, [$cc, #192]				@ load rk12, rk13
6955
6956	ext	$t11.16b, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
6957
6958	aese	$ctr2b, $rk11 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 11
6959	aese	$ctr1b, $rk11 \n  aesmc	$ctr1b, $ctr1b			@ AES block 8k+9 - round 11
6960	aese	$ctr0b, $rk11 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 11
6961
6962	pmull	$t12.1q, $acc_h.1d, $mod_constant.1d			@ MODULO - top 64b align with mid
6963	aese	$ctr3b, $rk11 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 11
6964
6965	aese	$ctr7b, $rk11 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 11
6966	aese	$ctr6b, $rk11 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 11
6967	aese	$ctr4b, $rk11 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 11
6968
6969	aese	$ctr5b, $rk11 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 11
6970	aese	$ctr3b, $rk12 \n  aesmc	$ctr3b, $ctr3b			@ AES block 8k+11 - round 12
6971
6972	eor3	$acc_mb, $acc_mb, $t12.16b, $t11.16b			@ MODULO - fold into mid
6973
6974	aese	$ctr3b, $rk13						@ AES block 8k+11 - round 13
6975	aese	$ctr2b, $rk12 \n  aesmc	$ctr2b, $ctr2b			@ AES block 8k+10 - round 12
6976	aese	$ctr6b, $rk12 \n  aesmc	$ctr6b, $ctr6b			@ AES block 8k+14 - round 12
6977
6978	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
6979	aese	$ctr4b, $rk12 \n  aesmc	$ctr4b, $ctr4b			@ AES block 8k+12 - round 12
6980	aese	$ctr7b, $rk12 \n  aesmc	$ctr7b, $ctr7b			@ AES block 8k+15 - round 12
6981
6982	aese	$ctr0b, $rk12 \n  aesmc	$ctr0b, $ctr0b			@ AES block 8k+8 - round 12
6983	ldr	$rk14q, [$cc, #224]					@ load rk14
6984	aese	$ctr1b, $rk12 \n  aesmc	$ctr1b, $ctr1b	        	@ AES block 8k+9 - round 12
6985
6986	aese	$ctr4b, $rk13						@ AES block 8k+12 - round 13
6987	ext	$t11.16b, $acc_mb, $acc_mb, #8			 	@ MODULO - other mid alignment
6988	aese	$ctr5b, $rk12 \n  aesmc	$ctr5b, $ctr5b			@ AES block 8k+13 - round 12
6989
6990	aese	$ctr6b, $rk13						@ AES block 8k+14 - round 13
6991	aese	$ctr2b, $rk13						@ AES block 8k+10 - round 13
6992	aese	$ctr1b, $rk13						@ AES block 8k+9 - round 13
6993
6994	aese	$ctr5b, $rk13						@ AES block 8k+13 - round 13
6995	eor3	$acc_lb, $acc_lb, $t11.16b, $acc_hb		 	@ MODULO - fold into low
6996	add	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s		@ CTR block 8k+15
6997
6998	aese	$ctr7b, $rk13						@ AES block 8k+15 - round 13
6999	aese	$ctr0b, $rk13						@ AES block 8k+8 - round 13
7000.L256_dec_tail:								@ TAIL
7001
7002	ext	$t0.16b, $acc_lb, $acc_lb, #8				@ prepare final partial tag
7003	sub	$main_end_input_ptr, $end_input_ptr, $input_ptr		@ main_end_input_ptr is number of bytes left to process
7004	cmp	$main_end_input_ptr, #112
7005
7006	ldr	$res1q, [$input_ptr], #16				@ AES block 8k+8 - load ciphertext
7007
7008	ldp	$h78kq, $h8q, [$current_tag, #192]			@ load h8k | h7k
7009        ext     $h8.16b, $h8.16b, $h8.16b, #8
7010	mov	$t1.16b, $rk14
7011
7012	ldp	$h5q, $h56kq, [$current_tag, #128]			@ load h5l | h5h
7013        ext     $h5.16b, $h5.16b, $h5.16b, #8
7014
7015	eor3	$res4b, $res1b, $ctr0b, $t1.16b				@ AES block 8k+8 - result
7016	ldp	$h6q, $h7q, [$current_tag, #160]			@ load h6l | h6h
7017        ext     $h6.16b, $h6.16b, $h6.16b, #8
7018        ext     $h7.16b, $h7.16b, $h7.16b, #8
7019	b.gt	.L256_dec_blocks_more_than_7
7020
7021	mov	$ctr7b, $ctr6b
7022	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7023	mov	$ctr6b, $ctr5b
7024
7025	mov	$ctr5b, $ctr4b
7026	mov	$ctr4b, $ctr3b
7027	movi	$acc_l.8b, #0
7028
7029	movi	$acc_h.8b, #0
7030	movi	$acc_m.8b, #0
7031	mov	$ctr3b, $ctr2b
7032
7033	cmp	$main_end_input_ptr, #96
7034	mov	$ctr2b, $ctr1b
7035	b.gt	.L256_dec_blocks_more_than_6
7036
7037	mov	$ctr7b, $ctr6b
7038	mov	$ctr6b, $ctr5b
7039
7040	mov	$ctr5b, $ctr4b
7041	cmp	$main_end_input_ptr, #80
7042	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7043
7044	mov	$ctr4b, $ctr3b
7045	mov	$ctr3b, $ctr1b
7046	b.gt	.L256_dec_blocks_more_than_5
7047
7048	cmp	$main_end_input_ptr, #64
7049	mov	$ctr7b, $ctr6b
7050	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7051
7052	mov	$ctr6b, $ctr5b
7053
7054	mov	$ctr5b, $ctr4b
7055	mov	$ctr4b, $ctr1b
7056	b.gt	.L256_dec_blocks_more_than_4
7057
7058	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7059	mov	$ctr7b, $ctr6b
7060	cmp	$main_end_input_ptr, #48
7061
7062	mov	$ctr6b, $ctr5b
7063	mov	$ctr5b, $ctr1b
7064	b.gt	.L256_dec_blocks_more_than_3
7065
7066	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
7067	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7068	mov	$ctr7b, $ctr6b
7069
7070	cmp	$main_end_input_ptr, #32
7071	mov	$ctr6b, $ctr1b
7072	b.gt	.L256_dec_blocks_more_than_2
7073
7074	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7075
7076	mov	$ctr7b, $ctr1b
7077	cmp	$main_end_input_ptr, #16
7078	b.gt	.L256_dec_blocks_more_than_1
7079
7080	sub	$rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7081	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
7082	b	 .L256_dec_blocks_less_than_1
7083.L256_dec_blocks_more_than_7:						@ blocks left >  7
7084	rev64	$res0b, $res1b						@ GHASH final-7 block
7085	ldr	$res1q, [$input_ptr], #16				@ AES final-6 block - load ciphertext
7086	st1	{ $res4b}, [$output_ptr], #16				@ AES final-7 block  - store result
7087
7088	ins	$acc_m.d[0], $h78k.d[1]					@ GHASH final-7 block - mid
7089
7090	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7091
7092	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-7 block - mid
7093	eor3	$res4b, $res1b, $ctr1b, $t1.16b				@ AES final-6 block - result
7094
7095	pmull2  $acc_h.1q, $res0.2d, $h8.2d				@ GHASH final-7 block - high
7096
7097	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-7 block - mid
7098	movi	$t0.8b, #0						@ suppress further partial tag feed in
7099
7100	pmull	$acc_l.1q, $res0.1d, $h8.1d				@ GHASH final-7 block - low
7101	pmull	$acc_m.1q, $rk4v.1d, $acc_m.1d			 	@ GHASH final-7 block - mid
7102.L256_dec_blocks_more_than_6:						@ blocks left >  6
7103
7104	rev64	$res0b, $res1b						@ GHASH final-6 block
7105
7106	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7107	ldr	$res1q, [$input_ptr], #16				@ AES final-5 block - load ciphertext
7108	movi	$t0.8b, #0						@ suppress further partial tag feed in
7109
7110	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-6 block - mid
7111	st1	{ $res4b}, [$output_ptr], #16				@ AES final-6 block - store result
7112	pmull2  $rk2q1, $res0.2d, $h7.2d				@ GHASH final-6 block - high
7113
7114	pmull	$rk3q1, $res0.1d, $h7.1d				@ GHASH final-6 block - low
7115
7116	eor3	$res4b, $res1b, $ctr2b, $t1.16b				@ AES final-5 block - result
7117	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-6 block - low
7118	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-6 block - mid
7119
7120	pmull	$rk4v.1q, $rk4v.1d, $h78k.1d				@ GHASH final-6 block - mid
7121
7122	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-6 block - mid
7123	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-6 block - high
7124.L256_dec_blocks_more_than_5:						@ blocks left >  5
7125
7126	rev64	$res0b, $res1b						@ GHASH final-5 block
7127
7128	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7129
7130	pmull2  $rk2q1, $res0.2d, $h6.2d				@ GHASH final-5 block - high
7131	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-5 block - mid
7132
7133	ldr	$res1q, [$input_ptr], #16				@ AES final-4 block - load ciphertext
7134
7135	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-5 block - mid
7136	st1	{ $res4b}, [$output_ptr], #16			  	@ AES final-5 block - store result
7137
7138	pmull	$rk3q1, $res0.1d, $h6.1d				@ GHASH final-5 block - low
7139	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-5 block - mid
7140
7141	pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d				@ GHASH final-5 block - mid
7142
7143	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-5 block - high
7144	eor3	$res4b, $res1b, $ctr3b, $t1.16b				@ AES final-4 block - result
7145	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-5 block - low
7146
7147	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-5 block - mid
7148	movi	$t0.8b, #0						@ suppress further partial tag feed in
7149.L256_dec_blocks_more_than_4:						@ blocks left >  4
7150
7151	rev64	$res0b, $res1b						@ GHASH final-4 block
7152
7153	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7154
7155	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-4 block - mid
7156	ldr	$res1q, [$input_ptr], #16				@ AES final-3 block - load ciphertext
7157
7158	movi	$t0.8b, #0						@ suppress further partial tag feed in
7159
7160	pmull	$rk3q1, $res0.1d, $h5.1d				@ GHASH final-4 block - low
7161	pmull2  $rk2q1, $res0.2d, $h5.2d				@ GHASH final-4 block - high
7162
7163	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-4 block - mid
7164
7165	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-4 block - high
7166
7167	pmull	$rk4v.1q, $rk4v.1d, $h56k.1d				@ GHASH final-4 block - mid
7168
7169	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-4 block - low
7170	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-4 block - store result
7171
7172	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-4 block - mid
7173	eor3	$res4b, $res1b, $ctr4b, $t1.16b				@ AES final-3 block - result
7174.L256_dec_blocks_more_than_3:						@ blocks left >  3
7175
7176	ldr	$h4q, [$current_tag, #112]				@ load h4l | h4h
7177	ext     $h4.16b, $h4.16b, $h4.16b, #8
7178	rev64	$res0b, $res1b						@ GHASH final-3 block
7179
7180	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7181	ldr	$res1q, [$input_ptr], #16				@ AES final-2 block - load ciphertext
7182	ldr	$h34kq, [$current_tag, #96]				@ load h4k | h3k
7183
7184	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-3 block - mid
7185	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-3 block - store result
7186
7187	eor3	$res4b, $res1b, $ctr5b, $t1.16b				@ AES final-2 block - result
7188
7189	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-3 block - mid
7190
7191	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-3 block - mid
7192	pmull	$rk3q1, $res0.1d, $h4.1d				@ GHASH final-3 block - low
7193	pmull2  $rk2q1, $res0.2d, $h4.2d				@ GHASH final-3 block - high
7194
7195	movi	$t0.8b, #0						@ suppress further partial tag feed in
7196	pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d				@ GHASH final-3 block - mid
7197	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-3 block - low
7198
7199	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-3 block - high
7200
7201	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-3 block - mid
7202.L256_dec_blocks_more_than_2:						@ blocks left >  2
7203
7204	rev64	$res0b, $res1b						@ GHASH final-2 block
7205
7206	ldr	$h3q, [$current_tag, #80]				@ load h3l | h3h
7207	ext     $h3.16b, $h3.16b, $h3.16b, #8
7208	ldr	$res1q, [$input_ptr], #16				@ AES final-1 block - load ciphertext
7209
7210	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7211
7212	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-2 block - mid
7213
7214	pmull	$rk3q1, $res0.1d, $h3.1d				@ GHASH final-2 block - low
7215	st1	{ $res4b}, [$output_ptr], #16			  	@ AES final-2 block - store result
7216	eor3	$res4b, $res1b, $ctr6b, $t1.16b				@ AES final-1 block - result
7217
7218	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-2 block - mid
7219	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-2 block - low
7220	movi	$t0.8b, #0						@ suppress further partial tag feed in
7221
7222	pmull	$rk4v.1q, $rk4v.1d, $h34k.1d				@ GHASH final-2 block - mid
7223	pmull2  $rk2q1, $res0.2d, $h3.2d				@ GHASH final-2 block - high
7224
7225	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-2 block - mid
7226	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-2 block - high
7227.L256_dec_blocks_more_than_1:						@ blocks left >  1
7228
7229	rev64	$res0b, $res1b						@ GHASH final-1 block
7230
7231	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7232
7233	ins	$rk4v.d[0], $res0.d[1]					@ GHASH final-1 block - mid
7234	ldr	$h2q, [$current_tag, #64]				@ load h2l | h2h
7235	ext     $h2.16b, $h2.16b, $h2.16b, #8
7236
7237	eor	$rk4v.8b, $rk4v.8b, $res0.8b				@ GHASH final-1 block - mid
7238	ldr	$res1q, [$input_ptr], #16				@ AES final block - load ciphertext
7239	st1	{ $res4b}, [$output_ptr], #16			 	@ AES final-1 block - store result
7240
7241	ldr	$h12kq, [$current_tag, #48]				@ load h2k | h1k
7242	pmull	$rk3q1, $res0.1d, $h2.1d				@ GHASH final-1 block - low
7243
7244	ins	$rk4v.d[1], $rk4v.d[0]					@ GHASH final-1 block - mid
7245
7246	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final-1 block - low
7247
7248	eor3	$res4b, $res1b, $ctr7b, $t1.16b				@ AES final block - result
7249	pmull2  $rk2q1, $res0.2d, $h2.2d				@ GHASH final-1 block - high
7250
7251	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d				@ GHASH final-1 block - mid
7252
7253	movi	$t0.8b, #0						@ suppress further partial tag feed in
7254	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final-1 block - high
7255
7256	eor	$acc_mb, $acc_mb, $rk4v.16b				@ GHASH final-1 block - mid
7257.L256_dec_blocks_less_than_1:						@ blocks left <= 1
7258
7259	ld1	{ $rk0}, [$output_ptr]					@ load existing bytes where the possibly partial last block is to be stored
7260	mvn	$temp0_x, xzr						@ temp0_x = 0xffffffffffffffff
7261	and	$bit_length, $bit_length, #127				@ bit_length %= 128
7262
7263	sub	$bit_length, $bit_length, #128				@ bit_length -= 128
7264	rev32	$rtmp_ctr.16b, $rtmp_ctr.16b
7265	str	$rtmp_ctrq, [$counter]					@ store the updated counter
7266
7267	neg	$bit_length, $bit_length				@ bit_length = 128 - #bits in input (in range [1,128])
7268
7269	and	$bit_length, $bit_length, #127			 	@ bit_length %= 128
7270
7271	lsr	$temp0_x, $temp0_x, $bit_length				@ temp0_x is mask for top 64b of last block
7272	cmp	$bit_length, #64
7273	mvn	$temp1_x, xzr						@ temp1_x = 0xffffffffffffffff
7274
7275	csel	$temp3_x, $temp0_x, xzr, lt
7276	csel	$temp2_x, $temp1_x, $temp0_x, lt
7277
7278	mov	$ctr0.d[0], $temp2_x					@ ctr0b is mask for last block
7279	mov	$ctr0.d[1], $temp3_x
7280
7281	and	$res1b, $res1b, $ctr0b					@ possibly partial last block has zeroes in highest bits
7282	ldr	$h1q, [$current_tag, #32]				@ load h1l | h1h
7283	ext     $h1.16b, $h1.16b, $h1.16b, #8
7284	bif	$res4b, $rk0, $ctr0b					@ insert existing bytes in top end of result before storing
7285
7286	rev64	$res0b, $res1b						@ GHASH final block
7287
7288	eor	$res0b, $res0b, $t0.16b					@ feed in partial tag
7289
7290	ins	$t0.d[0], $res0.d[1]					@ GHASH final block - mid
7291	pmull2  $rk2q1, $res0.2d, $h1.2d				@ GHASH final block - high
7292
7293	eor	$t0.8b, $t0.8b, $res0.8b				@ GHASH final block - mid
7294
7295	pmull	$rk3q1, $res0.1d, $h1.1d				@ GHASH final block - low
7296	eor	$acc_hb, $acc_hb, $rk2					@ GHASH final block - high
7297
7298	pmull	$t0.1q, $t0.1d, $h12k.1d				@ GHASH final block - mid
7299
7300	eor	$acc_mb, $acc_mb, $t0.16b				@ GHASH final block - mid
7301	ldr	$mod_constantd, [$modulo_constant]			@ MODULO - load modulo constant
7302	eor	$acc_lb, $acc_lb, $rk3					@ GHASH final block - low
7303
7304	pmull	$t11.1q, $acc_h.1d, $mod_constant.1d		 	@ MODULO - top 64b align with mid
7305	eor	$t10.16b, $acc_hb, $acc_lb				@ MODULO - karatsuba tidy up
7306
7307	ext	$acc_hb, $acc_hb, $acc_hb, #8				@ MODULO - other top alignment
7308	st1	{ $res4b}, [$output_ptr]				@ store all 16B
7309
7310	eor	$acc_mb, $acc_mb, $t10.16b				@ MODULO - karatsuba tidy up
7311
7312	eor	$t11.16b, $acc_hb, $t11.16b				@ MODULO - fold into mid
7313	eor	$acc_mb, $acc_mb, $t11.16b				@ MODULO - fold into mid
7314
7315	pmull	$acc_h.1q, $acc_m.1d, $mod_constant.1d			@ MODULO - mid 64b align with low
7316
7317	ext	$acc_mb, $acc_mb, $acc_mb, #8				@ MODULO - other mid alignment
7318	eor	$acc_lb, $acc_lb, $acc_hb				@ MODULO - fold into low
7319
7320	eor	$acc_lb, $acc_lb, $acc_mb				@ MODULO - fold into low
7321	ext	$acc_lb, $acc_lb, $acc_lb, #8
7322	rev64	$acc_lb, $acc_lb
7323	st1	{ $acc_l.16b }, [$current_tag]
7324	mov	x0, $byte_length
7325
7326        ldp     d10, d11, [sp, #16]
7327	ldp     d12, d13, [sp, #32]
7328	ldp     d14, d15, [sp, #48]
7329	ldp     d8, d9, [sp], #80
7330	ret
7331
7332.L256_dec_ret:
7333	mov w0, #0x0
7334	ret
7335.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
7336___
7337}
7338}
7339
# Append the trailer of the generated assembly: an embedded identification
# string, an alignment directive, and an "#endif".  The "\@" keeps Perl from
# interpolating "@arm" as an array inside the heredoc.
# NOTE(review): the "#endif" presumably closes a preprocessor conditional
# emitted earlier in $code (not visible from this part of the file) - confirm.
$code.=<<___;
.asciz  "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
.align  2
#endif
___
7345
7346{
7347    my  %opcode = (
7348    "rax1"    => 0xce608c00,    "eor3"    => 0xce000000,
7349    "bcax"    => 0xce200000,    "xar"    => 0xce800000    );
7350
7351    sub unsha3 {
7352         my ($mnemonic,$arg)=@_;
7353
7354         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
7355         &&
7356         sprintf ".inst\t0x%08x\t//%s %s",
7357            $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
7358            $mnemonic,$arg;
7359    }
7360    sub unvmov {
7361        my $arg=shift;
7362
7363        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
7364        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
7365                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
7366    }
7367
7368     foreach(split("\n",$code)) {
7369        s/@\s/\/\//o;               # old->new style commentary
7370        s/\`([^\`]*)\`/eval($1)/ge;
7371
7372        m/\bld1r\b/ and s/\.16b/.2d/g    or
7373        s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
7374        print $_,"\n";
7375     }
7376}
7377
# Close STDOUT explicitly so that buffered output is written out now and a
# failure to write it aborts the script instead of going unnoticed.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
7379