# xref: /openssl/crypto/aes/asm/bsaes-armv8.pl (revision 7ed6de99)
1#!/usr/bin/env perl
2# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
use strict;
use warnings;

# Standard perlasm argument handling: the trailing argument is taken as
# the output filename if it looks like one (has an extension), and the
# leading argument is the assembler flavour if it contains no dot.
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

# Locate the arm-xlate.pl translator either next to this script or in
# the shared perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator.  Skip arguments that
# were not supplied instead of interpolating undef, and fail loudly if
# the pipe cannot be opened (otherwise a broken toolchain silently
# produces an empty assembly file).
my @xlate_args = grep { defined } $flavour, $output;
open OUT, "| \"$^X\" $xlate @xlate_args"
    or die "can't call $xlate: $!";
*STDOUT = *OUT;

# The entire assembly payload lives after __END__; emit it verbatim.
my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
27
# Slurp the whole __END__ section (the assembly payload) in one read.
sub data
{
    local $/ = undef;           # disable the input record separator
    my $payload = <DATA>;
    return $payload;
}
33
34__END__
35// Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved.
36//
37// Licensed under the OpenSSL license (the "License").  You may not use
38// this file except in compliance with the License.  You can obtain a copy
39// in the file LICENSE in the source distribution or at
40// https://www.openssl.org/source/license.html
41//
42// ====================================================================
43// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
44// project. Rights for redistribution and usage in source and binary
45// forms are granted according to the OpenSSL license.
46// ====================================================================
47//
48// This implementation is a translation of bsaes-armv7 for AArch64.
49// No attempt has been made to carry across the build switches for
50// kernel targets, since the Linux kernel crypto support has moved on
51// from when it was based on OpenSSL.
52
// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did; there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.
57
58
59#include "crypto/arm_arch.h"
60
61.text
62
63.extern AES_cbc_encrypt
64.extern AES_encrypt
65.extern AES_decrypt
66
67.type   _bsaes_decrypt8,%function
68.align  4
69// On entry:
70//   x9 -> key (previously expanded using _bsaes_key_convert)
71//   x10 = number of rounds
72//   v0-v7 input data
73// On exit:
74//   x9-x11 corrupted
75//   other general-purpose registers preserved
76//   v0-v7 output data
77//   v11-v15 preserved
78//   other SIMD registers corrupted
79_bsaes_decrypt8:
80        ldr     q8, [x9], #16
81        adr     x11, .LM0ISR
82        movi    v9.16b, #0x55
83        ldr     q10, [x11], #16
84        movi    v16.16b, #0x33
85        movi    v17.16b, #0x0f
86        sub     x10, x10, #1
87        eor     v0.16b, v0.16b, v8.16b
88        eor     v1.16b, v1.16b, v8.16b
89        eor     v2.16b, v2.16b, v8.16b
90        eor     v4.16b, v4.16b, v8.16b
91        eor     v3.16b, v3.16b, v8.16b
92        eor     v5.16b, v5.16b, v8.16b
93        tbl     v0.16b, {v0.16b}, v10.16b
94        tbl     v1.16b, {v1.16b}, v10.16b
95        tbl     v2.16b, {v2.16b}, v10.16b
96        tbl     v4.16b, {v4.16b}, v10.16b
97        eor     v6.16b, v6.16b, v8.16b
98        eor     v7.16b, v7.16b, v8.16b
99        tbl     v3.16b, {v3.16b}, v10.16b
100        tbl     v5.16b, {v5.16b}, v10.16b
101        tbl     v6.16b, {v6.16b}, v10.16b
102        ushr    v8.2d, v0.2d, #1
103        tbl     v7.16b, {v7.16b}, v10.16b
104        ushr    v10.2d, v4.2d, #1
105        ushr    v18.2d, v2.2d, #1
106        eor     v8.16b, v8.16b, v1.16b
107        ushr    v19.2d, v6.2d, #1
108        eor     v10.16b, v10.16b, v5.16b
109        eor     v18.16b, v18.16b, v3.16b
110        and     v8.16b, v8.16b, v9.16b
111        eor     v19.16b, v19.16b, v7.16b
112        and     v10.16b, v10.16b, v9.16b
113        and     v18.16b, v18.16b, v9.16b
114        eor     v1.16b, v1.16b, v8.16b
115        shl     v8.2d, v8.2d, #1
116        and     v9.16b, v19.16b, v9.16b
117        eor     v5.16b, v5.16b, v10.16b
118        shl     v10.2d, v10.2d, #1
119        eor     v3.16b, v3.16b, v18.16b
120        shl     v18.2d, v18.2d, #1
121        eor     v0.16b, v0.16b, v8.16b
122        shl     v8.2d, v9.2d, #1
123        eor     v7.16b, v7.16b, v9.16b
124        eor     v4.16b, v4.16b, v10.16b
125        eor     v2.16b, v2.16b, v18.16b
126        ushr    v9.2d, v1.2d, #2
127        eor     v6.16b, v6.16b, v8.16b
128        ushr    v8.2d, v0.2d, #2
129        ushr    v10.2d, v5.2d, #2
130        ushr    v18.2d, v4.2d, #2
131        eor     v9.16b, v9.16b, v3.16b
132        eor     v8.16b, v8.16b, v2.16b
133        eor     v10.16b, v10.16b, v7.16b
134        eor     v18.16b, v18.16b, v6.16b
135        and     v9.16b, v9.16b, v16.16b
136        and     v8.16b, v8.16b, v16.16b
137        and     v10.16b, v10.16b, v16.16b
138        and     v16.16b, v18.16b, v16.16b
139        eor     v3.16b, v3.16b, v9.16b
140        shl     v9.2d, v9.2d, #2
141        eor     v2.16b, v2.16b, v8.16b
142        shl     v8.2d, v8.2d, #2
143        eor     v7.16b, v7.16b, v10.16b
144        shl     v10.2d, v10.2d, #2
145        eor     v6.16b, v6.16b, v16.16b
146        shl     v16.2d, v16.2d, #2
147        eor     v1.16b, v1.16b, v9.16b
148        eor     v0.16b, v0.16b, v8.16b
149        eor     v5.16b, v5.16b, v10.16b
150        eor     v4.16b, v4.16b, v16.16b
151        ushr    v8.2d, v3.2d, #4
152        ushr    v9.2d, v2.2d, #4
153        ushr    v10.2d, v1.2d, #4
154        ushr    v16.2d, v0.2d, #4
155        eor     v8.16b, v8.16b, v7.16b
156        eor     v9.16b, v9.16b, v6.16b
157        eor     v10.16b, v10.16b, v5.16b
158        eor     v16.16b, v16.16b, v4.16b
159        and     v8.16b, v8.16b, v17.16b
160        and     v9.16b, v9.16b, v17.16b
161        and     v10.16b, v10.16b, v17.16b
162        and     v16.16b, v16.16b, v17.16b
163        eor     v7.16b, v7.16b, v8.16b
164        shl     v8.2d, v8.2d, #4
165        eor     v6.16b, v6.16b, v9.16b
166        shl     v9.2d, v9.2d, #4
167        eor     v5.16b, v5.16b, v10.16b
168        shl     v10.2d, v10.2d, #4
169        eor     v4.16b, v4.16b, v16.16b
170        shl     v16.2d, v16.2d, #4
171        eor     v3.16b, v3.16b, v8.16b
172        eor     v2.16b, v2.16b, v9.16b
173        eor     v1.16b, v1.16b, v10.16b
174        eor     v0.16b, v0.16b, v16.16b
175        b       .Ldec_sbox
176.align  4
177.Ldec_loop:
178        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
179        ldp     q8, q9, [x9], #32
180        eor     v0.16b, v16.16b, v0.16b
181        ldr     q10, [x9], #16
182        eor     v1.16b, v17.16b, v1.16b
183        ldr     q16, [x9], #16
184        eor     v2.16b, v18.16b, v2.16b
185        eor     v3.16b, v19.16b, v3.16b
186        eor     v4.16b, v8.16b, v4.16b
187        eor     v5.16b, v9.16b, v5.16b
188        eor     v6.16b, v10.16b, v6.16b
189        eor     v7.16b, v16.16b, v7.16b
190        tbl     v0.16b, {v0.16b}, v28.16b
191        tbl     v1.16b, {v1.16b}, v28.16b
192        tbl     v2.16b, {v2.16b}, v28.16b
193        tbl     v3.16b, {v3.16b}, v28.16b
194        tbl     v4.16b, {v4.16b}, v28.16b
195        tbl     v5.16b, {v5.16b}, v28.16b
196        tbl     v6.16b, {v6.16b}, v28.16b
197        tbl     v7.16b, {v7.16b}, v28.16b
198.Ldec_sbox:
199        eor     v1.16b, v1.16b, v4.16b
200        eor     v3.16b, v3.16b, v4.16b
201        subs    x10, x10, #1
202        eor     v4.16b, v4.16b, v7.16b
203        eor     v2.16b, v2.16b, v7.16b
204        eor     v1.16b, v1.16b, v6.16b
205        eor     v6.16b, v6.16b, v4.16b
206        eor     v2.16b, v2.16b, v5.16b
207        eor     v0.16b, v0.16b, v1.16b
208        eor     v7.16b, v7.16b, v6.16b
209        eor     v8.16b, v6.16b, v2.16b
210        and     v9.16b, v4.16b, v6.16b
211        eor     v10.16b, v2.16b, v6.16b
212        eor     v3.16b, v3.16b, v0.16b
213        eor     v5.16b, v5.16b, v0.16b
214        eor     v16.16b, v7.16b, v4.16b
215        eor     v17.16b, v4.16b, v0.16b
216        and     v18.16b, v0.16b, v2.16b
217        eor     v19.16b, v7.16b, v4.16b
218        eor     v1.16b, v1.16b, v3.16b
219        eor     v20.16b, v3.16b, v0.16b
220        eor     v21.16b, v5.16b, v2.16b
221        eor     v22.16b, v3.16b, v7.16b
222        and     v8.16b, v17.16b, v8.16b
223        orr     v17.16b, v3.16b, v5.16b
224        eor     v23.16b, v1.16b, v6.16b
225        eor     v24.16b, v20.16b, v16.16b
226        eor     v25.16b, v1.16b, v5.16b
227        orr     v26.16b, v20.16b, v21.16b
228        and     v20.16b, v20.16b, v21.16b
229        and     v27.16b, v7.16b, v1.16b
230        eor     v21.16b, v21.16b, v23.16b
231        orr     v28.16b, v16.16b, v23.16b
232        orr     v29.16b, v22.16b, v25.16b
233        eor     v26.16b, v26.16b, v8.16b
234        and     v16.16b, v16.16b, v23.16b
235        and     v22.16b, v22.16b, v25.16b
236        and     v21.16b, v24.16b, v21.16b
237        eor     v8.16b, v28.16b, v8.16b
238        eor     v23.16b, v5.16b, v2.16b
239        eor     v24.16b, v1.16b, v6.16b
240        eor     v16.16b, v16.16b, v22.16b
241        eor     v22.16b, v3.16b, v0.16b
242        eor     v25.16b, v29.16b, v21.16b
243        eor     v21.16b, v26.16b, v21.16b
244        eor     v8.16b, v8.16b, v20.16b
245        eor     v26.16b, v23.16b, v24.16b
246        eor     v16.16b, v16.16b, v20.16b
247        eor     v28.16b, v22.16b, v19.16b
248        eor     v20.16b, v25.16b, v20.16b
249        eor     v9.16b, v21.16b, v9.16b
250        eor     v8.16b, v8.16b, v18.16b
251        eor     v18.16b, v5.16b, v1.16b
252        eor     v21.16b, v16.16b, v17.16b
253        eor     v16.16b, v16.16b, v17.16b
254        eor     v17.16b, v20.16b, v27.16b
255        eor     v20.16b, v3.16b, v7.16b
256        eor     v25.16b, v9.16b, v8.16b
257        eor     v27.16b, v0.16b, v4.16b
258        and     v29.16b, v9.16b, v17.16b
259        eor     v30.16b, v8.16b, v29.16b
260        eor     v31.16b, v21.16b, v29.16b
261        eor     v29.16b, v21.16b, v29.16b
262        bsl     v30.16b, v17.16b, v21.16b
263        bsl     v31.16b, v9.16b, v8.16b
264        bsl     v16.16b, v30.16b, v29.16b
265        bsl     v21.16b, v29.16b, v30.16b
266        eor     v8.16b, v31.16b, v30.16b
267        and     v1.16b, v1.16b, v31.16b
268        and     v9.16b, v16.16b, v31.16b
269        and     v6.16b, v6.16b, v30.16b
270        eor     v16.16b, v17.16b, v21.16b
271        and     v4.16b, v4.16b, v30.16b
272        eor     v17.16b, v8.16b, v30.16b
273        and     v21.16b, v24.16b, v8.16b
274        eor     v9.16b, v9.16b, v25.16b
275        and     v19.16b, v19.16b, v8.16b
276        eor     v24.16b, v30.16b, v16.16b
277        eor     v25.16b, v30.16b, v16.16b
278        and     v7.16b, v7.16b, v17.16b
279        and     v10.16b, v10.16b, v16.16b
280        eor     v29.16b, v9.16b, v16.16b
281        eor     v30.16b, v31.16b, v9.16b
282        and     v0.16b, v24.16b, v0.16b
283        and     v9.16b, v18.16b, v9.16b
284        and     v2.16b, v25.16b, v2.16b
285        eor     v10.16b, v10.16b, v6.16b
286        eor     v18.16b, v29.16b, v16.16b
287        and     v5.16b, v30.16b, v5.16b
288        eor     v24.16b, v8.16b, v29.16b
289        and     v25.16b, v26.16b, v29.16b
290        and     v26.16b, v28.16b, v29.16b
291        eor     v8.16b, v8.16b, v29.16b
292        eor     v17.16b, v17.16b, v18.16b
293        eor     v5.16b, v1.16b, v5.16b
294        and     v23.16b, v24.16b, v23.16b
295        eor     v21.16b, v21.16b, v25.16b
296        eor     v19.16b, v19.16b, v26.16b
297        eor     v0.16b, v4.16b, v0.16b
298        and     v3.16b, v17.16b, v3.16b
299        eor     v1.16b, v9.16b, v1.16b
300        eor     v9.16b, v25.16b, v23.16b
301        eor     v5.16b, v5.16b, v21.16b
302        eor     v2.16b, v6.16b, v2.16b
303        and     v6.16b, v8.16b, v22.16b
304        eor     v3.16b, v7.16b, v3.16b
305        and     v8.16b, v20.16b, v18.16b
306        eor     v10.16b, v10.16b, v9.16b
307        eor     v0.16b, v0.16b, v19.16b
308        eor     v9.16b, v1.16b, v9.16b
309        eor     v1.16b, v2.16b, v21.16b
310        eor     v3.16b, v3.16b, v19.16b
311        and     v16.16b, v27.16b, v16.16b
312        eor     v17.16b, v26.16b, v6.16b
313        eor     v6.16b, v8.16b, v7.16b
314        eor     v7.16b, v1.16b, v9.16b
315        eor     v1.16b, v5.16b, v3.16b
316        eor     v2.16b, v10.16b, v3.16b
317        eor     v4.16b, v16.16b, v4.16b
318        eor     v8.16b, v6.16b, v17.16b
319        eor     v5.16b, v9.16b, v3.16b
320        eor     v9.16b, v0.16b, v1.16b
321        eor     v6.16b, v7.16b, v1.16b
322        eor     v0.16b, v4.16b, v17.16b
323        eor     v4.16b, v8.16b, v7.16b
324        eor     v7.16b, v9.16b, v2.16b
325        eor     v8.16b, v3.16b, v0.16b
326        eor     v7.16b, v7.16b, v5.16b
327        eor     v3.16b, v4.16b, v7.16b
328        eor     v4.16b, v7.16b, v0.16b
329        eor     v7.16b, v8.16b, v3.16b
330        bcc     .Ldec_done
331        ext     v8.16b, v0.16b, v0.16b, #8
332        ext     v9.16b, v1.16b, v1.16b, #8
333        ldr     q28, [x11]                  // load from .LISR in common case (x10 > 0)
334        ext     v10.16b, v6.16b, v6.16b, #8
335        ext     v16.16b, v3.16b, v3.16b, #8
336        ext     v17.16b, v5.16b, v5.16b, #8
337        ext     v18.16b, v4.16b, v4.16b, #8
338        eor     v8.16b, v8.16b, v0.16b
339        eor     v9.16b, v9.16b, v1.16b
340        eor     v10.16b, v10.16b, v6.16b
341        eor     v16.16b, v16.16b, v3.16b
342        eor     v17.16b, v17.16b, v5.16b
343        ext     v19.16b, v2.16b, v2.16b, #8
344        ext     v20.16b, v7.16b, v7.16b, #8
345        eor     v18.16b, v18.16b, v4.16b
346        eor     v6.16b, v6.16b, v8.16b
347        eor     v8.16b, v2.16b, v10.16b
348        eor     v4.16b, v4.16b, v9.16b
349        eor     v2.16b, v19.16b, v2.16b
350        eor     v9.16b, v20.16b, v7.16b
351        eor     v0.16b, v0.16b, v16.16b
352        eor     v1.16b, v1.16b, v16.16b
353        eor     v6.16b, v6.16b, v17.16b
354        eor     v8.16b, v8.16b, v16.16b
355        eor     v7.16b, v7.16b, v18.16b
356        eor     v4.16b, v4.16b, v16.16b
357        eor     v2.16b, v3.16b, v2.16b
358        eor     v1.16b, v1.16b, v17.16b
359        eor     v3.16b, v5.16b, v9.16b
360        eor     v5.16b, v8.16b, v17.16b
361        eor     v7.16b, v7.16b, v17.16b
362        ext     v8.16b, v0.16b, v0.16b, #12
363        ext     v9.16b, v6.16b, v6.16b, #12
364        ext     v10.16b, v4.16b, v4.16b, #12
365        ext     v16.16b, v1.16b, v1.16b, #12
366        ext     v17.16b, v5.16b, v5.16b, #12
367        ext     v18.16b, v7.16b, v7.16b, #12
368        eor     v0.16b, v0.16b, v8.16b
369        eor     v6.16b, v6.16b, v9.16b
370        eor     v4.16b, v4.16b, v10.16b
371        ext     v19.16b, v2.16b, v2.16b, #12
372        ext     v20.16b, v3.16b, v3.16b, #12
373        eor     v1.16b, v1.16b, v16.16b
374        eor     v5.16b, v5.16b, v17.16b
375        eor     v7.16b, v7.16b, v18.16b
376        eor     v2.16b, v2.16b, v19.16b
377        eor     v16.16b, v16.16b, v0.16b
378        eor     v3.16b, v3.16b, v20.16b
379        eor     v17.16b, v17.16b, v4.16b
380        eor     v10.16b, v10.16b, v6.16b
381        ext     v0.16b, v0.16b, v0.16b, #8
382        eor     v9.16b, v9.16b, v1.16b
383        ext     v1.16b, v1.16b, v1.16b, #8
384        eor     v8.16b, v8.16b, v3.16b
385        eor     v16.16b, v16.16b, v3.16b
386        eor     v18.16b, v18.16b, v5.16b
387        eor     v19.16b, v19.16b, v7.16b
388        ext     v21.16b, v5.16b, v5.16b, #8
389        ext     v5.16b, v7.16b, v7.16b, #8
390        eor     v7.16b, v20.16b, v2.16b
391        ext     v4.16b, v4.16b, v4.16b, #8
392        ext     v20.16b, v3.16b, v3.16b, #8
393        eor     v17.16b, v17.16b, v3.16b
394        ext     v2.16b, v2.16b, v2.16b, #8
395        eor     v3.16b, v10.16b, v3.16b
396        ext     v10.16b, v6.16b, v6.16b, #8
397        eor     v0.16b, v0.16b, v8.16b
398        eor     v1.16b, v1.16b, v16.16b
399        eor     v5.16b, v5.16b, v18.16b
400        eor     v3.16b, v3.16b, v4.16b
401        eor     v7.16b, v20.16b, v7.16b
402        eor     v6.16b, v2.16b, v19.16b
403        eor     v4.16b, v21.16b, v17.16b
404        eor     v2.16b, v10.16b, v9.16b
405        bne     .Ldec_loop
406        ldr     q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0)
407        b       .Ldec_loop
408.align  4
409.Ldec_done:
410        ushr    v8.2d, v0.2d, #1
411        movi    v9.16b, #0x55
412        ldr     q10, [x9]
413        ushr    v16.2d, v2.2d, #1
414        movi    v17.16b, #0x33
415        ushr    v18.2d, v6.2d, #1
416        movi    v19.16b, #0x0f
417        eor     v8.16b, v8.16b, v1.16b
418        ushr    v20.2d, v3.2d, #1
419        eor     v16.16b, v16.16b, v7.16b
420        eor     v18.16b, v18.16b, v4.16b
421        and     v8.16b, v8.16b, v9.16b
422        eor     v20.16b, v20.16b, v5.16b
423        and     v16.16b, v16.16b, v9.16b
424        and     v18.16b, v18.16b, v9.16b
425        shl     v21.2d, v8.2d, #1
426        eor     v1.16b, v1.16b, v8.16b
427        and     v8.16b, v20.16b, v9.16b
428        eor     v7.16b, v7.16b, v16.16b
429        shl     v9.2d, v16.2d, #1
430        eor     v4.16b, v4.16b, v18.16b
431        shl     v16.2d, v18.2d, #1
432        eor     v0.16b, v0.16b, v21.16b
433        shl     v18.2d, v8.2d, #1
434        eor     v5.16b, v5.16b, v8.16b
435        eor     v2.16b, v2.16b, v9.16b
436        eor     v6.16b, v6.16b, v16.16b
437        ushr    v8.2d, v1.2d, #2
438        eor     v3.16b, v3.16b, v18.16b
439        ushr    v9.2d, v0.2d, #2
440        ushr    v16.2d, v7.2d, #2
441        ushr    v18.2d, v2.2d, #2
442        eor     v8.16b, v8.16b, v4.16b
443        eor     v9.16b, v9.16b, v6.16b
444        eor     v16.16b, v16.16b, v5.16b
445        eor     v18.16b, v18.16b, v3.16b
446        and     v8.16b, v8.16b, v17.16b
447        and     v9.16b, v9.16b, v17.16b
448        and     v16.16b, v16.16b, v17.16b
449        and     v17.16b, v18.16b, v17.16b
450        eor     v4.16b, v4.16b, v8.16b
451        shl     v8.2d, v8.2d, #2
452        eor     v6.16b, v6.16b, v9.16b
453        shl     v9.2d, v9.2d, #2
454        eor     v5.16b, v5.16b, v16.16b
455        shl     v16.2d, v16.2d, #2
456        eor     v3.16b, v3.16b, v17.16b
457        shl     v17.2d, v17.2d, #2
458        eor     v1.16b, v1.16b, v8.16b
459        eor     v0.16b, v0.16b, v9.16b
460        eor     v7.16b, v7.16b, v16.16b
461        eor     v2.16b, v2.16b, v17.16b
462        ushr    v8.2d, v4.2d, #4
463        ushr    v9.2d, v6.2d, #4
464        ushr    v16.2d, v1.2d, #4
465        ushr    v17.2d, v0.2d, #4
466        eor     v8.16b, v8.16b, v5.16b
467        eor     v9.16b, v9.16b, v3.16b
468        eor     v16.16b, v16.16b, v7.16b
469        eor     v17.16b, v17.16b, v2.16b
470        and     v8.16b, v8.16b, v19.16b
471        and     v9.16b, v9.16b, v19.16b
472        and     v16.16b, v16.16b, v19.16b
473        and     v17.16b, v17.16b, v19.16b
474        eor     v5.16b, v5.16b, v8.16b
475        shl     v8.2d, v8.2d, #4
476        eor     v3.16b, v3.16b, v9.16b
477        shl     v9.2d, v9.2d, #4
478        eor     v7.16b, v7.16b, v16.16b
479        shl     v16.2d, v16.2d, #4
480        eor     v2.16b, v2.16b, v17.16b
481        shl     v17.2d, v17.2d, #4
482        eor     v4.16b, v4.16b, v8.16b
483        eor     v6.16b, v6.16b, v9.16b
484        eor     v7.16b, v7.16b, v10.16b
485        eor     v1.16b, v1.16b, v16.16b
486        eor     v2.16b, v2.16b, v10.16b
487        eor     v0.16b, v0.16b, v17.16b
488        eor     v4.16b, v4.16b, v10.16b
489        eor     v6.16b, v6.16b, v10.16b
490        eor     v3.16b, v3.16b, v10.16b
491        eor     v5.16b, v5.16b, v10.16b
492        eor     v1.16b, v1.16b, v10.16b
493        eor     v0.16b, v0.16b, v10.16b
494        ret
495.size   _bsaes_decrypt8,.-_bsaes_decrypt8
496
497.type   _bsaes_const,%object
498.align  6
499_bsaes_const:
500// InvShiftRows constants
501// Used in _bsaes_decrypt8, which assumes contiguity
502// .LM0ISR used with round 0 key
503// .LISR   used with middle round keys
504// .LISRM0 used with final round key
505.LM0ISR:
506.quad   0x0a0e0206070b0f03, 0x0004080c0d010509
507.LISR:
508.quad   0x0504070602010003, 0x0f0e0d0c080b0a09
509.LISRM0:
510.quad   0x01040b0e0205080f, 0x0306090c00070a0d
511
512// ShiftRows constants
513// Used in _bsaes_encrypt8, which assumes contiguity
514// .LM0SR used with round 0 key
515// .LSR   used with middle round keys
516// .LSRM0 used with final round key
517.LM0SR:
518.quad   0x0a0e02060f03070b, 0x0004080c05090d01
519.LSR:
520.quad   0x0504070600030201, 0x0f0e0d0c0a09080b
521.LSRM0:
522.quad   0x0304090e00050a0f, 0x01060b0c0207080d
523
524.LM0_bigendian:
525.quad   0x02060a0e03070b0f, 0x0004080c0105090d
526.LM0_littleendian:
527.quad   0x0105090d0004080c, 0x03070b0f02060a0e
528
529// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
530// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
531.LREVM0SR:
532.quad   0x090d01050c000408, 0x03070b0f060a0e02
533
534.align  6
535.size   _bsaes_const,.-_bsaes_const
536
537.type   _bsaes_encrypt8,%function
538.align  4
539// On entry:
540//   x9 -> key (previously expanded using _bsaes_key_convert)
541//   x10 = number of rounds
542//   v0-v7 input data
543// On exit:
544//   x9-x11 corrupted
545//   other general-purpose registers preserved
546//   v0-v7 output data
547//   v11-v15 preserved
548//   other SIMD registers corrupted
549_bsaes_encrypt8:
550        ldr     q8, [x9], #16
551        adr     x11, .LM0SR
552        ldr     q9, [x11], #16
553_bsaes_encrypt8_alt:
554        eor     v0.16b, v0.16b, v8.16b
555        eor     v1.16b, v1.16b, v8.16b
556        sub     x10, x10, #1
557        eor     v2.16b, v2.16b, v8.16b
558        eor     v4.16b, v4.16b, v8.16b
559        eor     v3.16b, v3.16b, v8.16b
560        eor     v5.16b, v5.16b, v8.16b
561        tbl     v0.16b, {v0.16b}, v9.16b
562        tbl     v1.16b, {v1.16b}, v9.16b
563        tbl     v2.16b, {v2.16b}, v9.16b
564        tbl     v4.16b, {v4.16b}, v9.16b
565        eor     v6.16b, v6.16b, v8.16b
566        eor     v7.16b, v7.16b, v8.16b
567        tbl     v3.16b, {v3.16b}, v9.16b
568        tbl     v5.16b, {v5.16b}, v9.16b
569        tbl     v6.16b, {v6.16b}, v9.16b
570        ushr    v8.2d, v0.2d, #1
571        movi    v10.16b, #0x55
572        tbl     v7.16b, {v7.16b}, v9.16b
573        ushr    v9.2d, v4.2d, #1
574        movi    v16.16b, #0x33
575        ushr    v17.2d, v2.2d, #1
576        eor     v8.16b, v8.16b, v1.16b
577        movi    v18.16b, #0x0f
578        ushr    v19.2d, v6.2d, #1
579        eor     v9.16b, v9.16b, v5.16b
580        eor     v17.16b, v17.16b, v3.16b
581        and     v8.16b, v8.16b, v10.16b
582        eor     v19.16b, v19.16b, v7.16b
583        and     v9.16b, v9.16b, v10.16b
584        and     v17.16b, v17.16b, v10.16b
585        eor     v1.16b, v1.16b, v8.16b
586        shl     v8.2d, v8.2d, #1
587        and     v10.16b, v19.16b, v10.16b
588        eor     v5.16b, v5.16b, v9.16b
589        shl     v9.2d, v9.2d, #1
590        eor     v3.16b, v3.16b, v17.16b
591        shl     v17.2d, v17.2d, #1
592        eor     v0.16b, v0.16b, v8.16b
593        shl     v8.2d, v10.2d, #1
594        eor     v7.16b, v7.16b, v10.16b
595        eor     v4.16b, v4.16b, v9.16b
596        eor     v2.16b, v2.16b, v17.16b
597        ushr    v9.2d, v1.2d, #2
598        eor     v6.16b, v6.16b, v8.16b
599        ushr    v8.2d, v0.2d, #2
600        ushr    v10.2d, v5.2d, #2
601        ushr    v17.2d, v4.2d, #2
602        eor     v9.16b, v9.16b, v3.16b
603        eor     v8.16b, v8.16b, v2.16b
604        eor     v10.16b, v10.16b, v7.16b
605        eor     v17.16b, v17.16b, v6.16b
606        and     v9.16b, v9.16b, v16.16b
607        and     v8.16b, v8.16b, v16.16b
608        and     v10.16b, v10.16b, v16.16b
609        and     v16.16b, v17.16b, v16.16b
610        eor     v3.16b, v3.16b, v9.16b
611        shl     v9.2d, v9.2d, #2
612        eor     v2.16b, v2.16b, v8.16b
613        shl     v8.2d, v8.2d, #2
614        eor     v7.16b, v7.16b, v10.16b
615        shl     v10.2d, v10.2d, #2
616        eor     v6.16b, v6.16b, v16.16b
617        shl     v16.2d, v16.2d, #2
618        eor     v1.16b, v1.16b, v9.16b
619        eor     v0.16b, v0.16b, v8.16b
620        eor     v5.16b, v5.16b, v10.16b
621        eor     v4.16b, v4.16b, v16.16b
622        ushr    v8.2d, v3.2d, #4
623        ushr    v9.2d, v2.2d, #4
624        ushr    v10.2d, v1.2d, #4
625        ushr    v16.2d, v0.2d, #4
626        eor     v8.16b, v8.16b, v7.16b
627        eor     v9.16b, v9.16b, v6.16b
628        eor     v10.16b, v10.16b, v5.16b
629        eor     v16.16b, v16.16b, v4.16b
630        and     v8.16b, v8.16b, v18.16b
631        and     v9.16b, v9.16b, v18.16b
632        and     v10.16b, v10.16b, v18.16b
633        and     v16.16b, v16.16b, v18.16b
634        eor     v7.16b, v7.16b, v8.16b
635        shl     v8.2d, v8.2d, #4
636        eor     v6.16b, v6.16b, v9.16b
637        shl     v9.2d, v9.2d, #4
638        eor     v5.16b, v5.16b, v10.16b
639        shl     v10.2d, v10.2d, #4
640        eor     v4.16b, v4.16b, v16.16b
641        shl     v16.2d, v16.2d, #4
642        eor     v3.16b, v3.16b, v8.16b
643        eor     v2.16b, v2.16b, v9.16b
644        eor     v1.16b, v1.16b, v10.16b
645        eor     v0.16b, v0.16b, v16.16b
646        b       .Lenc_sbox
647.align  4
648.Lenc_loop:
649        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
650        ldp     q8, q9, [x9], #32
651        eor     v0.16b, v16.16b, v0.16b
652        ldr     q10, [x9], #16
653        eor     v1.16b, v17.16b, v1.16b
654        ldr     q16, [x9], #16
655        eor     v2.16b, v18.16b, v2.16b
656        eor     v3.16b, v19.16b, v3.16b
657        eor     v4.16b, v8.16b, v4.16b
658        eor     v5.16b, v9.16b, v5.16b
659        eor     v6.16b, v10.16b, v6.16b
660        eor     v7.16b, v16.16b, v7.16b
661        tbl     v0.16b, {v0.16b}, v28.16b
662        tbl     v1.16b, {v1.16b}, v28.16b
663        tbl     v2.16b, {v2.16b}, v28.16b
664        tbl     v3.16b, {v3.16b}, v28.16b
665        tbl     v4.16b, {v4.16b}, v28.16b
666        tbl     v5.16b, {v5.16b}, v28.16b
667        tbl     v6.16b, {v6.16b}, v28.16b
668        tbl     v7.16b, {v7.16b}, v28.16b
669.Lenc_sbox:
670        eor     v5.16b, v5.16b, v6.16b
671        eor     v3.16b, v3.16b, v0.16b
672        subs    x10, x10, #1
673        eor     v2.16b, v2.16b, v1.16b
674        eor     v5.16b, v5.16b, v0.16b
675        eor     v8.16b, v3.16b, v7.16b
676        eor     v6.16b, v6.16b, v2.16b
677        eor     v7.16b, v7.16b, v5.16b
678        eor     v8.16b, v8.16b, v4.16b
679        eor     v3.16b, v6.16b, v3.16b
680        eor     v4.16b, v4.16b, v5.16b
681        eor     v6.16b, v1.16b, v5.16b
682        eor     v2.16b, v2.16b, v7.16b
683        eor     v1.16b, v8.16b, v1.16b
684        eor     v8.16b, v7.16b, v4.16b
685        eor     v9.16b, v3.16b, v0.16b
686        eor     v10.16b, v7.16b, v6.16b
687        eor     v16.16b, v5.16b, v3.16b
688        eor     v17.16b, v6.16b, v2.16b
689        eor     v18.16b, v5.16b, v1.16b
690        eor     v19.16b, v2.16b, v4.16b
691        eor     v20.16b, v1.16b, v0.16b
692        orr     v21.16b, v8.16b, v9.16b
693        orr     v22.16b, v10.16b, v16.16b
694        eor     v23.16b, v8.16b, v17.16b
695        eor     v24.16b, v9.16b, v18.16b
696        and     v19.16b, v19.16b, v20.16b
697        orr     v20.16b, v17.16b, v18.16b
698        and     v8.16b, v8.16b, v9.16b
699        and     v9.16b, v17.16b, v18.16b
700        and     v17.16b, v23.16b, v24.16b
701        and     v10.16b, v10.16b, v16.16b
702        eor     v16.16b, v21.16b, v19.16b
703        eor     v18.16b, v20.16b, v19.16b
704        and     v19.16b, v2.16b, v1.16b
705        and     v20.16b, v6.16b, v5.16b
706        eor     v21.16b, v22.16b, v17.16b
707        eor     v9.16b, v9.16b, v10.16b
708        eor     v10.16b, v16.16b, v17.16b
709        eor     v16.16b, v18.16b, v8.16b
710        and     v17.16b, v4.16b, v0.16b
711        orr     v18.16b, v7.16b, v3.16b
712        eor     v21.16b, v21.16b, v8.16b
713        eor     v8.16b, v9.16b, v8.16b
714        eor     v9.16b, v10.16b, v19.16b
715        eor     v10.16b, v3.16b, v0.16b
716        eor     v16.16b, v16.16b, v17.16b
717        eor     v17.16b, v5.16b, v1.16b
718        eor     v19.16b, v21.16b, v20.16b
719        eor     v20.16b, v8.16b, v18.16b
720        eor     v8.16b, v8.16b, v18.16b
721        eor     v18.16b, v7.16b, v4.16b
722        eor     v21.16b, v9.16b, v16.16b
723        eor     v22.16b, v6.16b, v2.16b
724        and     v23.16b, v9.16b, v19.16b
725        eor     v24.16b, v10.16b, v17.16b
726        eor     v25.16b, v0.16b, v1.16b
727        eor     v26.16b, v7.16b, v6.16b
728        eor     v27.16b, v18.16b, v22.16b
729        eor     v28.16b, v3.16b, v5.16b
730        eor     v29.16b, v16.16b, v23.16b
731        eor     v30.16b, v20.16b, v23.16b
732        eor     v23.16b, v20.16b, v23.16b
733        eor     v31.16b, v4.16b, v2.16b
734        bsl     v29.16b, v19.16b, v20.16b
735        bsl     v30.16b, v9.16b, v16.16b
736        bsl     v8.16b, v29.16b, v23.16b
737        bsl     v20.16b, v23.16b, v29.16b
738        eor     v9.16b, v30.16b, v29.16b
739        and     v5.16b, v5.16b, v30.16b
740        and     v8.16b, v8.16b, v30.16b
741        and     v1.16b, v1.16b, v29.16b
742        eor     v16.16b, v19.16b, v20.16b
743        and     v2.16b, v2.16b, v29.16b
744        eor     v19.16b, v9.16b, v29.16b
745        and     v17.16b, v17.16b, v9.16b
746        eor     v8.16b, v8.16b, v21.16b
747        and     v20.16b, v22.16b, v9.16b
748        eor     v21.16b, v29.16b, v16.16b
749        eor     v22.16b, v29.16b, v16.16b
750        and     v23.16b, v25.16b, v16.16b
751        and     v6.16b, v6.16b, v19.16b
752        eor     v25.16b, v8.16b, v16.16b
753        eor     v29.16b, v30.16b, v8.16b
754        and     v4.16b, v21.16b, v4.16b
755        and     v8.16b, v28.16b, v8.16b
756        and     v0.16b, v22.16b, v0.16b
757        eor     v21.16b, v23.16b, v1.16b
758        eor     v22.16b, v9.16b, v25.16b
759        eor     v9.16b, v9.16b, v25.16b
760        eor     v23.16b, v25.16b, v16.16b
761        and     v3.16b, v29.16b, v3.16b
762        and     v24.16b, v24.16b, v25.16b
763        and     v25.16b, v27.16b, v25.16b
764        and     v10.16b, v22.16b, v10.16b
765        and     v9.16b, v9.16b, v18.16b
766        eor     v18.16b, v19.16b, v23.16b
767        and     v19.16b, v26.16b, v23.16b
768        eor     v3.16b, v5.16b, v3.16b
769        eor     v17.16b, v17.16b, v24.16b
770        eor     v10.16b, v24.16b, v10.16b
771        and     v16.16b, v31.16b, v16.16b
772        eor     v20.16b, v20.16b, v25.16b
773        eor     v9.16b, v25.16b, v9.16b
774        eor     v4.16b, v2.16b, v4.16b
775        and     v7.16b, v18.16b, v7.16b
776        eor     v18.16b, v19.16b, v6.16b
777        eor     v5.16b, v8.16b, v5.16b
778        eor     v0.16b, v1.16b, v0.16b
779        eor     v1.16b, v21.16b, v10.16b
780        eor     v8.16b, v3.16b, v17.16b
781        eor     v2.16b, v16.16b, v2.16b
782        eor     v3.16b, v6.16b, v7.16b
783        eor     v6.16b, v18.16b, v9.16b
784        eor     v4.16b, v4.16b, v20.16b
785        eor     v10.16b, v5.16b, v10.16b
786        eor     v0.16b, v0.16b, v17.16b
787        eor     v9.16b, v2.16b, v9.16b
788        eor     v3.16b, v3.16b, v20.16b
789        eor     v7.16b, v6.16b, v1.16b
790        eor     v5.16b, v8.16b, v4.16b
791        eor     v6.16b, v10.16b, v1.16b
792        eor     v2.16b, v4.16b, v0.16b
793        eor     v4.16b, v3.16b, v10.16b
794        eor     v9.16b, v9.16b, v7.16b
795        eor     v3.16b, v0.16b, v5.16b
796        eor     v0.16b, v1.16b, v4.16b
797        eor     v1.16b, v4.16b, v8.16b
798        eor     v4.16b, v9.16b, v5.16b
799        eor     v6.16b, v6.16b, v3.16b
800        bcc     .Lenc_done
801        ext     v8.16b, v0.16b, v0.16b, #12
802        ext     v9.16b, v4.16b, v4.16b, #12
803        ldr     q28, [x11]
804        ext     v10.16b, v6.16b, v6.16b, #12
805        ext     v16.16b, v1.16b, v1.16b, #12
806        ext     v17.16b, v3.16b, v3.16b, #12
807        ext     v18.16b, v7.16b, v7.16b, #12
808        eor     v0.16b, v0.16b, v8.16b
809        eor     v4.16b, v4.16b, v9.16b
810        eor     v6.16b, v6.16b, v10.16b
811        ext     v19.16b, v2.16b, v2.16b, #12
812        ext     v20.16b, v5.16b, v5.16b, #12
813        eor     v1.16b, v1.16b, v16.16b
814        eor     v3.16b, v3.16b, v17.16b
815        eor     v7.16b, v7.16b, v18.16b
816        eor     v2.16b, v2.16b, v19.16b
817        eor     v16.16b, v16.16b, v0.16b
818        eor     v5.16b, v5.16b, v20.16b
819        eor     v17.16b, v17.16b, v6.16b
820        eor     v10.16b, v10.16b, v4.16b
821        ext     v0.16b, v0.16b, v0.16b, #8
822        eor     v9.16b, v9.16b, v1.16b
823        ext     v1.16b, v1.16b, v1.16b, #8
824        eor     v8.16b, v8.16b, v5.16b
825        eor     v16.16b, v16.16b, v5.16b
826        eor     v18.16b, v18.16b, v3.16b
827        eor     v19.16b, v19.16b, v7.16b
828        ext     v3.16b, v3.16b, v3.16b, #8
829        ext     v7.16b, v7.16b, v7.16b, #8
830        eor     v20.16b, v20.16b, v2.16b
831        ext     v6.16b, v6.16b, v6.16b, #8
832        ext     v21.16b, v5.16b, v5.16b, #8
833        eor     v17.16b, v17.16b, v5.16b
834        ext     v2.16b, v2.16b, v2.16b, #8
835        eor     v10.16b, v10.16b, v5.16b
836        ext     v22.16b, v4.16b, v4.16b, #8
837        eor     v0.16b, v0.16b, v8.16b
838        eor     v1.16b, v1.16b, v16.16b
839        eor     v5.16b, v7.16b, v18.16b
840        eor     v4.16b, v3.16b, v17.16b
841        eor     v3.16b, v6.16b, v10.16b
842        eor     v7.16b, v21.16b, v20.16b
843        eor     v6.16b, v2.16b, v19.16b
844        eor     v2.16b, v22.16b, v9.16b
845        bne     .Lenc_loop
846        ldr     q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
847        b       .Lenc_loop
848.align  4
849.Lenc_done:
850        ushr    v8.2d, v0.2d, #1
851        movi    v9.16b, #0x55
852        ldr     q10, [x9]
853        ushr    v16.2d, v3.2d, #1
854        movi    v17.16b, #0x33
855        ushr    v18.2d, v4.2d, #1
856        movi    v19.16b, #0x0f
857        eor     v8.16b, v8.16b, v1.16b
858        ushr    v20.2d, v2.2d, #1
859        eor     v16.16b, v16.16b, v7.16b
860        eor     v18.16b, v18.16b, v6.16b
861        and     v8.16b, v8.16b, v9.16b
862        eor     v20.16b, v20.16b, v5.16b
863        and     v16.16b, v16.16b, v9.16b
864        and     v18.16b, v18.16b, v9.16b
865        shl     v21.2d, v8.2d, #1
866        eor     v1.16b, v1.16b, v8.16b
867        and     v8.16b, v20.16b, v9.16b
868        eor     v7.16b, v7.16b, v16.16b
869        shl     v9.2d, v16.2d, #1
870        eor     v6.16b, v6.16b, v18.16b
871        shl     v16.2d, v18.2d, #1
872        eor     v0.16b, v0.16b, v21.16b
873        shl     v18.2d, v8.2d, #1
874        eor     v5.16b, v5.16b, v8.16b
875        eor     v3.16b, v3.16b, v9.16b
876        eor     v4.16b, v4.16b, v16.16b
877        ushr    v8.2d, v1.2d, #2
878        eor     v2.16b, v2.16b, v18.16b
879        ushr    v9.2d, v0.2d, #2
880        ushr    v16.2d, v7.2d, #2
881        ushr    v18.2d, v3.2d, #2
882        eor     v8.16b, v8.16b, v6.16b
883        eor     v9.16b, v9.16b, v4.16b
884        eor     v16.16b, v16.16b, v5.16b
885        eor     v18.16b, v18.16b, v2.16b
886        and     v8.16b, v8.16b, v17.16b
887        and     v9.16b, v9.16b, v17.16b
888        and     v16.16b, v16.16b, v17.16b
889        and     v17.16b, v18.16b, v17.16b
890        eor     v6.16b, v6.16b, v8.16b
891        shl     v8.2d, v8.2d, #2
892        eor     v4.16b, v4.16b, v9.16b
893        shl     v9.2d, v9.2d, #2
894        eor     v5.16b, v5.16b, v16.16b
895        shl     v16.2d, v16.2d, #2
896        eor     v2.16b, v2.16b, v17.16b
897        shl     v17.2d, v17.2d, #2
898        eor     v1.16b, v1.16b, v8.16b
899        eor     v0.16b, v0.16b, v9.16b
900        eor     v7.16b, v7.16b, v16.16b
901        eor     v3.16b, v3.16b, v17.16b
902        ushr    v8.2d, v6.2d, #4
903        ushr    v9.2d, v4.2d, #4
904        ushr    v16.2d, v1.2d, #4
905        ushr    v17.2d, v0.2d, #4
906        eor     v8.16b, v8.16b, v5.16b
907        eor     v9.16b, v9.16b, v2.16b
908        eor     v16.16b, v16.16b, v7.16b
909        eor     v17.16b, v17.16b, v3.16b
910        and     v8.16b, v8.16b, v19.16b
911        and     v9.16b, v9.16b, v19.16b
912        and     v16.16b, v16.16b, v19.16b
913        and     v17.16b, v17.16b, v19.16b
914        eor     v5.16b, v5.16b, v8.16b
915        shl     v8.2d, v8.2d, #4
916        eor     v2.16b, v2.16b, v9.16b
917        shl     v9.2d, v9.2d, #4
918        eor     v7.16b, v7.16b, v16.16b
919        shl     v16.2d, v16.2d, #4
920        eor     v3.16b, v3.16b, v17.16b
921        shl     v17.2d, v17.2d, #4
922        eor     v6.16b, v6.16b, v8.16b
923        eor     v4.16b, v4.16b, v9.16b
924        eor     v7.16b, v7.16b, v10.16b
925        eor     v1.16b, v1.16b, v16.16b
926        eor     v3.16b, v3.16b, v10.16b
927        eor     v0.16b, v0.16b, v17.16b
928        eor     v6.16b, v6.16b, v10.16b
929        eor     v4.16b, v4.16b, v10.16b
930        eor     v2.16b, v2.16b, v10.16b
931        eor     v5.16b, v5.16b, v10.16b
932        eor     v1.16b, v1.16b, v10.16b
933        eor     v0.16b, v0.16b, v10.16b
934        ret
935.size   _bsaes_encrypt8,.-_bsaes_encrypt8
936
.type   _bsaes_key_convert,%function
.align  4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
//
// Converts an expanded AES key schedule into the bit-sliced form used
// by the _bsaes_* block routines: every inner round key is widened to
// 128 bytes — eight 16-byte vectors, one per bit position, each byte
// all-ones or all-zeros depending on the corresponding key bit.
_bsaes_key_convert:
#ifdef __AARCH64EL__
        adr     x11, .LM0_littleendian
#else
        adr     x11, .LM0_bigendian
#endif
        ldr     q0, [x9], #16               // load round 0 key
        ldr     q1, [x11]                   // .LM0 (byte permutation applied to each round key below)
        ldr     q15, [x9], #16              // load round 1 key

        movi    v7.16b, #0x63               // compose .L63
        movi    v16.16b, #0x01              // bit masks
        movi    v17.16b, #0x02
        movi    v18.16b, #0x04
        movi    v19.16b, #0x08
        movi    v20.16b, #0x10
        movi    v21.16b, #0x20
        movi    v22.16b, #0x40
        movi    v23.16b, #0x80

#ifdef __AARCH64EL__
        rev32   v0.16b, v0.16b
#endif
        sub     x10, x10, #1
        str     q0, [x17], #16              // save round 0 key (stored as-is, not bit-sliced)

.align  4
.Lkey_loop:
        tbl     v0.16b, {v15.16b}, v1.16b   // permute key bytes into bit-slice order per .LM0
        ldr     q15, [x9], #16              // load next round key

        eor     v0.16b, v0.16b, v7.16b      // XOR in 0x63 (SubBytes' affine constant, absorbed into the key)
        // expand the round key into eight vectors: cmtst makes each
        // byte all-ones iff the corresponding key bit is set
        cmtst   v24.16b, v0.16b, v16.16b
        cmtst   v25.16b, v0.16b, v17.16b
        cmtst   v26.16b, v0.16b, v18.16b
        cmtst   v27.16b, v0.16b, v19.16b
        cmtst   v28.16b, v0.16b, v20.16b
        cmtst   v29.16b, v0.16b, v21.16b
        cmtst   v30.16b, v0.16b, v22.16b
        cmtst   v31.16b, v0.16b, v23.16b
        sub     x10, x10, #1
        st1     {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
        st1     {v28.16b-v31.16b}, [x17], #64
        cbnz    x10, .Lkey_loop

        // don't save last round key: it is handed back to the caller in v15
#ifdef __AARCH64EL__
        rev32   v15.16b, v15.16b            // convert last round key to native endianness
        adr     x11, .LM0_bigendian         // restore the documented exit value of x11
#endif
        ret
.size   _bsaes_key_convert,.-_bsaes_key_convert
1005
.globl  ossl_bsaes_cbc_encrypt
.type   ossl_bsaes_cbc_encrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
        AARCH64_VALID_CALL_TARGET
        cmp     x2, #128                    // fewer than 8 blocks: the bit-sliced
        bhs     .Lcbc_do_bsaes              // path (and its key conversion) is
        b       AES_cbc_encrypt             // skipped in favour of scalar AES
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

        stp     x29, x30, [sp, #-48]!
        stp     d8, d9, [sp, #16]
        stp     d10, d15, [sp, #32]
        lsr     x2, x2, #4                  // len in 16 byte blocks

        ldr     w15, [x3, #240]             // get # of rounds
        mov     x14, sp                     // remember sp so the key schedule can be wiped on exit

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                      // pass key
        mov     x10, x15                    // pass # of rounds
        mov     sp, x17                     // sp is sp
        bl      _bsaes_key_convert
        ldr     q6,  [sp]
        str     q15, [x17]                  // save last round key
        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        ldr     q15, [x4]                   // load IV
        b       .Lcbc_dec_loop

.align  4
.Lcbc_dec_loop:
        subs    x2, x2, #0x8
        bmi     .Lcbc_dec_loop_finish

        // load eight ciphertext blocks; the final #-7*16 post-index
        // rewinds x0 so the same data can be reloaded below for the
        // CBC chaining XOR
        ldr     q0, [x0], #16               // load input
        mov     x9, sp                      // pass the key
        ldr     q1, [x0], #16
        mov     x10, x15
        ldr     q2, [x0], #16
        ldr     q3, [x0], #16
        ldr     q4, [x0], #16
        ldr     q5, [x0], #16
        ldr     q6, [x0], #16
        ldr     q7, [x0], #-7*16

        bl      _bsaes_decrypt8

        // XOR each decrypted block with the preceding ciphertext block
        // (v15 holds the IV for the first); the interleaving below also
        // re-sequences the permuted register order _bsaes_decrypt8
        // returns its results in
        ldr     q16, [x0], #16              // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        eor     v1.16b, v1.16b, v16.16b
        str     q0, [x1], #16               // write output
        ldr     q0, [x0], #16
        str     q1, [x1], #16
        ldr     q1, [x0], #16
        eor     v1.16b, v4.16b, v1.16b
        ldr     q4, [x0], #16
        eor     v2.16b, v2.16b, v4.16b
        eor     v0.16b, v6.16b, v0.16b
        ldr     q4, [x0], #16
        str     q0, [x1], #16
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v4.16b
        ldr     q1, [x0], #16
        str     q2, [x1], #16
        ldr     q2, [x0], #16
        ldr     q15, [x0], #16              // last ciphertext block becomes the next IV
        str     q0, [x1], #16
        eor     v0.16b, v5.16b, v2.16b
        eor     v1.16b, v3.16b, v1.16b
        str     q1, [x1], #16
        str     q0, [x1], #16

        b       .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
        adds    x2, x2, #8
        beq     .Lcbc_dec_done              // no residual blocks

        // 1..7 blocks remain; load them while dispatching on the count
        ldr     q0, [x0], #16               // load input
        cmp     x2, #2
        blo     .Lcbc_dec_one
        ldr     q1, [x0], #16
        mov     x9, sp                      // pass the key
        mov     x10, x15
        beq     .Lcbc_dec_two
        ldr     q2, [x0], #16
        cmp     x2, #4
        blo     .Lcbc_dec_three
        ldr     q3, [x0], #16
        beq     .Lcbc_dec_four
        ldr     q4, [x0], #16
        cmp     x2, #6
        blo     .Lcbc_dec_five
        ldr     q5, [x0], #16
        beq     .Lcbc_dec_six
        // seven blocks: rewind x0 for the reload pass below
        ldr     q6, [x0], #-6*16

        bl      _bsaes_decrypt8

        ldr     q5, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        ldr     q10, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v5.16b
        ldr     q5, [x0], #16
        eor     v6.16b, v6.16b, v8.16b
        ldr     q15, [x0]                   // last ciphertext block -> next IV
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q6, [x1], #16
        eor     v1.16b, v3.16b, v5.16b
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_six:
        sub     x0, x0, #0x60
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q5, [x0], #16
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v3.16b
        ldr     q15, [x0]                   // last ciphertext block -> next IV
        eor     v3.16b, v6.16b, v5.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q3, [x1], #16
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_five:
        sub     x0, x0, #0x50
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q5, [x0], #16
        ldr     q7, [x0], #16
        ldr     q8, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q15, [x0]                   // last ciphertext block -> next IV
        eor     v0.16b, v1.16b, v3.16b
        eor     v1.16b, v6.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        str     q0, [x1], #16
        eor     v0.16b, v2.16b, v8.16b
        str     q1, [x1], #16
        str     q3, [x1], #16
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_four:
        sub     x0, x0, #0x40
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q3, [x0], #16
        ldr     q5, [x0], #16
        str     q0, [x1], #16               // write output
        ldr     q15, [x0]                   // last ciphertext block -> next IV
        eor     v0.16b, v1.16b, v2.16b
        eor     v1.16b, v6.16b, v3.16b
        eor     v2.16b, v4.16b, v5.16b
        str     q0, [x1], #16
        str     q1, [x1], #16
        str     q2, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_three:
        sub     x0, x0, #0x30
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q3, [x0], #16
        ldr     q15, [x0]                   // last ciphertext block -> next IV
        str     q0, [x1], #16               // write output
        eor     v0.16b, v1.16b, v2.16b
        eor     v1.16b, v6.16b, v3.16b
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_two:
        sub     x0, x0, #0x20
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16               // reload input
        eor     v0.16b, v0.16b, v15.16b     // ^= IV
        ldr     q15, [x0]                   // last ciphertext block -> next IV
        str     q0, [x1], #16               // write output
        eor     v0.16b, v1.16b, v2.16b
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_one:
        // single residual block: use scalar AES_decrypt instead of the
        // eight-block bit-sliced routine
        sub     x0, x0, #0x10
        stp     x1, x4, [sp, #-32]!         // preserve output and IV pointers across the call
        str     x14, [sp, #16]
        mov     v8.16b, v15.16b             // stash IV in v8 across AES_decrypt
        mov     v15.16b, v0.16b             // ciphertext block becomes the returned IV
        mov     x2, x3                      // pass key as third argument
        bl      AES_decrypt
        ldr     x14, [sp, #16]
        ldp     x1, x4, [sp], #32
        ldr     q0, [x1]                    // load result
        eor     v0.16b, v0.16b, v8.16b      // ^= IV
        str     q0, [x1]                    // write output

.align  4
.Lcbc_dec_done:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lcbc_dec_bzero:// wipe key schedule [if any]
        stp     q0, q1, [sp], #32
        cmp     sp, x14                     // x14 = sp before the schedule was allocated
        bne     .Lcbc_dec_bzero
        str     q15, [x4]                   // return IV
        ldp     d8, d9, [sp, #16]
        ldp     d10, d15, [sp, #32]
        ldp     x29, x30, [sp], #48
        ret
.size   ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1260
.globl  ossl_bsaes_ctr32_encrypt_blocks
.type   ossl_bsaes_ctr32_encrypt_blocks,%function
.align  4
// On entry:
//   x0 -> input text (whole 16-byte blocks)
//   x1 -> output text (whole 16-byte blocks)
//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
//   x3 -> key
//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
//   Output text filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:
        AARCH64_VALID_CALL_TARGET
        cmp     x2, #8                      // use plain AES for
        blo     .Lctr_enc_short             // small sizes

        stp     x29, x30, [sp, #-80]!
        stp     d8, d9, [sp, #16]
        stp     d10, d11, [sp, #32]
        stp     d12, d13, [sp, #48]
        stp     d14, d15, [sp, #64]

        ldr     w15, [x3, #240]             // get # of rounds
        mov     x14, sp                     // remember sp so the key schedule can be wiped on exit

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                      // pass key
        mov     x10, x15                    // pass # of rounds
        mov     sp, x17                     // sp is sp
        bl      _bsaes_key_convert
        eor     v7.16b, v7.16b, v15.16b     // fix up last round key (v7 = 0x63 from _bsaes_key_convert)
        str     q7, [x17]                   // save last round key

        ldr     q0, [x4]                    // load counter
        add     x13, x11, #.LREVM0SR-.LM0_bigendian
        ldr     q4, [sp]                    // load round0 key

        // keep a byte-reversed copy of the counter in v15 so it can be
        // bumped with 32-bit vector adds; v11-v14 hold the increments
        // 1,2,3,4 positioned in the top 32-bit lane
        movi    v8.4s, #1                   // compose 1<<96
        movi    v9.16b, #0
        rev32   v15.16b, v0.16b
        rev32   v0.16b, v0.16b
        ext     v11.16b, v9.16b, v8.16b, #4
        rev32   v4.16b, v4.16b
        add     v12.4s, v11.4s, v11.4s      // compose 2<<96
        str     q4, [sp]                    // save adjusted round0 key
        add     v13.4s, v11.4s, v12.4s      // compose 3<<96
        add     v14.4s, v12.4s, v12.4s      // compose 4<<96
        b       .Lctr_enc_loop

.align  4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter; v0-v7 become the eight
        // counter values for this batch

        add     v1.4s, v15.4s, v11.4s       // +1
        add     x9, sp, #0x10               // pass next round key
        add     v2.4s, v15.4s, v12.4s       // +2
        ldr     q9, [x13]                   // .LREVM0SR
        ldr     q8, [sp]                    // load round0 key
        add     v3.4s, v15.4s, v13.4s       // +3
        mov     x10, x15                    // pass rounds
        sub     x11, x13, #.LREVM0SR-.LSR   // pass constants
        add     v6.4s, v2.4s, v14.4s
        add     v4.4s, v15.4s, v14.4s       // +4
        add     v7.4s, v3.4s, v14.4s
        add     v15.4s, v4.4s, v14.4s       // next counter
        add     v5.4s, v1.4s, v14.4s

        bl      _bsaes_encrypt8_alt

        subs    x2, x2, #8
        blo     .Lctr_enc_loop_done

        // full batch: XOR eight input blocks with the keystream; the
        // encrypted counters come back in the permuted register order
        // v0,v1,v4,v6,v3,v7,v2,v5
        ldr     q16, [x0], #16
        ldr     q17, [x0], #16
        eor     v1.16b, v1.16b, v17.16b
        ldr     q17, [x0], #16
        eor     v0.16b, v0.16b, v16.16b
        eor     v4.16b, v4.16b, v17.16b
        str     q0, [x1], #16
        ldr     q16, [x0], #16
        str     q1, [x1], #16
        mov     v0.16b, v15.16b             // v0 = next counter (the +0 lane of the next batch)
        str     q4, [x1], #16
        ldr     q1, [x0], #16
        eor     v4.16b, v6.16b, v16.16b
        eor     v1.16b, v3.16b, v1.16b
        ldr     q3, [x0], #16
        eor     v3.16b, v7.16b, v3.16b
        ldr     q6, [x0], #16
        eor     v2.16b, v2.16b, v6.16b
        ldr     q6, [x0], #16
        eor     v5.16b, v5.16b, v6.16b
        str     q4, [x1], #16
        str     q1, [x1], #16
        str     q3, [x1], #16
        str     q2, [x1], #16
        str     q5, [x1], #16

        bne     .Lctr_enc_loop
        b       .Lctr_enc_done

.align  4
.Lctr_enc_loop_done:
        // 1..7 blocks remain; consume keystream lanes in output order,
        // stopping once the residual count is reached
        add     x2, x2, #8
        ldr     q16, [x0], #16              // load input
        eor     v0.16b, v0.16b, v16.16b
        str     q0, [x1], #16               // write output
        cmp     x2, #2
        blo     .Lctr_enc_done
        ldr     q17, [x0], #16
        eor     v1.16b, v1.16b, v17.16b
        str     q1, [x1], #16
        beq     .Lctr_enc_done
        ldr     q18, [x0], #16
        eor     v4.16b, v4.16b, v18.16b
        str     q4, [x1], #16
        cmp     x2, #4
        blo     .Lctr_enc_done
        ldr     q19, [x0], #16
        eor     v6.16b, v6.16b, v19.16b
        str     q6, [x1], #16
        beq     .Lctr_enc_done
        ldr     q20, [x0], #16
        eor     v3.16b, v3.16b, v20.16b
        str     q3, [x1], #16
        cmp     x2, #6
        blo     .Lctr_enc_done
        ldr     q21, [x0], #16
        eor     v7.16b, v7.16b, v21.16b
        str     q7, [x1], #16
        beq     .Lctr_enc_done
        ldr     q22, [x0]
        eor     v2.16b, v2.16b, v22.16b
        str     q2, [x1], #16

.Lctr_enc_done:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
        stp     q0, q1, [sp], #32
        cmp     sp, x14                     // x14 = sp before the schedule was allocated
        bne     .Lctr_enc_bzero

        ldp     d8, d9, [sp, #16]
        ldp     d10, d11, [sp, #32]
        ldp     d12, d13, [sp, #48]
        ldp     d14, d15, [sp, #64]
        ldp     x29, x30, [sp], #80
        ret

.Lctr_enc_short:
        // fewer than 8 blocks: encrypt the counter one block at a time
        // with scalar AES_encrypt, using stack scratch space at
        // sp+80 (counter) and sp+64 (keystream)
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        str     x23, [sp, #48]

        mov     x19, x0                     // copy arguments
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        ldr     w23, [x4, #12]              // load counter .LSW
        ldr     q1, [x4]                    // load whole counter value
#ifdef __AARCH64EL__
        rev     w23, w23
#endif
        str     q1, [sp, #80]               // copy counter value

.Lctr_enc_short_loop:
        add     x0, sp, #80                 // input counter value
        add     x1, sp, #64                 // output on the stack
        mov     x2, x22                     // key

        bl      AES_encrypt

        ldr     q0, [x19], #16              // load input
        ldr     q1, [sp, #64]               // load encrypted counter
        add     x23, x23, #1                // bump low 32-bit counter word
#ifdef __AARCH64EL__
        rev     w0, w23
        str     w0, [sp, #80+12]            // next counter value
#else
        str     w23, [sp, #80+12]           // next counter value
#endif
        eor     v0.16b, v0.16b, v1.16b
        str     q0, [x20], #16              // store output
        subs    x21, x21, #1
        bne     .Lctr_enc_short_loop

        movi    v0.16b, #0                  // wipe the stack copies of
        movi    v1.16b, #0                  // counter and keystream
        stp     q0, q1, [sp, #64]

        ldr     x23, [sp, #48]
        ldp     x21, x22, [sp, #32]
        ldp     x19, x20, [sp, #16]
        ldp     x29, x30, [sp], #96
        ret
.size   ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
1465
1466.globl  ossl_bsaes_xts_encrypt
1467.type   ossl_bsaes_xts_encrypt,%function
1468.align  4
1469// On entry:
1470//   x0 -> input plaintext
1471//   x1 -> output ciphertext
1472//   x2 -> length of text in bytes (must be at least 16)
1473//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
1474//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
1475//   x5 -> 16-byte initial vector (typically, sector number)
1476// On exit:
1477//   Output ciphertext filled in
1478//   No output registers, usual AAPCS64 register preservation
1479ossl_bsaes_xts_encrypt:
1480        AARCH64_VALID_CALL_TARGET
1481        // Stack layout:
1482        // sp ->
1483        //        nrounds*128-96 bytes: key schedule
1484        // x19 ->
1485        //        16 bytes: frame record
1486        //        4*16 bytes: tweak storage across _bsaes_encrypt8
1487        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
1488        //        8*8 bytes: storage for 8 callee-saved SIMD registers
1489        stp     x29, x30, [sp, #-192]!
1490        stp     x19, x20, [sp, #80]
1491        stp     x21, x22, [sp, #96]
1492        str     x23, [sp, #112]
1493        stp     d8, d9, [sp, #128]
1494        stp     d10, d11, [sp, #144]
1495        stp     d12, d13, [sp, #160]
1496        stp     d14, d15, [sp, #176]
1497
1498        mov     x19, sp
1499        mov     x20, x0
1500        mov     x21, x1
1501        mov     x22, x2
1502        mov     x23, x3
1503
1504        // generate initial tweak
1505        sub     sp, sp, #16
1506        mov     x0, x5                      // iv[]
1507        mov     x1, sp
1508        mov     x2, x4                      // key2
1509        bl      AES_encrypt
1510        ldr     q11, [sp], #16
1511
1512        ldr     w1, [x23, #240]             // get # of rounds
1513        // allocate the key schedule on the stack
1514        add     x17, sp, #96
1515        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
1516
1517        // populate the key schedule
1518        mov     x9, x23                     // pass key
1519        mov     x10, x1                     // pass # of rounds
1520        mov     sp, x17
1521        bl      _bsaes_key_convert
1522        eor     v15.16b, v15.16b, v7.16b    // fix up last round key
1523        str     q15, [x17]                  // save last round key
1524
1525        subs    x22, x22, #0x80
1526        blo     .Lxts_enc_short
1527        b       .Lxts_enc_loop
1528
1529.align  4
1530.Lxts_enc_loop:
1531        ldr     q8, .Lxts_magic
1532        mov     x10, x1                     // pass rounds
1533        add     x2, x19, #16
1534        ldr     q0, [x20], #16
1535        sshr    v1.2d, v11.2d, #63
1536        mov     x9, sp                      // pass key schedule
1537        ldr     q6, .Lxts_magic+16
1538        add     v2.2d, v11.2d, v11.2d
1539        cmtst   v3.2d, v11.2d, v6.2d
1540        and     v1.16b, v1.16b, v8.16b
1541        ext     v1.16b, v1.16b, v1.16b, #8
1542        and     v3.16b, v3.16b, v8.16b
1543        ldr     q4, [x20], #16
1544        eor     v12.16b, v2.16b, v1.16b
1545        eor     v1.16b, v4.16b, v12.16b
1546        eor     v0.16b, v0.16b, v11.16b
1547        cmtst   v2.2d, v12.2d, v6.2d
1548        add     v4.2d, v12.2d, v12.2d
1549        add     x0, x19, #16
1550        ext     v3.16b, v3.16b, v3.16b, #8
1551        and     v2.16b, v2.16b, v8.16b
1552        eor     v13.16b, v4.16b, v3.16b
1553        ldr     q3, [x20], #16
1554        ext     v4.16b, v2.16b, v2.16b, #8
1555        eor     v2.16b, v3.16b, v13.16b
1556        ldr     q3, [x20], #16
1557        add     v5.2d, v13.2d, v13.2d
1558        cmtst   v7.2d, v13.2d, v6.2d
1559        and     v7.16b, v7.16b, v8.16b
1560        ldr     q9, [x20], #16
1561        ext     v7.16b, v7.16b, v7.16b, #8
1562        ldr     q10, [x20], #16
1563        eor     v14.16b, v5.16b, v4.16b
1564        ldr     q16, [x20], #16
1565        add     v4.2d, v14.2d, v14.2d
1566        eor     v3.16b, v3.16b, v14.16b
1567        eor     v15.16b, v4.16b, v7.16b
1568        add     v5.2d, v15.2d, v15.2d
1569        ldr     q7, [x20], #16
1570        cmtst   v4.2d, v14.2d, v6.2d
1571        and     v17.16b, v4.16b, v8.16b
1572        cmtst   v18.2d, v15.2d, v6.2d
1573        eor     v4.16b, v9.16b, v15.16b
1574        ext     v9.16b, v17.16b, v17.16b, #8
1575        eor     v9.16b, v5.16b, v9.16b
1576        add     v17.2d, v9.2d, v9.2d
1577        and     v18.16b, v18.16b, v8.16b
1578        eor     v5.16b, v10.16b, v9.16b
1579        str     q9, [x2], #16
1580        ext     v10.16b, v18.16b, v18.16b, #8
1581        cmtst   v9.2d, v9.2d, v6.2d
1582        and     v9.16b, v9.16b, v8.16b
1583        eor     v10.16b, v17.16b, v10.16b
1584        cmtst   v17.2d, v10.2d, v6.2d
1585        eor     v6.16b, v16.16b, v10.16b
1586        str     q10, [x2], #16
1587        ext     v9.16b, v9.16b, v9.16b, #8
1588        add     v10.2d, v10.2d, v10.2d
1589        eor     v9.16b, v10.16b, v9.16b
1590        str     q9, [x2], #16
1591        eor     v7.16b, v7.16b, v9.16b
1592        add     v9.2d, v9.2d, v9.2d
1593        and     v8.16b, v17.16b, v8.16b
1594        ext     v8.16b, v8.16b, v8.16b, #8
1595        eor     v8.16b, v9.16b, v8.16b
1596        str     q8, [x2]                    // next round tweak
1597
1598        bl      _bsaes_encrypt8
1599
1600        ldr     q8, [x0], #16
1601        eor     v0.16b, v0.16b, v11.16b
1602        eor     v1.16b, v1.16b, v12.16b
1603        ldr     q9, [x0], #16
1604        eor     v4.16b, v4.16b, v13.16b
1605        eor     v6.16b, v6.16b, v14.16b
1606        ldr     q10, [x0], #16
1607        eor     v3.16b, v3.16b, v15.16b
1608        subs    x22, x22, #0x80
1609        str     q0, [x21], #16
1610        ldr     q11, [x0]                   // next round tweak
1611        str     q1, [x21], #16
1612        eor     v0.16b, v7.16b, v8.16b
1613        eor     v1.16b, v2.16b, v9.16b
1614        str     q4, [x21], #16
1615        eor     v2.16b, v5.16b, v10.16b
1616        str     q6, [x21], #16
1617        str     q3, [x21], #16
1618        str     q0, [x21], #16
1619        str     q1, [x21], #16
1620        str     q2, [x21], #16
1621        bpl     .Lxts_enc_loop
1622
1623.Lxts_enc_short:
1624        adds    x22, x22, #0x70
1625        bmi     .Lxts_enc_done
1626
1627        ldr     q8, .Lxts_magic
1628        sshr    v1.2d, v11.2d, #63
1629        add     v2.2d, v11.2d, v11.2d
1630        ldr     q9, .Lxts_magic+16
1631        subs    x22, x22, #0x10
1632        ldr     q0, [x20], #16
1633        and     v1.16b, v1.16b, v8.16b
1634        cmtst   v3.2d, v11.2d, v9.2d
1635        ext     v1.16b, v1.16b, v1.16b, #8
1636        and     v3.16b, v3.16b, v8.16b
1637        eor     v12.16b, v2.16b, v1.16b
1638        ext     v1.16b, v3.16b, v3.16b, #8
1639        add     v2.2d, v12.2d, v12.2d
1640        cmtst   v3.2d, v12.2d, v9.2d
1641        eor     v13.16b, v2.16b, v1.16b
1642        and     v22.16b, v3.16b, v8.16b
1643        bmi     .Lxts_enc_1
1644
1645        ext     v2.16b, v22.16b, v22.16b, #8
1646        add     v3.2d, v13.2d, v13.2d
1647        ldr     q1, [x20], #16
1648        cmtst   v4.2d, v13.2d, v9.2d
1649        subs    x22, x22, #0x10
1650        eor     v14.16b, v3.16b, v2.16b
1651        and     v23.16b, v4.16b, v8.16b
1652        bmi     .Lxts_enc_2
1653
1654        ext     v3.16b, v23.16b, v23.16b, #8
1655        add     v4.2d, v14.2d, v14.2d
1656        ldr     q2, [x20], #16
1657        cmtst   v5.2d, v14.2d, v9.2d
1658        eor     v0.16b, v0.16b, v11.16b
1659        subs    x22, x22, #0x10
1660        eor     v15.16b, v4.16b, v3.16b
1661        and     v24.16b, v5.16b, v8.16b
1662        bmi     .Lxts_enc_3
1663
1664        ext     v4.16b, v24.16b, v24.16b, #8
1665        add     v5.2d, v15.2d, v15.2d
1666        ldr     q3, [x20], #16
1667        cmtst   v6.2d, v15.2d, v9.2d
1668        eor     v1.16b, v1.16b, v12.16b
1669        subs    x22, x22, #0x10
1670        eor     v16.16b, v5.16b, v4.16b
1671        and     v25.16b, v6.16b, v8.16b
1672        bmi     .Lxts_enc_4
1673
1674        ext     v5.16b, v25.16b, v25.16b, #8
1675        add     v6.2d, v16.2d, v16.2d
1676        add     x0, x19, #16
1677        cmtst   v7.2d, v16.2d, v9.2d
1678        ldr     q4, [x20], #16
1679        eor     v2.16b, v2.16b, v13.16b
1680        str     q16, [x0], #16
1681        subs    x22, x22, #0x10
1682        eor     v17.16b, v6.16b, v5.16b
1683        and     v26.16b, v7.16b, v8.16b
1684        bmi     .Lxts_enc_5
1685
1686        ext     v7.16b, v26.16b, v26.16b, #8
1687        add     v18.2d, v17.2d, v17.2d
1688        ldr     q5, [x20], #16
1689        eor     v3.16b, v3.16b, v14.16b
1690        str     q17, [x0], #16
1691        subs    x22, x22, #0x10
1692        eor     v18.16b, v18.16b, v7.16b
1693        bmi     .Lxts_enc_6
1694
1695        ldr     q6, [x20], #16
1696        eor     v4.16b, v4.16b, v15.16b
1697        eor     v5.16b, v5.16b, v16.16b
1698        str     q18, [x0]                   // next round tweak
1699        mov     x9, sp                      // pass key schedule
1700        mov     x10, x1
1701        add     x0, x19, #16
1702        sub     x22, x22, #0x10
1703        eor     v6.16b, v6.16b, v17.16b
1704
1705        bl      _bsaes_encrypt8
1706
1707        ldr     q16, [x0], #16
1708        eor     v0.16b, v0.16b, v11.16b
1709        eor     v1.16b, v1.16b, v12.16b
1710        ldr     q17, [x0], #16
1711        eor     v4.16b, v4.16b, v13.16b
1712        eor     v6.16b, v6.16b, v14.16b
1713        eor     v3.16b, v3.16b, v15.16b
1714        ldr     q11, [x0]                   // next round tweak
1715        str     q0, [x21], #16
1716        str     q1, [x21], #16
1717        eor     v0.16b, v7.16b, v16.16b
1718        eor     v1.16b, v2.16b, v17.16b
1719        str     q4, [x21], #16
1720        str     q6, [x21], #16
1721        str     q3, [x21], #16
1722        str     q0, [x21], #16
1723        str     q1, [x21], #16
1724        b       .Lxts_enc_done
1725
1726.align  4
1727.Lxts_enc_6:
1728        eor     v4.16b, v4.16b, v15.16b
1729        eor     v5.16b, v5.16b, v16.16b
1730        mov     x9, sp                      // pass key schedule
1731        mov     x10, x1                     // pass rounds
1732        add     x0, x19, #16
1733
1734        bl      _bsaes_encrypt8
1735
1736        ldr     q16, [x0], #16
1737        eor     v0.16b, v0.16b, v11.16b
1738        eor     v1.16b, v1.16b, v12.16b
1739        eor     v4.16b, v4.16b, v13.16b
1740        eor     v6.16b, v6.16b, v14.16b
1741        ldr     q11, [x0]                   // next round tweak
1742        eor     v3.16b, v3.16b, v15.16b
1743        str     q0, [x21], #16
1744        str     q1, [x21], #16
1745        eor     v0.16b, v7.16b, v16.16b
1746        str     q4, [x21], #16
1747        str     q6, [x21], #16
1748        str     q3, [x21], #16
1749        str     q0, [x21], #16
1750        b       .Lxts_enc_done
1751
1752.align  4
1753.Lxts_enc_5:
1754        eor     v3.16b, v3.16b, v14.16b
1755        eor     v4.16b, v4.16b, v15.16b
1756        mov     x9, sp                      // pass key schedule
1757        mov     x10, x1                     // pass rounds
1758        add     x0, x19, #16
1759
1760        bl      _bsaes_encrypt8
1761
1762        eor     v0.16b, v0.16b, v11.16b
1763        eor     v1.16b, v1.16b, v12.16b
1764        ldr     q11, [x0]                   // next round tweak
1765        eor     v4.16b, v4.16b, v13.16b
1766        eor     v6.16b, v6.16b, v14.16b
1767        eor     v3.16b, v3.16b, v15.16b
1768        str     q0, [x21], #16
1769        str     q1, [x21], #16
1770        str     q4, [x21], #16
1771        str     q6, [x21], #16
1772        str     q3, [x21], #16
1773        b       .Lxts_enc_done
1774
1775.align  4
1776.Lxts_enc_4:
1777        eor     v2.16b, v2.16b, v13.16b
1778        eor     v3.16b, v3.16b, v14.16b
1779        mov     x9, sp                      // pass key schedule
1780        mov     x10, x1                     // pass rounds
1781        add     x0, x19, #16
1782
1783        bl      _bsaes_encrypt8
1784
1785        eor     v0.16b, v0.16b, v11.16b
1786        eor     v1.16b, v1.16b, v12.16b
1787        eor     v4.16b, v4.16b, v13.16b
1788        eor     v6.16b, v6.16b, v14.16b
1789        mov     v11.16b, v15.16b            // next round tweak
1790        str     q0, [x21], #16
1791        str     q1, [x21], #16
1792        str     q4, [x21], #16
1793        str     q6, [x21], #16
1794        b       .Lxts_enc_done
1795
1796.align  4
1797.Lxts_enc_3:
1798        eor     v1.16b, v1.16b, v12.16b
1799        eor     v2.16b, v2.16b, v13.16b
1800        mov     x9, sp                      // pass key schedule
1801        mov     x10, x1                     // pass rounds
1802        add     x0, x19, #16
1803
1804        bl      _bsaes_encrypt8
1805
1806        eor     v0.16b, v0.16b, v11.16b
1807        eor     v1.16b, v1.16b, v12.16b
1808        eor     v4.16b, v4.16b, v13.16b
1809        mov     v11.16b, v14.16b            // next round tweak
1810        str     q0, [x21], #16
1811        str     q1, [x21], #16
1812        str     q4, [x21], #16
1813        b       .Lxts_enc_done
1814
1815.align  4
1816.Lxts_enc_2:
1817        eor     v0.16b, v0.16b, v11.16b
1818        eor     v1.16b, v1.16b, v12.16b
1819        mov     x9, sp                      // pass key schedule
1820        mov     x10, x1                     // pass rounds
1821        add     x0, x19, #16
1822
1823        bl      _bsaes_encrypt8
1824
1825        eor     v0.16b, v0.16b, v11.16b
1826        eor     v1.16b, v1.16b, v12.16b
1827        mov     v11.16b, v13.16b            // next round tweak
1828        str     q0, [x21], #16
1829        str     q1, [x21], #16
1830        b       .Lxts_enc_done
1831
1832.align  4
1833.Lxts_enc_1:
1834        eor     v0.16b, v0.16b, v11.16b
1835        sub     x0, sp, #16
1836        sub     x1, sp, #16
1837        mov     x2, x23
1838        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
1839        mov     v14.d[0], v12.d[1]
1840        str     q0, [sp, #-16]!
1841
1842        bl      AES_encrypt
1843
1844        ldr     q0, [sp], #16
1845        trn1    v13.2d, v11.2d, v13.2d
1846        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
1847        eor     v0.16b, v0.16b, v13.16b
1848        str     q0, [x21], #16
1849
1850.Lxts_enc_done:
1851        adds    x22, x22, #0x10
1852        beq     .Lxts_enc_ret
1853
1854        sub     x6, x21, #0x10
1855        // Penultimate plaintext block produces final ciphertext part-block
1856        // plus remaining part of final plaintext block. Move ciphertext part
1857        // to final position and reuse penultimate ciphertext block buffer to
1858        // construct final plaintext block
1859.Lxts_enc_steal:
1860        ldrb    w0, [x20], #1
1861        ldrb    w1, [x21, #-0x10]
1862        strb    w0, [x21, #-0x10]
1863        strb    w1, [x21], #1
1864
1865        subs    x22, x22, #1
1866        bhi     .Lxts_enc_steal
1867
1868        // Finally encrypt the penultimate ciphertext block using the
1869        // last tweak
1870        ldr     q0, [x6]
1871        eor     v0.16b, v0.16b, v11.16b
1872        str     q0, [sp, #-16]!
1873        mov     x0, sp
1874        mov     x1, sp
1875        mov     x2, x23
1876        mov     x21, x6
1877        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
1878
1879        bl      AES_encrypt
1880
1881        trn1    v11.2d, v11.2d, v13.2d
1882        ldr     q0, [sp], #16
1883        eor     v0.16b, v0.16b, v11.16b
1884        str     q0, [x21]
1885
1886.Lxts_enc_ret:
1887
1888        movi    v0.16b, #0
1889        movi    v1.16b, #0
1890.Lxts_enc_bzero: // wipe key schedule
1891        stp     q0, q1, [sp], #32
1892        cmp     sp, x19
1893        bne     .Lxts_enc_bzero
1894
1895        ldp     x19, x20, [sp, #80]
1896        ldp     x21, x22, [sp, #96]
1897        ldr     x23, [sp, #112]
1898        ldp     d8, d9, [sp, #128]
1899        ldp     d10, d11, [sp, #144]
1900        ldp     d12, d13, [sp, #160]
1901        ldp     d14, d15, [sp, #176]
1902        ldp     x29, x30, [sp], #192
1903        ret
1904.size   ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
1905
1906// The assembler doesn't seem capable of de-duplicating these when expressed
1907// using `ldr qd,=` syntax, so assign a symbolic address
// First 16 bytes ({1, 0x87}): mask ANDed with the sign-replicated top bits
// (sshr #63) so that, after the halves are swapped with ext #8, doubling a
// tweak feeds back the XTS polynomial 0x87 — i.e. reduction modulo
// x^128 + x^7 + x^2 + x + 1.
// Second 16 bytes ({2^62, 2^62}): used with cmtst to detect bit 62, the bit
// that will carry out on the *next* doubling; this lets the XTS loops
// pipeline consecutive tweak doublings.
1908.align  5
1909.Lxts_magic:
1910.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000
1911
1912.globl  ossl_bsaes_xts_decrypt
1913.type   ossl_bsaes_xts_decrypt,%function
1914.align  4
1915// On entry:
1916//   x0 -> input ciphertext
1917//   x1 -> output plaintext
1918//   x2 -> length of text in bytes (must be at least 16)
1919//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
1920//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
1921//   x5 -> 16-byte initial vector (typically, sector number)
1922// On exit:
1923//   Output plaintext filled in
1924//   No output registers, usual AAPCS64 register preservation
1925ossl_bsaes_xts_decrypt:
1926        AARCH64_VALID_CALL_TARGET
1927        // Stack layout:
1928        // sp ->
1929        //        nrounds*128-96 bytes: key schedule
1930        // x19 ->
1931        //        16 bytes: frame record
1932        //        4*16 bytes: tweak storage across _bsaes_decrypt8
1933        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
1934        //        8*8 bytes: storage for 8 callee-saved SIMD registers
1935        stp     x29, x30, [sp, #-192]!
1936        stp     x19, x20, [sp, #80]
1937        stp     x21, x22, [sp, #96]
1938        str     x23, [sp, #112]
1939        stp     d8, d9, [sp, #128]
1940        stp     d10, d11, [sp, #144]
1941        stp     d12, d13, [sp, #160]
1942        stp     d14, d15, [sp, #176]
1943
        // Preserve the arguments in callee-saved registers so they survive
        // the AES_encrypt / _bsaes_* calls below.
1944        mov     x19, sp
1945        mov     x20, x0                     // input pointer
1946        mov     x21, x1                     // output pointer
1947        mov     x22, x2                     // bytes remaining
1948        mov     x23, x3                     // key1
1949
1950        // generate initial tweak
1951        sub     sp, sp, #16
1952        mov     x0, x5                      // iv[]
1953        mov     x1, sp
1954        mov     x2, x4                      // key2
1955        bl      AES_encrypt
1956        ldr     q11, [sp], #16
1957
1958        ldr     w1, [x23, #240]             // get # of rounds
1959        // allocate the key schedule on the stack
1960        add     x17, sp, #96
1961        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
1962
1963        // populate the key schedule
1964        mov     x9, x23                     // pass key
1965        mov     x10, x1                     // pass # of rounds
1966        mov     sp, x17
1967        bl      _bsaes_key_convert
1968        ldr     q6,  [sp]
1969        str     q15, [x17]                  // save last round key
1970        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
1971        str     q6, [sp]
1972
        // If the length is not a multiple of 16, hold back one extra full
        // block for the ciphertext-stealing tail at .Lxts_dec_done.
        // x30 (lr) is free as a scratch register here; it is restored from
        // the frame record on return.
1973        sub     x30, x22, #0x10
1974        tst     x22, #0xf                   // if not multiple of 16
1975        csel    x22, x30, x22, ne           // subtract another 16 bytes
1976        subs    x22, x22, #0x80
1977
1978        blo     .Lxts_dec_short
1979        b       .Lxts_dec_loop
1980
        // Bulk loop: derive eight consecutive tweaks from v11 (the last four
        // are spilled to the tweak area at x19+16), XOR them into eight
        // ciphertext blocks, decrypt, then XOR the same tweaks into the
        // output.  See .Lxts_magic for the tweak-doubling masks.
1981.align  4
1982.Lxts_dec_loop:
1983        ldr     q8, .Lxts_magic
1984        mov     x10, x1                     // pass rounds
1985        add     x2, x19, #16
1986        ldr     q0, [x20], #16
1987        sshr    v1.2d, v11.2d, #63
1988        mov     x9, sp                      // pass key schedule
1989        ldr     q6, .Lxts_magic+16
1990        add     v2.2d, v11.2d, v11.2d
1991        cmtst   v3.2d, v11.2d, v6.2d
1992        and     v1.16b, v1.16b, v8.16b
1993        ext     v1.16b, v1.16b, v1.16b, #8
1994        and     v3.16b, v3.16b, v8.16b
1995        ldr     q4, [x20], #16
1996        eor     v12.16b, v2.16b, v1.16b
1997        eor     v1.16b, v4.16b, v12.16b
1998        eor     v0.16b, v0.16b, v11.16b
1999        cmtst   v2.2d, v12.2d, v6.2d
2000        add     v4.2d, v12.2d, v12.2d
2001        add     x0, x19, #16
2002        ext     v3.16b, v3.16b, v3.16b, #8
2003        and     v2.16b, v2.16b, v8.16b
2004        eor     v13.16b, v4.16b, v3.16b
2005        ldr     q3, [x20], #16
2006        ext     v4.16b, v2.16b, v2.16b, #8
2007        eor     v2.16b, v3.16b, v13.16b
2008        ldr     q3, [x20], #16
2009        add     v5.2d, v13.2d, v13.2d
2010        cmtst   v7.2d, v13.2d, v6.2d
2011        and     v7.16b, v7.16b, v8.16b
2012        ldr     q9, [x20], #16
2013        ext     v7.16b, v7.16b, v7.16b, #8
2014        ldr     q10, [x20], #16
2015        eor     v14.16b, v5.16b, v4.16b
2016        ldr     q16, [x20], #16
2017        add     v4.2d, v14.2d, v14.2d
2018        eor     v3.16b, v3.16b, v14.16b
2019        eor     v15.16b, v4.16b, v7.16b
2020        add     v5.2d, v15.2d, v15.2d
2021        ldr     q7, [x20], #16
2022        cmtst   v4.2d, v14.2d, v6.2d
2023        and     v17.16b, v4.16b, v8.16b
2024        cmtst   v18.2d, v15.2d, v6.2d
2025        eor     v4.16b, v9.16b, v15.16b
2026        ext     v9.16b, v17.16b, v17.16b, #8
2027        eor     v9.16b, v5.16b, v9.16b
2028        add     v17.2d, v9.2d, v9.2d
2029        and     v18.16b, v18.16b, v8.16b
2030        eor     v5.16b, v10.16b, v9.16b
2031        str     q9, [x2], #16
2032        ext     v10.16b, v18.16b, v18.16b, #8
2033        cmtst   v9.2d, v9.2d, v6.2d
2034        and     v9.16b, v9.16b, v8.16b
2035        eor     v10.16b, v17.16b, v10.16b
2036        cmtst   v17.2d, v10.2d, v6.2d
2037        eor     v6.16b, v16.16b, v10.16b
2038        str     q10, [x2], #16
2039        ext     v9.16b, v9.16b, v9.16b, #8
2040        add     v10.2d, v10.2d, v10.2d
2041        eor     v9.16b, v10.16b, v9.16b
2042        str     q9, [x2], #16
2043        eor     v7.16b, v7.16b, v9.16b
2044        add     v9.2d, v9.2d, v9.2d
2045        and     v8.16b, v17.16b, v8.16b
2046        ext     v8.16b, v8.16b, v8.16b, #8
2047        eor     v8.16b, v9.16b, v8.16b
2048        str     q8, [x2]                    // next round tweak
2049
2050        bl      _bsaes_decrypt8
2051
        // XOR tweaks back into the decrypted blocks; the spilled tweaks are
        // reloaded from the tweak area at x0 (= x19+16).
2052        eor     v6.16b, v6.16b, v13.16b
2053        eor     v0.16b, v0.16b, v11.16b
2054        ldr     q8, [x0], #16
2055        eor     v7.16b, v7.16b, v8.16b
2056        str     q0, [x21], #16
2057        eor     v0.16b, v1.16b, v12.16b
2058        ldr     q1, [x0], #16
2059        eor     v1.16b, v3.16b, v1.16b
2060        subs    x22, x22, #0x80
2061        eor     v2.16b, v2.16b, v15.16b
2062        eor     v3.16b, v4.16b, v14.16b
2063        ldr     q4, [x0], #16
2064        str     q0, [x21], #16
2065        ldr     q11, [x0]                   // next round tweak
2066        eor     v0.16b, v5.16b, v4.16b
2067        str     q6, [x21], #16
2068        str     q3, [x21], #16
2069        str     q2, [x21], #16
2070        str     q7, [x21], #16
2071        str     q1, [x21], #16
2072        str     q0, [x21], #16
2073        bpl     .Lxts_dec_loop
2074
        // Fewer than eight full blocks remain.  Rebias the (negative)
        // residual count so each "subs x22, x22, #0x10" below accounts for
        // one more 16-byte block, branching to the matching tail path as
        // soon as the count goes negative.
2075.Lxts_dec_short:
2076        adds    x22, x22, #0x70
2077        bmi     .Lxts_dec_done
2078
2079        ldr     q8, .Lxts_magic
2080        sshr    v1.2d, v11.2d, #63
2081        add     v2.2d, v11.2d, v11.2d
2082        ldr     q9, .Lxts_magic+16
2083        subs    x22, x22, #0x10
2084        ldr     q0, [x20], #16
2085        and     v1.16b, v1.16b, v8.16b
2086        cmtst   v3.2d, v11.2d, v9.2d
2087        ext     v1.16b, v1.16b, v1.16b, #8
2088        and     v3.16b, v3.16b, v8.16b
2089        eor     v12.16b, v2.16b, v1.16b
2090        ext     v1.16b, v3.16b, v3.16b, #8
2091        add     v2.2d, v12.2d, v12.2d
2092        cmtst   v3.2d, v12.2d, v9.2d
2093        eor     v13.16b, v2.16b, v1.16b
2094        and     v22.16b, v3.16b, v8.16b
2095        bmi     .Lxts_dec_1
2096
2097        ext     v2.16b, v22.16b, v22.16b, #8
2098        add     v3.2d, v13.2d, v13.2d
2099        ldr     q1, [x20], #16
2100        cmtst   v4.2d, v13.2d, v9.2d
2101        subs    x22, x22, #0x10
2102        eor     v14.16b, v3.16b, v2.16b
2103        and     v23.16b, v4.16b, v8.16b
2104        bmi     .Lxts_dec_2
2105
2106        ext     v3.16b, v23.16b, v23.16b, #8
2107        add     v4.2d, v14.2d, v14.2d
2108        ldr     q2, [x20], #16
2109        cmtst   v5.2d, v14.2d, v9.2d
2110        eor     v0.16b, v0.16b, v11.16b
2111        subs    x22, x22, #0x10
2112        eor     v15.16b, v4.16b, v3.16b
2113        and     v24.16b, v5.16b, v8.16b
2114        bmi     .Lxts_dec_3
2115
2116        ext     v4.16b, v24.16b, v24.16b, #8
2117        add     v5.2d, v15.2d, v15.2d
2118        ldr     q3, [x20], #16
2119        cmtst   v6.2d, v15.2d, v9.2d
2120        eor     v1.16b, v1.16b, v12.16b
2121        subs    x22, x22, #0x10
2122        eor     v16.16b, v5.16b, v4.16b
2123        and     v25.16b, v6.16b, v8.16b
2124        bmi     .Lxts_dec_4
2125
2126        ext     v5.16b, v25.16b, v25.16b, #8
2127        add     v6.2d, v16.2d, v16.2d
2128        add     x0, x19, #16
2129        cmtst   v7.2d, v16.2d, v9.2d
2130        ldr     q4, [x20], #16
2131        eor     v2.16b, v2.16b, v13.16b
2132        str     q16, [x0], #16
2133        subs    x22, x22, #0x10
2134        eor     v17.16b, v6.16b, v5.16b
2135        and     v26.16b, v7.16b, v8.16b
2136        bmi     .Lxts_dec_5
2137
2138        ext     v7.16b, v26.16b, v26.16b, #8
2139        add     v18.2d, v17.2d, v17.2d
2140        ldr     q5, [x20], #16
2141        eor     v3.16b, v3.16b, v14.16b
2142        str     q17, [x0], #16
2143        subs    x22, x22, #0x10
2144        eor     v18.16b, v18.16b, v7.16b
2145        bmi     .Lxts_dec_6
2146
        // Fall-through: exactly seven full blocks remained.
2147        ldr     q6, [x20], #16
2148        eor     v4.16b, v4.16b, v15.16b
2149        eor     v5.16b, v5.16b, v16.16b
2150        str     q18, [x0]                   // next round tweak
2151        mov     x9, sp                      // pass key schedule
2152        mov     x10, x1                     // pass rounds
2153        add     x0, x19, #16
2154        sub     x22, x22, #0x10
2155        eor     v6.16b, v6.16b, v17.16b
2156
2157        bl      _bsaes_decrypt8
2158
2159        ldr     q16, [x0], #16
2160        eor     v0.16b, v0.16b, v11.16b
2161        eor     v1.16b, v1.16b, v12.16b
2162        ldr     q17, [x0], #16
2163        eor     v6.16b, v6.16b, v13.16b
2164        eor     v4.16b, v4.16b, v14.16b
2165        eor     v2.16b, v2.16b, v15.16b
2166        ldr     q11, [x0]                   // next round tweak
2167        str     q0, [x21], #16
2168        str     q1, [x21], #16
2169        eor     v0.16b, v7.16b, v16.16b
2170        eor     v1.16b, v3.16b, v17.16b
2171        str     q6, [x21], #16
2172        str     q4, [x21], #16
2173        str     q2, [x21], #16
2174        str     q0, [x21], #16
2175        str     q1, [x21], #16
2176        b       .Lxts_dec_done
2177
2178.align  4
2179.Lxts_dec_6:
        // Tail path: exactly six full blocks remained.
2180        eor     v4.16b, v4.16b, v15.16b
2181        eor     v5.16b, v5.16b, v16.16b
2182        mov     x9, sp                      // pass key schedule
2183        mov     x10, x1                     // pass rounds
2184        add     x0, x19, #16
2185
2186        bl      _bsaes_decrypt8
2187
2188        ldr     q16, [x0], #16
2189        eor     v0.16b, v0.16b, v11.16b
2190        eor     v1.16b, v1.16b, v12.16b
2191        eor     v6.16b, v6.16b, v13.16b
2192        eor     v4.16b, v4.16b, v14.16b
2193        ldr     q11, [x0]                   // next round tweak
2194        eor     v2.16b, v2.16b, v15.16b
2195        str     q0, [x21], #16
2196        str     q1, [x21], #16
2197        eor     v0.16b, v7.16b, v16.16b
2198        str     q6, [x21], #16
2199        str     q4, [x21], #16
2200        str     q2, [x21], #16
2201        str     q0, [x21], #16
2202        b       .Lxts_dec_done
2203
2204.align  4
2205.Lxts_dec_5:
        // Tail path: exactly five full blocks remained.
2206        eor     v3.16b, v3.16b, v14.16b
2207        eor     v4.16b, v4.16b, v15.16b
2208        mov     x9, sp                      // pass key schedule
2209        mov     x10, x1                     // pass rounds
2210        add     x0, x19, #16
2211
2212        bl      _bsaes_decrypt8
2213
2214        eor     v0.16b, v0.16b, v11.16b
2215        eor     v1.16b, v1.16b, v12.16b
2216        ldr     q11, [x0]                   // next round tweak
2217        eor     v6.16b, v6.16b, v13.16b
2218        eor     v4.16b, v4.16b, v14.16b
2219        eor     v2.16b, v2.16b, v15.16b
2220        str     q0, [x21], #16
2221        str     q1, [x21], #16
2222        str     q6, [x21], #16
2223        str     q4, [x21], #16
2224        str     q2, [x21], #16
2225        b       .Lxts_dec_done
2226
2227.align  4
2228.Lxts_dec_4:
        // Tail path: exactly four full blocks remained.
2229        eor     v2.16b, v2.16b, v13.16b
2230        eor     v3.16b, v3.16b, v14.16b
2231        mov     x9, sp                      // pass key schedule
2232        mov     x10, x1                     // pass rounds
2233        add     x0, x19, #16
2234
2235        bl      _bsaes_decrypt8
2236
2237        eor     v0.16b, v0.16b, v11.16b
2238        eor     v1.16b, v1.16b, v12.16b
2239        eor     v6.16b, v6.16b, v13.16b
2240        eor     v4.16b, v4.16b, v14.16b
2241        mov     v11.16b, v15.16b            // next round tweak
2242        str     q0, [x21], #16
2243        str     q1, [x21], #16
2244        str     q6, [x21], #16
2245        str     q4, [x21], #16
2246        b       .Lxts_dec_done
2247
2248.align  4
2249.Lxts_dec_3:
        // Tail path: exactly three full blocks remained.
2250        eor     v1.16b, v1.16b, v12.16b
2251        eor     v2.16b, v2.16b, v13.16b
2252        mov     x9, sp                      // pass key schedule
2253        mov     x10, x1                     // pass rounds
2254        add     x0, x19, #16
2255
2256        bl      _bsaes_decrypt8
2257
2258        eor     v0.16b, v0.16b, v11.16b
2259        eor     v1.16b, v1.16b, v12.16b
2260        eor     v6.16b, v6.16b, v13.16b
2261        mov     v11.16b, v14.16b            // next round tweak
2262        str     q0, [x21], #16
2263        str     q1, [x21], #16
2264        str     q6, [x21], #16
2265        b       .Lxts_dec_done
2266
2267.align  4
2268.Lxts_dec_2:
        // Tail path: exactly two full blocks remained.
2269        eor     v0.16b, v0.16b, v11.16b
2270        eor     v1.16b, v1.16b, v12.16b
2271        mov     x9, sp                      // pass key schedule
2272        mov     x10, x1                     // pass rounds
2273        add     x0, x19, #16
2274
2275        bl      _bsaes_decrypt8
2276
2277        eor     v0.16b, v0.16b, v11.16b
2278        eor     v1.16b, v1.16b, v12.16b
2279        mov     v11.16b, v13.16b            // next round tweak
2280        str     q0, [x21], #16
2281        str     q1, [x21], #16
2282        b       .Lxts_dec_done
2283
2284.align  4
2285.Lxts_dec_1:
        // Tail path: a single block remained — decrypt it with plain
        // AES_decrypt via a 16-byte stack buffer instead of _bsaes_decrypt8.
2286        eor     v0.16b, v0.16b, v11.16b
2287        sub     x0, sp, #16
2288        sub     x1, sp, #16
2289        mov     x2, x23
2290        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
2291        mov     v14.d[0], v12.d[1]
2292        str     q0, [sp, #-16]!
2293
2294        bl      AES_decrypt
2295
2296        ldr     q0, [sp], #16
2297        trn1    v13.2d, v11.2d, v13.2d
2298        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
2299        eor     v0.16b, v0.16b, v13.16b
2300        str     q0, [x21], #16
2301
        // If x22 rebiases to zero there was no partial block; otherwise fall
        // through into the ciphertext-stealing sequence below.
2302.Lxts_dec_done:
2303        adds    x22, x22, #0x10
2304        beq     .Lxts_dec_ret
2305
2306        // calculate one round of extra tweak for the stolen ciphertext
2307        ldr     q8, .Lxts_magic
2308        sshr    v6.2d, v11.2d, #63
2309        and     v6.16b, v6.16b, v8.16b
2310        add     v12.2d, v11.2d, v11.2d
2311        ext     v6.16b, v6.16b, v6.16b, #8
2312        eor     v12.16b, v12.16b, v6.16b
2313
2314        // perform the final decryption with the last tweak value
2315        ldr     q0, [x20], #16
2316        eor     v0.16b, v0.16b, v12.16b
2317        str     q0, [sp, #-16]!
2318        mov     x0, sp
2319        mov     x1, sp
2320        mov     x2, x23
2321        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
2322        mov     v14.d[0], v12.d[1]
2323
2324        bl      AES_decrypt
2325
2326        trn1    v12.2d, v12.2d, v14.2d
2327        trn1    v11.2d, v11.2d, v13.2d
2328        ldr     q0, [sp], #16
2329        eor     v0.16b, v0.16b, v12.16b
2330        str     q0, [x21]
2331
2332        mov     x6, x21
2333        // Penultimate ciphertext block produces final plaintext part-block
2334        // plus remaining part of final ciphertext block. Move plaintext part
2335        // to final position and reuse penultimate plaintext block buffer to
2336        // construct final ciphertext block
2337.Lxts_dec_steal:
2338        ldrb    w1, [x21]
2339        ldrb    w0, [x20], #1
2340        strb    w1, [x21, #0x10]
2341        strb    w0, [x21], #1
2342
2343        subs    x22, x22, #1
2344        bhi     .Lxts_dec_steal
2345
2346        // Finally decrypt the penultimate plaintext block using the
2347        // penultimate tweak
2348        ldr     q0, [x6]
2349        eor     v0.16b, v0.16b, v11.16b
2350        str     q0, [sp, #-16]!
2351        mov     x0, sp
2352        mov     x1, sp
2353        mov     x2, x23
2354        mov     x21, x6
2355
2356        bl      AES_decrypt
2357
2358        trn1    v11.2d, v11.2d, v13.2d
2359        ldr     q0, [sp], #16
2360        eor     v0.16b, v0.16b, v11.16b
2361        str     q0, [x21]
2362
2363.Lxts_dec_ret:
2364
2365        movi    v0.16b, #0
2366        movi    v1.16b, #0
2367.Lxts_dec_bzero: // wipe key schedule
2368        stp     q0, q1, [sp], #32
2369        cmp     sp, x19
2370        bne     .Lxts_dec_bzero
2371
        // Restore callee-saved registers and the frame; sp returns to its
        // on-entry value via the post-indexed ldp below.
2372        ldp     x19, x20, [sp, #80]
2373        ldp     x21, x22, [sp, #96]
2374        ldr     x23, [sp, #112]
2375        ldp     d8, d9, [sp, #128]
2376        ldp     d10, d11, [sp, #144]
2377        ldp     d12, d13, [sp, #160]
2378        ldp     d14, d15, [sp, #176]
2379        ldp     x29, x30, [sp], #192
2380        ret
2381.size   ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
2382