xref: /openssl/crypto/aes/asm/bsaes-armv8.pl (revision 1efd8533)
1#!/usr/bin/env perl
2# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
use strict;
use warnings;

# Command-line handling, as in the other OpenSSL perlasm drivers:
# the last argument is the output file when it looks like a filename
# (has an extension); the first is the perlasm "flavour" when it is
# not a path.  Either may be absent.
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

# Locate the arm-xlate.pl translator next to this script or in the
# shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate = "${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate = "${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Pipe the assembly below __END__ through the translator.  Quote $^X
# in case the perl binary lives at a path containing spaces, and drop
# undefined arguments rather than interpolating undef.
my $cmd = join ' ', "\"$^X\"", $xlate, grep { defined } $flavour, $output;
open OUT, "| $cmd" or die "can't pipe to $cmd: $!";
*STDOUT = *OUT;

print data();

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
27
# Return the entire contents of the DATA section (the hand-written
# AArch64 assembly following __END__) as a single string.
sub data
{
    local $/ = undef;       # slurp mode: one read grabs everything
    my $asm = <DATA>;
    return $asm;
}
33
34__END__
35// Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
36//
37// Licensed under the OpenSSL license (the "License").  You may not use
38// this file except in compliance with the License.  You can obtain a copy
39// in the file LICENSE in the source distribution or at
40// https://www.openssl.org/source/license.html
41//
42// ====================================================================
43// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
44// project. Rights for redistribution and usage in source and binary
45// forms are granted according to the OpenSSL license.
46// ====================================================================
47//
48// This implementation is a translation of bsaes-armv7 for AArch64.
49// No attempt has been made to carry across the build switches for
50// kernel targets, since the Linux kernel crypto support has moved on
51// from when it was based on OpenSSL.
52
53// A lot of hand-scheduling has been performed. Consequently, this code
54// doesn't factor out neatly into macros in the same way that the
55// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.
57
58
59#include "crypto/arm_arch.h"
60
61.text
62
63.extern AES_cbc_encrypt
64.extern AES_encrypt
65.extern AES_decrypt
66
.type   _bsaes_decrypt8,%function
.align  4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_decrypt8:
        // Round 0: XOR the first round key (at [x9]) into all eight
        // blocks and permute each with the .LM0ISR table; x11 is left
        // pointing at .LISR for the round loop below.
        // The ushr/eor/and/shl ladders that follow (masks 0x55/0x33/
        // 0x0f with shifts 1/2/4) move the state into bitsliced form.
        // NOTE(review): this matches the usual SWAPMOVE bit-transpose
        // from bsaes-armv7 -- confirm against that source.
        ldr     q8, [x9], #16
        adr     x11, .LM0ISR
        movi    v9.16b, #0x55
        ldr     q10, [x11], #16
        movi    v16.16b, #0x33
        movi    v17.16b, #0x0f
        sub     x10, x10, #1
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v10.16b
        tbl     v1.16b, {v1.16b}, v10.16b
        tbl     v2.16b, {v2.16b}, v10.16b
        tbl     v4.16b, {v4.16b}, v10.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v10.16b
        tbl     v5.16b, {v5.16b}, v10.16b
        tbl     v6.16b, {v6.16b}, v10.16b
        ushr    v8.2d, v0.2d, #1
        tbl     v7.16b, {v7.16b}, v10.16b
        ushr    v10.2d, v4.2d, #1
        ushr    v18.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        ushr    v19.2d, v6.2d, #1
        eor     v10.16b, v10.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v10.16b, v10.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v9.16b, v19.16b, v9.16b
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #1
        eor     v3.16b, v3.16b, v18.16b
        shl     v18.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v9.2d, #1
        eor     v7.16b, v7.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v18.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v18.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v10.16b, v10.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Ldec_sbox
.align  4
// Each iteration XORs the next 128 bytes of key schedule from [x9]
// into the state and permutes every block with v28, which is loaded
// from .LISR for middle rounds and .LISRM0 for the final round (see
// the bottom of the loop).
.Ldec_loop:
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
// Bitsliced inverse S-box: a boolean circuit evaluated over v0-v7.
// The subs also decrements the round counter, so the flags it sets
// steer the loop exit (bcc) below.
.Ldec_sbox:
        eor     v1.16b, v1.16b, v4.16b
        eor     v3.16b, v3.16b, v4.16b
        subs    x10, x10, #1
        eor     v4.16b, v4.16b, v7.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v1.16b, v6.16b
        eor     v6.16b, v6.16b, v4.16b
        eor     v2.16b, v2.16b, v5.16b
        eor     v0.16b, v0.16b, v1.16b
        eor     v7.16b, v7.16b, v6.16b
        eor     v8.16b, v6.16b, v2.16b
        and     v9.16b, v4.16b, v6.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v16.16b, v7.16b, v4.16b
        eor     v17.16b, v4.16b, v0.16b
        and     v18.16b, v0.16b, v2.16b
        eor     v19.16b, v7.16b, v4.16b
        eor     v1.16b, v1.16b, v3.16b
        eor     v20.16b, v3.16b, v0.16b
        eor     v21.16b, v5.16b, v2.16b
        eor     v22.16b, v3.16b, v7.16b
        and     v8.16b, v17.16b, v8.16b
        orr     v17.16b, v3.16b, v5.16b
        eor     v23.16b, v1.16b, v6.16b
        eor     v24.16b, v20.16b, v16.16b
        eor     v25.16b, v1.16b, v5.16b
        orr     v26.16b, v20.16b, v21.16b
        and     v20.16b, v20.16b, v21.16b
        and     v27.16b, v7.16b, v1.16b
        eor     v21.16b, v21.16b, v23.16b
        orr     v28.16b, v16.16b, v23.16b
        orr     v29.16b, v22.16b, v25.16b
        eor     v26.16b, v26.16b, v8.16b
        and     v16.16b, v16.16b, v23.16b
        and     v22.16b, v22.16b, v25.16b
        and     v21.16b, v24.16b, v21.16b
        eor     v8.16b, v28.16b, v8.16b
        eor     v23.16b, v5.16b, v2.16b
        eor     v24.16b, v1.16b, v6.16b
        eor     v16.16b, v16.16b, v22.16b
        eor     v22.16b, v3.16b, v0.16b
        eor     v25.16b, v29.16b, v21.16b
        eor     v21.16b, v26.16b, v21.16b
        eor     v8.16b, v8.16b, v20.16b
        eor     v26.16b, v23.16b, v24.16b
        eor     v16.16b, v16.16b, v20.16b
        eor     v28.16b, v22.16b, v19.16b
        eor     v20.16b, v25.16b, v20.16b
        eor     v9.16b, v21.16b, v9.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v21.16b, v16.16b, v17.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v20.16b, v27.16b
        eor     v20.16b, v3.16b, v7.16b
        eor     v25.16b, v9.16b, v8.16b
        eor     v27.16b, v0.16b, v4.16b
        and     v29.16b, v9.16b, v17.16b
        eor     v30.16b, v8.16b, v29.16b
        eor     v31.16b, v21.16b, v29.16b
        eor     v29.16b, v21.16b, v29.16b
        bsl     v30.16b, v17.16b, v21.16b
        bsl     v31.16b, v9.16b, v8.16b
        bsl     v16.16b, v30.16b, v29.16b
        bsl     v21.16b, v29.16b, v30.16b
        eor     v8.16b, v31.16b, v30.16b
        and     v1.16b, v1.16b, v31.16b
        and     v9.16b, v16.16b, v31.16b
        and     v6.16b, v6.16b, v30.16b
        eor     v16.16b, v17.16b, v21.16b
        and     v4.16b, v4.16b, v30.16b
        eor     v17.16b, v8.16b, v30.16b
        and     v21.16b, v24.16b, v8.16b
        eor     v9.16b, v9.16b, v25.16b
        and     v19.16b, v19.16b, v8.16b
        eor     v24.16b, v30.16b, v16.16b
        eor     v25.16b, v30.16b, v16.16b
        and     v7.16b, v7.16b, v17.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v29.16b, v9.16b, v16.16b
        eor     v30.16b, v31.16b, v9.16b
        and     v0.16b, v24.16b, v0.16b
        and     v9.16b, v18.16b, v9.16b
        and     v2.16b, v25.16b, v2.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v18.16b, v29.16b, v16.16b
        and     v5.16b, v30.16b, v5.16b
        eor     v24.16b, v8.16b, v29.16b
        and     v25.16b, v26.16b, v29.16b
        and     v26.16b, v28.16b, v29.16b
        eor     v8.16b, v8.16b, v29.16b
        eor     v17.16b, v17.16b, v18.16b
        eor     v5.16b, v1.16b, v5.16b
        and     v23.16b, v24.16b, v23.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v19.16b, v19.16b, v26.16b
        eor     v0.16b, v4.16b, v0.16b
        and     v3.16b, v17.16b, v3.16b
        eor     v1.16b, v9.16b, v1.16b
        eor     v9.16b, v25.16b, v23.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v2.16b, v6.16b, v2.16b
        and     v6.16b, v8.16b, v22.16b
        eor     v3.16b, v7.16b, v3.16b
        and     v8.16b, v20.16b, v18.16b
        eor     v10.16b, v10.16b, v9.16b
        eor     v0.16b, v0.16b, v19.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v1.16b, v2.16b, v21.16b
        eor     v3.16b, v3.16b, v19.16b
        and     v16.16b, v27.16b, v16.16b
        eor     v17.16b, v26.16b, v6.16b
        eor     v6.16b, v8.16b, v7.16b
        eor     v7.16b, v1.16b, v9.16b
        eor     v1.16b, v5.16b, v3.16b
        eor     v2.16b, v10.16b, v3.16b
        eor     v4.16b, v16.16b, v4.16b
        eor     v8.16b, v6.16b, v17.16b
        eor     v5.16b, v9.16b, v3.16b
        eor     v9.16b, v0.16b, v1.16b
        eor     v6.16b, v7.16b, v1.16b
        eor     v0.16b, v4.16b, v17.16b
        eor     v4.16b, v8.16b, v7.16b
        eor     v7.16b, v9.16b, v2.16b
        eor     v8.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        eor     v4.16b, v7.16b, v0.16b
        eor     v7.16b, v8.16b, v3.16b
        // Carry clear <=> x10 underflowed in the subs above: the
        // final round has been processed, so finish up.
        bcc     .Ldec_done
        // Otherwise mix the bitsliced state using ext #8/#12 rotates
        // plus eor before the next round.
        // NOTE(review): presumed the bitsliced InvMixColumns,
        // mirroring bsaes-armv7 -- confirm against that source.
        ext     v8.16b, v0.16b, v0.16b, #8
        ext     v9.16b, v1.16b, v1.16b, #8
        ldr     q28, [x11]                  // load from .LISR in common case (x10 > 0)
        ext     v10.16b, v6.16b, v6.16b, #8
        ext     v16.16b, v3.16b, v3.16b, #8
        ext     v17.16b, v5.16b, v5.16b, #8
        ext     v18.16b, v4.16b, v4.16b, #8
        eor     v8.16b, v8.16b, v0.16b
        eor     v9.16b, v9.16b, v1.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v17.16b, v17.16b, v5.16b
        ext     v19.16b, v2.16b, v2.16b, #8
        ext     v20.16b, v7.16b, v7.16b, #8
        eor     v18.16b, v18.16b, v4.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v8.16b, v2.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v19.16b, v2.16b
        eor     v9.16b, v20.16b, v7.16b
        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v6.16b, v6.16b, v17.16b
        eor     v8.16b, v8.16b, v16.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v4.16b, v4.16b, v16.16b
        eor     v2.16b, v3.16b, v2.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v3.16b, v5.16b, v9.16b
        eor     v5.16b, v8.16b, v17.16b
        eor     v7.16b, v7.16b, v17.16b
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v6.16b, v6.16b, #12
        ext     v10.16b, v4.16b, v4.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v5.16b, v5.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v3.16b, v3.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v17.16b, v17.16b, v4.16b
        eor     v10.16b, v10.16b, v6.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v3.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v18.16b, v18.16b, v5.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v21.16b, v5.16b, v5.16b, #8
        ext     v5.16b, v7.16b, v7.16b, #8
        eor     v7.16b, v20.16b, v2.16b
        ext     v4.16b, v4.16b, v4.16b, #8
        ext     v20.16b, v3.16b, v3.16b, #8
        eor     v17.16b, v17.16b, v3.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v3.16b, v10.16b, v3.16b
        ext     v10.16b, v6.16b, v6.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v18.16b
        eor     v3.16b, v3.16b, v4.16b
        eor     v7.16b, v20.16b, v7.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v4.16b, v21.16b, v17.16b
        eor     v2.16b, v10.16b, v9.16b
        bne     .Ldec_loop
        // x10 == 0: swap the tbl index vector to .LISRM0 and run the
        // loop body once more for the final round; its subs will then
        // borrow and exit through bcc .Ldec_done above.
        ldr     q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0)
        b       .Ldec_loop
.align  4
// Convert back from bitsliced form (inverse of the transpose done on
// entry) and XOR the final round key (q10, loaded from [x9]) into
// all eight output blocks v0-v7.
.Ldec_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]
        ushr    v16.2d, v2.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v6.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v3.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v4.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v4.16b, v4.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v6.16b, v6.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v3.16b, v3.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v2.2d, #2
        eor     v8.16b, v8.16b, v4.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v4.16b, v4.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v8.2d, v4.2d, #4
        ushr    v9.2d, v6.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v3.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v2.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v4.16b, v4.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size   _bsaes_decrypt8,.-_bsaes_decrypt8
496
.type   _bsaes_const,%object
.align  6                               // 64-byte (2^6) boundary
_bsaes_const:
// Every .quad pair below is a 16-byte index vector consumed by tbl,
// i.e. a byte permutation applied to one block of cipher state.
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// (it advances x11 from .LM0ISR to .LISR/.LISRM0 by fixed offsets)
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad   0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad   0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad   0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad   0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad   0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad   0x0304090e00050a0f, 0x01060b0c0207080d

// NOTE(review): the names suggest these map between big- and
// little-endian lane orderings -- confirm at their call sites.
.LM0_bigendian:
.quad   0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad   0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad   0x090d01050c000408, 0x03070b0f060a0e02

.align  6                               // pad to the next 64-byte boundary
.size   _bsaes_const,.-_bsaes_const
536
537.type   _bsaes_encrypt8,%function
538.align  4
539// On entry:
540//   x9 -> key (previously expanded using _bsaes_key_convert)
541//   x10 = number of rounds
542//   v0-v7 input data
543// On exit:
544//   x9-x11 corrupted
545//   other general-purpose registers preserved
546//   v0-v7 output data
547//   v11-v15 preserved
548//   other SIMD registers corrupted
549_bsaes_encrypt8:
550        ldr     q8, [x9], #16
551        adr     x11, .LM0SR
552        ldr     q9, [x11], #16
553_bsaes_encrypt8_alt:
554        eor     v0.16b, v0.16b, v8.16b
555        eor     v1.16b, v1.16b, v8.16b
556        sub     x10, x10, #1
557        eor     v2.16b, v2.16b, v8.16b
558        eor     v4.16b, v4.16b, v8.16b
559        eor     v3.16b, v3.16b, v8.16b
560        eor     v5.16b, v5.16b, v8.16b
561        tbl     v0.16b, {v0.16b}, v9.16b
562        tbl     v1.16b, {v1.16b}, v9.16b
563        tbl     v2.16b, {v2.16b}, v9.16b
564        tbl     v4.16b, {v4.16b}, v9.16b
565        eor     v6.16b, v6.16b, v8.16b
566        eor     v7.16b, v7.16b, v8.16b
567        tbl     v3.16b, {v3.16b}, v9.16b
568        tbl     v5.16b, {v5.16b}, v9.16b
569        tbl     v6.16b, {v6.16b}, v9.16b
570        ushr    v8.2d, v0.2d, #1
571        movi    v10.16b, #0x55
572        tbl     v7.16b, {v7.16b}, v9.16b
573        ushr    v9.2d, v4.2d, #1
574        movi    v16.16b, #0x33
575        ushr    v17.2d, v2.2d, #1
576        eor     v8.16b, v8.16b, v1.16b
577        movi    v18.16b, #0x0f
578        ushr    v19.2d, v6.2d, #1
579        eor     v9.16b, v9.16b, v5.16b
580        eor     v17.16b, v17.16b, v3.16b
581        and     v8.16b, v8.16b, v10.16b
582        eor     v19.16b, v19.16b, v7.16b
583        and     v9.16b, v9.16b, v10.16b
584        and     v17.16b, v17.16b, v10.16b
585        eor     v1.16b, v1.16b, v8.16b
586        shl     v8.2d, v8.2d, #1
587        and     v10.16b, v19.16b, v10.16b
588        eor     v5.16b, v5.16b, v9.16b
589        shl     v9.2d, v9.2d, #1
590        eor     v3.16b, v3.16b, v17.16b
591        shl     v17.2d, v17.2d, #1
592        eor     v0.16b, v0.16b, v8.16b
593        shl     v8.2d, v10.2d, #1
594        eor     v7.16b, v7.16b, v10.16b
595        eor     v4.16b, v4.16b, v9.16b
596        eor     v2.16b, v2.16b, v17.16b
597        ushr    v9.2d, v1.2d, #2
598        eor     v6.16b, v6.16b, v8.16b
599        ushr    v8.2d, v0.2d, #2
600        ushr    v10.2d, v5.2d, #2
601        ushr    v17.2d, v4.2d, #2
602        eor     v9.16b, v9.16b, v3.16b
603        eor     v8.16b, v8.16b, v2.16b
604        eor     v10.16b, v10.16b, v7.16b
605        eor     v17.16b, v17.16b, v6.16b
606        and     v9.16b, v9.16b, v16.16b
607        and     v8.16b, v8.16b, v16.16b
608        and     v10.16b, v10.16b, v16.16b
609        and     v16.16b, v17.16b, v16.16b
610        eor     v3.16b, v3.16b, v9.16b
611        shl     v9.2d, v9.2d, #2
612        eor     v2.16b, v2.16b, v8.16b
613        shl     v8.2d, v8.2d, #2
614        eor     v7.16b, v7.16b, v10.16b
615        shl     v10.2d, v10.2d, #2
616        eor     v6.16b, v6.16b, v16.16b
617        shl     v16.2d, v16.2d, #2
618        eor     v1.16b, v1.16b, v9.16b
619        eor     v0.16b, v0.16b, v8.16b
620        eor     v5.16b, v5.16b, v10.16b
621        eor     v4.16b, v4.16b, v16.16b
622        ushr    v8.2d, v3.2d, #4
623        ushr    v9.2d, v2.2d, #4
624        ushr    v10.2d, v1.2d, #4
625        ushr    v16.2d, v0.2d, #4
626        eor     v8.16b, v8.16b, v7.16b
627        eor     v9.16b, v9.16b, v6.16b
628        eor     v10.16b, v10.16b, v5.16b
629        eor     v16.16b, v16.16b, v4.16b
630        and     v8.16b, v8.16b, v18.16b
631        and     v9.16b, v9.16b, v18.16b
632        and     v10.16b, v10.16b, v18.16b
633        and     v16.16b, v16.16b, v18.16b
634        eor     v7.16b, v7.16b, v8.16b
635        shl     v8.2d, v8.2d, #4
636        eor     v6.16b, v6.16b, v9.16b
637        shl     v9.2d, v9.2d, #4
638        eor     v5.16b, v5.16b, v10.16b
639        shl     v10.2d, v10.2d, #4
640        eor     v4.16b, v4.16b, v16.16b
641        shl     v16.2d, v16.2d, #4
642        eor     v3.16b, v3.16b, v8.16b
643        eor     v2.16b, v2.16b, v9.16b
644        eor     v1.16b, v1.16b, v10.16b
645        eor     v0.16b, v0.16b, v16.16b
646        b       .Lenc_sbox
647.align  4
648.Lenc_loop:
649        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
650        ldp     q8, q9, [x9], #32
651        eor     v0.16b, v16.16b, v0.16b
652        ldr     q10, [x9], #16
653        eor     v1.16b, v17.16b, v1.16b
654        ldr     q16, [x9], #16
655        eor     v2.16b, v18.16b, v2.16b
656        eor     v3.16b, v19.16b, v3.16b
657        eor     v4.16b, v8.16b, v4.16b
658        eor     v5.16b, v9.16b, v5.16b
659        eor     v6.16b, v10.16b, v6.16b
660        eor     v7.16b, v16.16b, v7.16b
661        tbl     v0.16b, {v0.16b}, v28.16b
662        tbl     v1.16b, {v1.16b}, v28.16b
663        tbl     v2.16b, {v2.16b}, v28.16b
664        tbl     v3.16b, {v3.16b}, v28.16b
665        tbl     v4.16b, {v4.16b}, v28.16b
666        tbl     v5.16b, {v5.16b}, v28.16b
667        tbl     v6.16b, {v6.16b}, v28.16b
668        tbl     v7.16b, {v7.16b}, v28.16b
669.Lenc_sbox:
670        eor     v5.16b, v5.16b, v6.16b
671        eor     v3.16b, v3.16b, v0.16b
672        subs    x10, x10, #1
673        eor     v2.16b, v2.16b, v1.16b
674        eor     v5.16b, v5.16b, v0.16b
675        eor     v8.16b, v3.16b, v7.16b
676        eor     v6.16b, v6.16b, v2.16b
677        eor     v7.16b, v7.16b, v5.16b
678        eor     v8.16b, v8.16b, v4.16b
679        eor     v3.16b, v6.16b, v3.16b
680        eor     v4.16b, v4.16b, v5.16b
681        eor     v6.16b, v1.16b, v5.16b
682        eor     v2.16b, v2.16b, v7.16b
683        eor     v1.16b, v8.16b, v1.16b
684        eor     v8.16b, v7.16b, v4.16b
685        eor     v9.16b, v3.16b, v0.16b
686        eor     v10.16b, v7.16b, v6.16b
687        eor     v16.16b, v5.16b, v3.16b
688        eor     v17.16b, v6.16b, v2.16b
689        eor     v18.16b, v5.16b, v1.16b
690        eor     v19.16b, v2.16b, v4.16b
691        eor     v20.16b, v1.16b, v0.16b
692        orr     v21.16b, v8.16b, v9.16b
693        orr     v22.16b, v10.16b, v16.16b
694        eor     v23.16b, v8.16b, v17.16b
695        eor     v24.16b, v9.16b, v18.16b
696        and     v19.16b, v19.16b, v20.16b
697        orr     v20.16b, v17.16b, v18.16b
698        and     v8.16b, v8.16b, v9.16b
699        and     v9.16b, v17.16b, v18.16b
700        and     v17.16b, v23.16b, v24.16b
701        and     v10.16b, v10.16b, v16.16b
702        eor     v16.16b, v21.16b, v19.16b
703        eor     v18.16b, v20.16b, v19.16b
704        and     v19.16b, v2.16b, v1.16b
705        and     v20.16b, v6.16b, v5.16b
706        eor     v21.16b, v22.16b, v17.16b
707        eor     v9.16b, v9.16b, v10.16b
708        eor     v10.16b, v16.16b, v17.16b
709        eor     v16.16b, v18.16b, v8.16b
710        and     v17.16b, v4.16b, v0.16b
711        orr     v18.16b, v7.16b, v3.16b
712        eor     v21.16b, v21.16b, v8.16b
713        eor     v8.16b, v9.16b, v8.16b
714        eor     v9.16b, v10.16b, v19.16b
715        eor     v10.16b, v3.16b, v0.16b
716        eor     v16.16b, v16.16b, v17.16b
717        eor     v17.16b, v5.16b, v1.16b
718        eor     v19.16b, v21.16b, v20.16b
719        eor     v20.16b, v8.16b, v18.16b
720        eor     v8.16b, v8.16b, v18.16b
721        eor     v18.16b, v7.16b, v4.16b
722        eor     v21.16b, v9.16b, v16.16b
723        eor     v22.16b, v6.16b, v2.16b
724        and     v23.16b, v9.16b, v19.16b
725        eor     v24.16b, v10.16b, v17.16b
726        eor     v25.16b, v0.16b, v1.16b
727        eor     v26.16b, v7.16b, v6.16b
728        eor     v27.16b, v18.16b, v22.16b
729        eor     v28.16b, v3.16b, v5.16b
730        eor     v29.16b, v16.16b, v23.16b
731        eor     v30.16b, v20.16b, v23.16b
732        eor     v23.16b, v20.16b, v23.16b
733        eor     v31.16b, v4.16b, v2.16b
734        bsl     v29.16b, v19.16b, v20.16b
735        bsl     v30.16b, v9.16b, v16.16b
736        bsl     v8.16b, v29.16b, v23.16b
737        bsl     v20.16b, v23.16b, v29.16b
738        eor     v9.16b, v30.16b, v29.16b
739        and     v5.16b, v5.16b, v30.16b
740        and     v8.16b, v8.16b, v30.16b
741        and     v1.16b, v1.16b, v29.16b
742        eor     v16.16b, v19.16b, v20.16b
743        and     v2.16b, v2.16b, v29.16b
744        eor     v19.16b, v9.16b, v29.16b
745        and     v17.16b, v17.16b, v9.16b
746        eor     v8.16b, v8.16b, v21.16b
747        and     v20.16b, v22.16b, v9.16b
748        eor     v21.16b, v29.16b, v16.16b
749        eor     v22.16b, v29.16b, v16.16b
750        and     v23.16b, v25.16b, v16.16b
751        and     v6.16b, v6.16b, v19.16b
752        eor     v25.16b, v8.16b, v16.16b
753        eor     v29.16b, v30.16b, v8.16b
754        and     v4.16b, v21.16b, v4.16b
755        and     v8.16b, v28.16b, v8.16b
756        and     v0.16b, v22.16b, v0.16b
757        eor     v21.16b, v23.16b, v1.16b
758        eor     v22.16b, v9.16b, v25.16b
759        eor     v9.16b, v9.16b, v25.16b
760        eor     v23.16b, v25.16b, v16.16b
761        and     v3.16b, v29.16b, v3.16b
762        and     v24.16b, v24.16b, v25.16b
763        and     v25.16b, v27.16b, v25.16b
764        and     v10.16b, v22.16b, v10.16b
765        and     v9.16b, v9.16b, v18.16b
766        eor     v18.16b, v19.16b, v23.16b
767        and     v19.16b, v26.16b, v23.16b
768        eor     v3.16b, v5.16b, v3.16b
769        eor     v17.16b, v17.16b, v24.16b
770        eor     v10.16b, v24.16b, v10.16b
771        and     v16.16b, v31.16b, v16.16b
772        eor     v20.16b, v20.16b, v25.16b
773        eor     v9.16b, v25.16b, v9.16b
774        eor     v4.16b, v2.16b, v4.16b
775        and     v7.16b, v18.16b, v7.16b
776        eor     v18.16b, v19.16b, v6.16b
777        eor     v5.16b, v8.16b, v5.16b
778        eor     v0.16b, v1.16b, v0.16b
779        eor     v1.16b, v21.16b, v10.16b
780        eor     v8.16b, v3.16b, v17.16b
781        eor     v2.16b, v16.16b, v2.16b
782        eor     v3.16b, v6.16b, v7.16b
783        eor     v6.16b, v18.16b, v9.16b
784        eor     v4.16b, v4.16b, v20.16b
785        eor     v10.16b, v5.16b, v10.16b
786        eor     v0.16b, v0.16b, v17.16b
787        eor     v9.16b, v2.16b, v9.16b
788        eor     v3.16b, v3.16b, v20.16b
789        eor     v7.16b, v6.16b, v1.16b
790        eor     v5.16b, v8.16b, v4.16b
791        eor     v6.16b, v10.16b, v1.16b
792        eor     v2.16b, v4.16b, v0.16b
793        eor     v4.16b, v3.16b, v10.16b
794        eor     v9.16b, v9.16b, v7.16b
795        eor     v3.16b, v0.16b, v5.16b
796        eor     v0.16b, v1.16b, v4.16b
797        eor     v1.16b, v4.16b, v8.16b
798        eor     v4.16b, v9.16b, v5.16b
799        eor     v6.16b, v6.16b, v3.16b
800        bcc     .Lenc_done
801        ext     v8.16b, v0.16b, v0.16b, #12
802        ext     v9.16b, v4.16b, v4.16b, #12
803        ldr     q28, [x11]
804        ext     v10.16b, v6.16b, v6.16b, #12
805        ext     v16.16b, v1.16b, v1.16b, #12
806        ext     v17.16b, v3.16b, v3.16b, #12
807        ext     v18.16b, v7.16b, v7.16b, #12
808        eor     v0.16b, v0.16b, v8.16b
809        eor     v4.16b, v4.16b, v9.16b
810        eor     v6.16b, v6.16b, v10.16b
811        ext     v19.16b, v2.16b, v2.16b, #12
812        ext     v20.16b, v5.16b, v5.16b, #12
813        eor     v1.16b, v1.16b, v16.16b
814        eor     v3.16b, v3.16b, v17.16b
815        eor     v7.16b, v7.16b, v18.16b
816        eor     v2.16b, v2.16b, v19.16b
817        eor     v16.16b, v16.16b, v0.16b
818        eor     v5.16b, v5.16b, v20.16b
819        eor     v17.16b, v17.16b, v6.16b
820        eor     v10.16b, v10.16b, v4.16b
821        ext     v0.16b, v0.16b, v0.16b, #8
822        eor     v9.16b, v9.16b, v1.16b
823        ext     v1.16b, v1.16b, v1.16b, #8
824        eor     v8.16b, v8.16b, v5.16b
825        eor     v16.16b, v16.16b, v5.16b
826        eor     v18.16b, v18.16b, v3.16b
827        eor     v19.16b, v19.16b, v7.16b
828        ext     v3.16b, v3.16b, v3.16b, #8
829        ext     v7.16b, v7.16b, v7.16b, #8
830        eor     v20.16b, v20.16b, v2.16b
831        ext     v6.16b, v6.16b, v6.16b, #8
832        ext     v21.16b, v5.16b, v5.16b, #8
833        eor     v17.16b, v17.16b, v5.16b
834        ext     v2.16b, v2.16b, v2.16b, #8
835        eor     v10.16b, v10.16b, v5.16b
836        ext     v22.16b, v4.16b, v4.16b, #8
837        eor     v0.16b, v0.16b, v8.16b
838        eor     v1.16b, v1.16b, v16.16b
839        eor     v5.16b, v7.16b, v18.16b
840        eor     v4.16b, v3.16b, v17.16b
841        eor     v3.16b, v6.16b, v10.16b
842        eor     v7.16b, v21.16b, v20.16b
843        eor     v6.16b, v2.16b, v19.16b
844        eor     v2.16b, v22.16b, v9.16b
845        bne     .Lenc_loop
846        ldr     q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
847        b       .Lenc_loop
848.align  4
849.Lenc_done:
850        ushr    v8.2d, v0.2d, #1
851        movi    v9.16b, #0x55
852        ldr     q10, [x9]
853        ushr    v16.2d, v3.2d, #1
854        movi    v17.16b, #0x33
855        ushr    v18.2d, v4.2d, #1
856        movi    v19.16b, #0x0f
857        eor     v8.16b, v8.16b, v1.16b
858        ushr    v20.2d, v2.2d, #1
859        eor     v16.16b, v16.16b, v7.16b
860        eor     v18.16b, v18.16b, v6.16b
861        and     v8.16b, v8.16b, v9.16b
862        eor     v20.16b, v20.16b, v5.16b
863        and     v16.16b, v16.16b, v9.16b
864        and     v18.16b, v18.16b, v9.16b
865        shl     v21.2d, v8.2d, #1
866        eor     v1.16b, v1.16b, v8.16b
867        and     v8.16b, v20.16b, v9.16b
868        eor     v7.16b, v7.16b, v16.16b
869        shl     v9.2d, v16.2d, #1
870        eor     v6.16b, v6.16b, v18.16b
871        shl     v16.2d, v18.2d, #1
872        eor     v0.16b, v0.16b, v21.16b
873        shl     v18.2d, v8.2d, #1
874        eor     v5.16b, v5.16b, v8.16b
875        eor     v3.16b, v3.16b, v9.16b
876        eor     v4.16b, v4.16b, v16.16b
877        ushr    v8.2d, v1.2d, #2
878        eor     v2.16b, v2.16b, v18.16b
879        ushr    v9.2d, v0.2d, #2
880        ushr    v16.2d, v7.2d, #2
881        ushr    v18.2d, v3.2d, #2
882        eor     v8.16b, v8.16b, v6.16b
883        eor     v9.16b, v9.16b, v4.16b
884        eor     v16.16b, v16.16b, v5.16b
885        eor     v18.16b, v18.16b, v2.16b
886        and     v8.16b, v8.16b, v17.16b
887        and     v9.16b, v9.16b, v17.16b
888        and     v16.16b, v16.16b, v17.16b
889        and     v17.16b, v18.16b, v17.16b
890        eor     v6.16b, v6.16b, v8.16b
891        shl     v8.2d, v8.2d, #2
892        eor     v4.16b, v4.16b, v9.16b
893        shl     v9.2d, v9.2d, #2
894        eor     v5.16b, v5.16b, v16.16b
895        shl     v16.2d, v16.2d, #2
896        eor     v2.16b, v2.16b, v17.16b
897        shl     v17.2d, v17.2d, #2
898        eor     v1.16b, v1.16b, v8.16b
899        eor     v0.16b, v0.16b, v9.16b
900        eor     v7.16b, v7.16b, v16.16b
901        eor     v3.16b, v3.16b, v17.16b
902        ushr    v8.2d, v6.2d, #4
903        ushr    v9.2d, v4.2d, #4
904        ushr    v16.2d, v1.2d, #4
905        ushr    v17.2d, v0.2d, #4
906        eor     v8.16b, v8.16b, v5.16b
907        eor     v9.16b, v9.16b, v2.16b
908        eor     v16.16b, v16.16b, v7.16b
909        eor     v17.16b, v17.16b, v3.16b
910        and     v8.16b, v8.16b, v19.16b
911        and     v9.16b, v9.16b, v19.16b
912        and     v16.16b, v16.16b, v19.16b
913        and     v17.16b, v17.16b, v19.16b
914        eor     v5.16b, v5.16b, v8.16b
915        shl     v8.2d, v8.2d, #4
916        eor     v2.16b, v2.16b, v9.16b
917        shl     v9.2d, v9.2d, #4
918        eor     v7.16b, v7.16b, v16.16b
919        shl     v16.2d, v16.2d, #4
920        eor     v3.16b, v3.16b, v17.16b
921        shl     v17.2d, v17.2d, #4
922        eor     v6.16b, v6.16b, v8.16b
923        eor     v4.16b, v4.16b, v9.16b
924        eor     v7.16b, v7.16b, v10.16b
925        eor     v1.16b, v1.16b, v16.16b
926        eor     v3.16b, v3.16b, v10.16b
927        eor     v0.16b, v0.16b, v17.16b
928        eor     v6.16b, v6.16b, v10.16b
929        eor     v4.16b, v4.16b, v10.16b
930        eor     v2.16b, v2.16b, v10.16b
931        eor     v5.16b, v5.16b, v10.16b
932        eor     v1.16b, v1.16b, v10.16b
933        eor     v0.16b, v0.16b, v10.16b
934        ret
935.size   _bsaes_encrypt8,.-_bsaes_encrypt8
936
937.type   _bsaes_key_convert,%function
938.align  4
939// On entry:
940//   x9 -> input key (big-endian)
941//   x10 = number of rounds
942//   x17 -> output key (native endianness)
943// On exit:
944//   x9, x10 corrupted
945//   x11 -> .LM0_bigendian
946//   x17 -> last quadword of output key
947//   other general-purpose registers preserved
948//   v2-v6 preserved
949//   v7.16b[] = 0x63
950//   v8-v14 preserved
951//   v15 = last round key (converted to native endianness)
952//   other SIMD registers corrupted
//
// Converts a standard AES key schedule into the bit-sliced layout consumed
// by the encrypt8/decrypt8 cores: each inner round key expands to eight
// 16-byte bit masks (128 bytes per round, hence the "lsl #7" sizing at the
// call sites).  The round-0 key and last round key are stored unexpanded;
// callers fix them up (XOR with 0x63 / with v7) as needed.
953_bsaes_key_convert:
954#ifdef __AARCH64EL__
955        adr     x11, .LM0_littleendian
956#else
957        adr     x11, .LM0_bigendian
958#endif
959        ldr     q0, [x9], #16               // load round 0 key
960        ldr     q1, [x11]                   // .LM0
961        ldr     q15, [x9], #16              // load round 1 key
962
963        movi    v7.16b, #0x63               // compose .L63
964        movi    v16.16b, #0x01              // bit masks
965        movi    v17.16b, #0x02
966        movi    v18.16b, #0x04
967        movi    v19.16b, #0x08
968        movi    v20.16b, #0x10
969        movi    v21.16b, #0x20
970        movi    v22.16b, #0x40
971        movi    v23.16b, #0x80
972
973#ifdef __AARCH64EL__
974        rev32   v0.16b, v0.16b
975#endif
976        sub     x10, x10, #1                // round 0 key is written outside the loop
977        str     q0, [x17], #16              // save round 0 key
978
979.align  4
980.Lkey_loop:
981        tbl     v0.16b, {v15.16b}, v1.16b   // permute previous round key bytes via .LM0
982        ldr     q15, [x9], #16              // load next round key
983
984        eor     v0.16b, v0.16b, v7.16b      // fold 0x63 into the key (callers XOR 0x63 back into round 0 / last round)
985        cmtst   v24.16b, v0.16b, v16.16b    // bit-slice: expand bit 0 of every byte into a full byte mask
986        cmtst   v25.16b, v0.16b, v17.16b    // ... bit 1
987        cmtst   v26.16b, v0.16b, v18.16b
988        cmtst   v27.16b, v0.16b, v19.16b
989        cmtst   v28.16b, v0.16b, v20.16b
990        cmtst   v29.16b, v0.16b, v21.16b
991        cmtst   v30.16b, v0.16b, v22.16b
992        cmtst   v31.16b, v0.16b, v23.16b    // ... bit 7
993        sub     x10, x10, #1
994        st1     {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
995        st1     {v28.16b-v31.16b}, [x17], #64
996        cbnz    x10, .Lkey_loop
997
998        // don't save last round key
999#ifdef __AARCH64EL__
1000        rev32   v15.16b, v15.16b           // leave v15 in native endianness, per the contract above
1001        adr     x11, .LM0_bigendian        // so x11 -> .LM0_bigendian on both endiannesses
1002#endif
1003        ret
1004.size   _bsaes_key_convert,.-_bsaes_key_convert
1005
1006.globl  ossl_bsaes_cbc_encrypt
1007.type   ossl_bsaes_cbc_encrypt,%function
1008.align  4
1009// On entry:
1010//   x0 -> input ciphertext
1011//   x1 -> output plaintext
1012//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
1013//   x3 -> key
1014//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
1015//   w5 must be == 0
1016// On exit:
1017//   Output plaintext filled in
1018//   Initialisation vector overwritten with last quadword of ciphertext
1019//   No output registers, usual AAPCS64 register preservation
//
// CBC decryption.  Main loop handles 8 blocks per iteration through
// _bsaes_decrypt8; tails of 2-7 blocks reuse the bit-sliced core, and a
// single block falls back to scalar AES_decrypt.
1020ossl_bsaes_cbc_encrypt:
1021        cmp     x2, #128
1022        bhs     .Lcbc_do_bsaes
1023        b       AES_cbc_encrypt             // fewer than 8 blocks: tail-call plain AES
1024.Lcbc_do_bsaes:
1025
1026        // it is up to the caller to make sure we are called with enc == 0
1027
1028        stp     x29, x30, [sp, #-48]!
1029        stp     d8, d9, [sp, #16]
1030        stp     d10, d15, [sp, #32]         // only d8-d10 and d15 are clobbered below
1031        lsr     x2, x2, #4                  // len in 16 byte blocks
1032
1033        ldr     w15, [x3, #240]             // get # of rounds
1034        mov     x14, sp                     // x14 = frame base; bzero loop below unwinds sp back to it
1035
1036        // allocate the key schedule on the stack
1037        add     x17, sp, #96
1038        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes
1039
1040        // populate the key schedule
1041        mov     x9, x3                      // pass key
1042        mov     x10, x15                    // pass # of rounds
1043        mov     sp, x17                     // sp is sp
1044        bl      _bsaes_key_convert
1045        ldr     q6,  [sp]
1046        str     q15, [x17]                  // save last round key
1047        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
1048        str     q6, [sp]
1049
1050        ldr     q15, [x4]                   // load IV
1051        b       .Lcbc_dec_loop
1052
1053.align  4
1054.Lcbc_dec_loop:
1055        subs    x2, x2, #0x8
1056        bmi     .Lcbc_dec_loop_finish
1057
1058        ldr     q0, [x0], #16               // load input
1059        mov     x9, sp                      // pass the key
1060        ldr     q1, [x0], #16
1061        mov     x10, x15
1062        ldr     q2, [x0], #16
1063        ldr     q3, [x0], #16
1064        ldr     q4, [x0], #16
1065        ldr     q5, [x0], #16
1066        ldr     q6, [x0], #16
1067        ldr     q7, [x0], #-7*16            // rewind x0: ciphertext is re-read below for CBC chaining
1068
1069        bl      _bsaes_decrypt8
1070
        // _bsaes_decrypt8 returns blocks permuted (v0,v1,v6,v4,v2,v7,v3,v5);
        // XOR each with the preceding ciphertext block and store in order.
1071        ldr     q16, [x0], #16              // reload input
1072        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1073        eor     v1.16b, v1.16b, v16.16b
1074        str     q0, [x1], #16               // write output
1075        ldr     q0, [x0], #16
1076        str     q1, [x1], #16
1077        ldr     q1, [x0], #16
1078        eor     v1.16b, v4.16b, v1.16b
1079        ldr     q4, [x0], #16
1080        eor     v2.16b, v2.16b, v4.16b
1081        eor     v0.16b, v6.16b, v0.16b
1082        ldr     q4, [x0], #16
1083        str     q0, [x1], #16
1084        str     q1, [x1], #16
1085        eor     v0.16b, v7.16b, v4.16b
1086        ldr     q1, [x0], #16
1087        str     q2, [x1], #16
1088        ldr     q2, [x0], #16
1089        ldr     q15, [x0], #16              // last ciphertext block becomes next IV
1090        str     q0, [x1], #16
1091        eor     v0.16b, v5.16b, v2.16b
1092        eor     v1.16b, v3.16b, v1.16b
1093        str     q1, [x1], #16
1094        str     q0, [x1], #16
1095
1096        b       .Lcbc_dec_loop
1097
1098.Lcbc_dec_loop_finish:
1099        adds    x2, x2, #8                  // 0..7 blocks remain
1100        beq     .Lcbc_dec_done
1101
        // Dispatch on remaining block count; 2-7 blocks still go through
        // _bsaes_decrypt8 (extra lanes decrypt garbage that is discarded).
1102        ldr     q0, [x0], #16               // load input
1103        cmp     x2, #2
1104        blo     .Lcbc_dec_one
1105        ldr     q1, [x0], #16
1106        mov     x9, sp                      // pass the key
1107        mov     x10, x15
1108        beq     .Lcbc_dec_two
1109        ldr     q2, [x0], #16
1110        cmp     x2, #4
1111        blo     .Lcbc_dec_three
1112        ldr     q3, [x0], #16
1113        beq     .Lcbc_dec_four
1114        ldr     q4, [x0], #16
1115        cmp     x2, #6
1116        blo     .Lcbc_dec_five
1117        ldr     q5, [x0], #16
1118        beq     .Lcbc_dec_six
1119        ldr     q6, [x0], #-6*16            // seven blocks; rewind for the reload pass
1120
1121        bl      _bsaes_decrypt8
1122
1123        ldr     q5, [x0], #16               // reload input
1124        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1125        ldr     q8, [x0], #16
1126        ldr     q9, [x0], #16
1127        ldr     q10, [x0], #16
1128        str     q0, [x1], #16               // write output
1129        ldr     q0, [x0], #16
1130        eor     v1.16b, v1.16b, v5.16b
1131        ldr     q5, [x0], #16
1132        eor     v6.16b, v6.16b, v8.16b
1133        ldr     q15, [x0]                   // next IV = last ciphertext block
1134        eor     v4.16b, v4.16b, v9.16b
1135        eor     v2.16b, v2.16b, v10.16b
1136        str     q1, [x1], #16
1137        eor     v0.16b, v7.16b, v0.16b
1138        str     q6, [x1], #16
1139        eor     v1.16b, v3.16b, v5.16b
1140        str     q4, [x1], #16
1141        str     q2, [x1], #16
1142        str     q0, [x1], #16
1143        str     q1, [x1]
1144        b       .Lcbc_dec_done
1145.align  4
1146.Lcbc_dec_six:
1147        sub     x0, x0, #0x60
1148        bl      _bsaes_decrypt8
1149        ldr     q3, [x0], #16               // reload input
1150        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1151        ldr     q5, [x0], #16
1152        ldr     q8, [x0], #16
1153        ldr     q9, [x0], #16
1154        str     q0, [x1], #16               // write output
1155        ldr     q0, [x0], #16
1156        eor     v1.16b, v1.16b, v3.16b
1157        ldr     q15, [x0]                   // next IV = last ciphertext block
1158        eor     v3.16b, v6.16b, v5.16b
1159        eor     v4.16b, v4.16b, v8.16b
1160        eor     v2.16b, v2.16b, v9.16b
1161        str     q1, [x1], #16
1162        eor     v0.16b, v7.16b, v0.16b
1163        str     q3, [x1], #16
1164        str     q4, [x1], #16
1165        str     q2, [x1], #16
1166        str     q0, [x1]
1167        b       .Lcbc_dec_done
1168.align  4
1169.Lcbc_dec_five:
1170        sub     x0, x0, #0x50
1171        bl      _bsaes_decrypt8
1172        ldr     q3, [x0], #16               // reload input
1173        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1174        ldr     q5, [x0], #16
1175        ldr     q7, [x0], #16
1176        ldr     q8, [x0], #16
1177        str     q0, [x1], #16               // write output
1178        ldr     q15, [x0]                   // next IV = last ciphertext block
1179        eor     v0.16b, v1.16b, v3.16b
1180        eor     v1.16b, v6.16b, v5.16b
1181        eor     v3.16b, v4.16b, v7.16b
1182        str     q0, [x1], #16
1183        eor     v0.16b, v2.16b, v8.16b
1184        str     q1, [x1], #16
1185        str     q3, [x1], #16
1186        str     q0, [x1]
1187        b       .Lcbc_dec_done
1188.align  4
1189.Lcbc_dec_four:
1190        sub     x0, x0, #0x40
1191        bl      _bsaes_decrypt8
1192        ldr     q2, [x0], #16               // reload input
1193        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1194        ldr     q3, [x0], #16
1195        ldr     q5, [x0], #16
1196        str     q0, [x1], #16               // write output
1197        ldr     q15, [x0]                   // next IV = last ciphertext block
1198        eor     v0.16b, v1.16b, v2.16b
1199        eor     v1.16b, v6.16b, v3.16b
1200        eor     v2.16b, v4.16b, v5.16b
1201        str     q0, [x1], #16
1202        str     q1, [x1], #16
1203        str     q2, [x1]
1204        b       .Lcbc_dec_done
1205.align  4
1206.Lcbc_dec_three:
1207        sub     x0, x0, #0x30
1208        bl      _bsaes_decrypt8
1209        ldr     q2, [x0], #16               // reload input
1210        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1211        ldr     q3, [x0], #16
1212        ldr     q15, [x0]                   // next IV = last ciphertext block
1213        str     q0, [x1], #16               // write output
1214        eor     v0.16b, v1.16b, v2.16b
1215        eor     v1.16b, v6.16b, v3.16b
1216        str     q0, [x1], #16
1217        str     q1, [x1]
1218        b       .Lcbc_dec_done
1219.align  4
1220.Lcbc_dec_two:
1221        sub     x0, x0, #0x20
1222        bl      _bsaes_decrypt8
1223        ldr     q2, [x0], #16               // reload input
1224        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1225        ldr     q15, [x0]                   // next IV = last ciphertext block
1226        str     q0, [x1], #16               // write output
1227        eor     v0.16b, v1.16b, v2.16b
1228        str     q0, [x1]
1229        b       .Lcbc_dec_done
1230.align  4
1231.Lcbc_dec_one:
        // Single block: scalar AES_decrypt in place, then XOR the IV back in.
1232        sub     x0, x0, #0x10
1233        stp     x1, x4, [sp, #-32]!         // preserve out ptr / IV ptr across the call
1234        str     x14, [sp, #16]
1235        mov     v8.16b, v15.16b             // stash IV across AES_decrypt (d8 is callee-saved)
1236        mov     v15.16b, v0.16b             // this ciphertext block becomes the returned IV
1237        mov     x2, x3                      // AES_decrypt(in=x0, out=x1, key=x2)
1238        bl      AES_decrypt
1239        ldr     x14, [sp, #16]
1240        ldp     x1, x4, [sp], #32
1241        ldr     q0, [x1]                    // load result
1242        eor     v0.16b, v0.16b, v8.16b      // ^= IV
1243        str     q0, [x1]                    // write output
1244
1245.align  4
1246.Lcbc_dec_done:
1247        movi    v0.16b, #0
1248        movi    v1.16b, #0
1249.Lcbc_dec_bzero:// wipe key schedule [if any]
1250        stp     q0, q1, [sp], #32           // zero 32 bytes at a time until sp == x14
1251        cmp     sp, x14
1252        bne     .Lcbc_dec_bzero
1253        str     q15, [x4]                   // return IV
1254        ldp     d8, d9, [sp, #16]
1255        ldp     d10, d15, [sp, #32]
1256        ldp     x29, x30, [sp], #48
1257        ret
1258.size   ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1259
1260.globl  ossl_bsaes_ctr32_encrypt_blocks
1261.type   ossl_bsaes_ctr32_encrypt_blocks,%function
1262.align  4
1263// On entry:
1264//   x0 -> input text (whole 16-byte blocks)
1265//   x1 -> output text (whole 16-byte blocks)
1266//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
1267//   x3 -> key
1268//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
1269//   On exit:
1270//   Output text filled in
1271//   No output registers, usual AAPCS64 register preservation
//
// CTR mode: generates keystream for 8 counter values per iteration via
// _bsaes_encrypt8_alt and XORs it with the input.  Fewer than 8 blocks
// (initially, or as a tail) are handled one at a time with scalar
// AES_encrypt at .Lctr_enc_short / .Lctr_enc_loop_done.
1272ossl_bsaes_ctr32_encrypt_blocks:
1273
1274        cmp     x2, #8                      // use plain AES for
1275        blo     .Lctr_enc_short             // small sizes
1276
1277        stp     x29, x30, [sp, #-80]!
1278        stp     d8, d9, [sp, #16]
1279        stp     d10, d11, [sp, #32]
1280        stp     d12, d13, [sp, #48]
1281        stp     d14, d15, [sp, #64]
1282
1283        ldr     w15, [x3, #240]             // get # of rounds
1284        mov     x14, sp                     // x14 = frame base; bzero loop below unwinds sp back to it
1285
1286        // allocate the key schedule on the stack
1287        add     x17, sp, #96
1288        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes
1289
1290        // populate the key schedule
1291        mov     x9, x3                      // pass key
1292        mov     x10, x15                    // pass # of rounds
1293        mov     sp, x17                     // sp is sp
1294        bl      _bsaes_key_convert
1295        eor     v7.16b, v7.16b, v15.16b     // fix up last round key
1296        str     q7, [x17]                   // save last round key
1297
1298        ldr     q0, [x4]                    // load counter
1299        add     x13, x11, #.LREVM0SR-.LM0_bigendian  // x11 -> .LM0_bigendian after _bsaes_key_convert
1300        ldr     q4, [sp]                    // load round0 key
1301
1302        movi    v8.4s, #1                   // compose 1<<96
1303        movi    v9.16b, #0
1304        rev32   v15.16b, v0.16b             // keep counter byte-swapped in v15 for 32-bit lane adds
1305        rev32   v0.16b, v0.16b
1306        ext     v11.16b, v9.16b, v8.16b, #4
1307        rev32   v4.16b, v4.16b
1308        add     v12.4s, v11.4s, v11.4s      // compose 2<<96
1309        str     q4, [sp]                    // save adjusted round0 key
1310        add     v13.4s, v11.4s, v12.4s      // compose 3<<96
1311        add     v14.4s, v12.4s, v12.4s      // compose 4<<96
1312        b       .Lctr_enc_loop
1313
1314.align  4
1315.Lctr_enc_loop:
1316        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
1317        // to flip byte order in 32-bit counter
1318
1319        add     v1.4s, v15.4s, v11.4s       // +1
1320        add     x9, sp, #0x10               // pass next round key
1321        add     v2.4s, v15.4s, v12.4s       // +2
1322        ldr     q9, [x13]                   // .LREVM0SR
1323        ldr     q8, [sp]                    // load round0 key
1324        add     v3.4s, v15.4s, v13.4s       // +3
1325        mov     x10, x15                    // pass rounds
1326        sub     x11, x13, #.LREVM0SR-.LSR   // pass constants
1327        add     v6.4s, v2.4s, v14.4s        // +6
1328        add     v4.4s, v15.4s, v14.4s       // +4
1329        add     v7.4s, v3.4s, v14.4s        // +7
1330        add     v15.4s, v4.4s, v14.4s       // next counter
1331        add     v5.4s, v1.4s, v14.4s        // +5
1332
1333        bl      _bsaes_encrypt8_alt
1334
1335        subs    x2, x2, #8
1336        blo     .Lctr_enc_loop_done
1337
        // XOR keystream (returned permuted as v0,v1,v4,v6,v3,v7,v2,v5)
        // with the input blocks, in order.
1338        ldr     q16, [x0], #16
1339        ldr     q17, [x0], #16
1340        eor     v1.16b, v1.16b, v17.16b
1341        ldr     q17, [x0], #16
1342        eor     v0.16b, v0.16b, v16.16b
1343        eor     v4.16b, v4.16b, v17.16b
1344        str     q0, [x1], #16
1345        ldr     q16, [x0], #16
1346        str     q1, [x1], #16
1347        mov     v0.16b, v15.16b             // v0 = next counter for the following iteration
1348        str     q4, [x1], #16
1349        ldr     q1, [x0], #16
1350        eor     v4.16b, v6.16b, v16.16b
1351        eor     v1.16b, v3.16b, v1.16b
1352        ldr     q3, [x0], #16
1353        eor     v3.16b, v7.16b, v3.16b
1354        ldr     q6, [x0], #16
1355        eor     v2.16b, v2.16b, v6.16b
1356        ldr     q6, [x0], #16
1357        eor     v5.16b, v5.16b, v6.16b
1358        str     q4, [x1], #16
1359        str     q1, [x1], #16
1360        str     q3, [x1], #16
1361        str     q2, [x1], #16
1362        str     q5, [x1], #16
1363
1364        bne     .Lctr_enc_loop
1365        b       .Lctr_enc_done
1366
1367.align  4
1368.Lctr_enc_loop_done:
1369        add     x2, x2, #8                  // 1..7 blocks of keystream remain usable
1370        ldr     q16, [x0], #16              // load input
1371        eor     v0.16b, v0.16b, v16.16b
1372        str     q0, [x1], #16               // write output
1373        cmp     x2, #2
1374        blo     .Lctr_enc_done
1375        ldr     q17, [x0], #16
1376        eor     v1.16b, v1.16b, v17.16b
1377        str     q1, [x1], #16
1378        beq     .Lctr_enc_done
1379        ldr     q18, [x0], #16
1380        eor     v4.16b, v4.16b, v18.16b
1381        str     q4, [x1], #16
1382        cmp     x2, #4
1383        blo     .Lctr_enc_done
1384        ldr     q19, [x0], #16
1385        eor     v6.16b, v6.16b, v19.16b
1386        str     q6, [x1], #16
1387        beq     .Lctr_enc_done
1388        ldr     q20, [x0], #16
1389        eor     v3.16b, v3.16b, v20.16b
1390        str     q3, [x1], #16
1391        cmp     x2, #6
1392        blo     .Lctr_enc_done
1393        ldr     q21, [x0], #16
1394        eor     v7.16b, v7.16b, v21.16b
1395        str     q7, [x1], #16
1396        beq     .Lctr_enc_done
1397        ldr     q22, [x0]
1398        eor     v2.16b, v2.16b, v22.16b
1399        str     q2, [x1], #16
1400
1401.Lctr_enc_done:
1402        movi    v0.16b, #0
1403        movi    v1.16b, #0
1404.Lctr_enc_bzero: // wipe key schedule [if any]
1405        stp     q0, q1, [sp], #32           // zero 32 bytes at a time until sp == x14
1406        cmp     sp, x14
1407        bne     .Lctr_enc_bzero
1408
1409        ldp     d8, d9, [sp, #16]
1410        ldp     d10, d11, [sp, #32]
1411        ldp     d12, d13, [sp, #48]
1412        ldp     d14, d15, [sp, #64]
1413        ldp     x29, x30, [sp], #80
1414        ret
1415
        // Scalar path: one AES_encrypt per block, counter kept on the stack
        // at sp+80 and its low (big-endian) word incremented in w23.
1416.Lctr_enc_short:
1417        stp     x29, x30, [sp, #-96]!
1418        stp     x19, x20, [sp, #16]
1419        stp     x21, x22, [sp, #32]
1420        str     x23, [sp, #48]
1421
1422        mov     x19, x0                     // copy arguments
1423        mov     x20, x1
1424        mov     x21, x2
1425        mov     x22, x3
1426        ldr     w23, [x4, #12]              // load counter .LSW
1427        ldr     q1, [x4]                    // load whole counter value
1428#ifdef __AARCH64EL__
1429        rev     w23, w23
1430#endif
1431        str     q1, [sp, #80]               // copy counter value
1432
1433.Lctr_enc_short_loop:
1434        add     x0, sp, #80                 // input counter value
1435        add     x1, sp, #64                 // output on the stack
1436        mov     x2, x22                     // key
1437
1438        bl      AES_encrypt
1439
1440        ldr     q0, [x19], #16              // load input
1441        ldr     q1, [sp, #64]               // load encrypted counter
1442        add     x23, x23, #1
1443#ifdef __AARCH64EL__
1444        rev     w0, w23
1445        str     w0, [sp, #80+12]            // next counter value
1446#else
1447        str     w23, [sp, #80+12]           // next counter value
1448#endif
1449        eor     v0.16b, v0.16b, v1.16b
1450        str     q0, [x20], #16              // store output
1451        subs    x21, x21, #1
1452        bne     .Lctr_enc_short_loop
1453
1454        movi    v0.16b, #0
1455        movi    v1.16b, #0
1456        stp     q0, q1, [sp, #64]           // wipe keystream and counter scratch space
1457
1458        ldr     x23, [sp, #48]
1459        ldp     x21, x22, [sp, #32]
1460        ldp     x19, x20, [sp, #16]
1461        ldp     x29, x30, [sp], #96
1462        ret
1463.size   ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
1464
1465.globl  ossl_bsaes_xts_encrypt
1466.type   ossl_bsaes_xts_encrypt,%function
1467.align  4
1468// On entry:
1469//   x0 -> input plaintext
1470//   x1 -> output ciphertext
1471//   x2 -> length of text in bytes (must be at least 16)
1472//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
1473//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
1474//   x5 -> 16-byte initial vector (typically, sector number)
1475// On exit:
1476//   Output ciphertext filled in
1477//   No output registers, usual AAPCS64 register preservation
1478ossl_bsaes_xts_encrypt:
1479        // Stack layout:
1480        // sp ->
1481        //        nrounds*128-96 bytes: key schedule
1482        // x19 ->
1483        //        16 bytes: frame record
1484        //        4*16 bytes: tweak storage across _bsaes_encrypt8
1485        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
1486        //        8*8 bytes: storage for 8 callee-saved SIMD registers
1487        stp     x29, x30, [sp, #-192]!
1488        stp     x19, x20, [sp, #80]
1489        stp     x21, x22, [sp, #96]
1490        str     x23, [sp, #112]
1491        stp     d8, d9, [sp, #128]
1492        stp     d10, d11, [sp, #144]
1493        stp     d12, d13, [sp, #160]
1494        stp     d14, d15, [sp, #176]
1495
1496        mov     x19, sp
1497        mov     x20, x0
1498        mov     x21, x1
1499        mov     x22, x2
1500        mov     x23, x3
1501
1502        // generate initial tweak
1503        sub     sp, sp, #16
1504        mov     x0, x5                      // iv[]
1505        mov     x1, sp
1506        mov     x2, x4                      // key2
1507        bl      AES_encrypt
1508        ldr     q11, [sp], #16
1509
1510        ldr     w1, [x23, #240]             // get # of rounds
1511        // allocate the key schedule on the stack
1512        add     x17, sp, #96
1513        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
1514
1515        // populate the key schedule
1516        mov     x9, x23                     // pass key
1517        mov     x10, x1                     // pass # of rounds
1518        mov     sp, x17
1519        bl      _bsaes_key_convert
1520        eor     v15.16b, v15.16b, v7.16b    // fix up last round key
1521        str     q15, [x17]                  // save last round key
1522
1523        subs    x22, x22, #0x80
1524        blo     .Lxts_enc_short
1525        b       .Lxts_enc_loop
1526
1527.align  4
1528.Lxts_enc_loop:
1529        ldr     q8, .Lxts_magic
1530        mov     x10, x1                     // pass rounds
1531        add     x2, x19, #16
1532        ldr     q0, [x20], #16
1533        sshr    v1.2d, v11.2d, #63
1534        mov     x9, sp                      // pass key schedule
1535        ldr     q6, .Lxts_magic+16
1536        add     v2.2d, v11.2d, v11.2d
1537        cmtst   v3.2d, v11.2d, v6.2d
1538        and     v1.16b, v1.16b, v8.16b
1539        ext     v1.16b, v1.16b, v1.16b, #8
1540        and     v3.16b, v3.16b, v8.16b
1541        ldr     q4, [x20], #16
1542        eor     v12.16b, v2.16b, v1.16b
1543        eor     v1.16b, v4.16b, v12.16b
1544        eor     v0.16b, v0.16b, v11.16b
1545        cmtst   v2.2d, v12.2d, v6.2d
1546        add     v4.2d, v12.2d, v12.2d
1547        add     x0, x19, #16
1548        ext     v3.16b, v3.16b, v3.16b, #8
1549        and     v2.16b, v2.16b, v8.16b
1550        eor     v13.16b, v4.16b, v3.16b
1551        ldr     q3, [x20], #16
1552        ext     v4.16b, v2.16b, v2.16b, #8
1553        eor     v2.16b, v3.16b, v13.16b
1554        ldr     q3, [x20], #16
1555        add     v5.2d, v13.2d, v13.2d
1556        cmtst   v7.2d, v13.2d, v6.2d
1557        and     v7.16b, v7.16b, v8.16b
1558        ldr     q9, [x20], #16
1559        ext     v7.16b, v7.16b, v7.16b, #8
1560        ldr     q10, [x20], #16
1561        eor     v14.16b, v5.16b, v4.16b
1562        ldr     q16, [x20], #16
1563        add     v4.2d, v14.2d, v14.2d
1564        eor     v3.16b, v3.16b, v14.16b
1565        eor     v15.16b, v4.16b, v7.16b
1566        add     v5.2d, v15.2d, v15.2d
1567        ldr     q7, [x20], #16
1568        cmtst   v4.2d, v14.2d, v6.2d
1569        and     v17.16b, v4.16b, v8.16b
1570        cmtst   v18.2d, v15.2d, v6.2d
1571        eor     v4.16b, v9.16b, v15.16b
1572        ext     v9.16b, v17.16b, v17.16b, #8
1573        eor     v9.16b, v5.16b, v9.16b
1574        add     v17.2d, v9.2d, v9.2d
1575        and     v18.16b, v18.16b, v8.16b
1576        eor     v5.16b, v10.16b, v9.16b
1577        str     q9, [x2], #16
1578        ext     v10.16b, v18.16b, v18.16b, #8
1579        cmtst   v9.2d, v9.2d, v6.2d
1580        and     v9.16b, v9.16b, v8.16b
1581        eor     v10.16b, v17.16b, v10.16b
1582        cmtst   v17.2d, v10.2d, v6.2d
1583        eor     v6.16b, v16.16b, v10.16b
1584        str     q10, [x2], #16
1585        ext     v9.16b, v9.16b, v9.16b, #8
1586        add     v10.2d, v10.2d, v10.2d
1587        eor     v9.16b, v10.16b, v9.16b
1588        str     q9, [x2], #16
1589        eor     v7.16b, v7.16b, v9.16b
1590        add     v9.2d, v9.2d, v9.2d
1591        and     v8.16b, v17.16b, v8.16b
1592        ext     v8.16b, v8.16b, v8.16b, #8
1593        eor     v8.16b, v9.16b, v8.16b
1594        str     q8, [x2]                    // next round tweak
1595
1596        bl      _bsaes_encrypt8
1597
1598        ldr     q8, [x0], #16
1599        eor     v0.16b, v0.16b, v11.16b
1600        eor     v1.16b, v1.16b, v12.16b
1601        ldr     q9, [x0], #16
1602        eor     v4.16b, v4.16b, v13.16b
1603        eor     v6.16b, v6.16b, v14.16b
1604        ldr     q10, [x0], #16
1605        eor     v3.16b, v3.16b, v15.16b
1606        subs    x22, x22, #0x80
1607        str     q0, [x21], #16
1608        ldr     q11, [x0]                   // next round tweak
1609        str     q1, [x21], #16
1610        eor     v0.16b, v7.16b, v8.16b
1611        eor     v1.16b, v2.16b, v9.16b
1612        str     q4, [x21], #16
1613        eor     v2.16b, v5.16b, v10.16b
1614        str     q6, [x21], #16
1615        str     q3, [x21], #16
1616        str     q0, [x21], #16
1617        str     q1, [x21], #16
1618        str     q2, [x21], #16
1619        bpl     .Lxts_enc_loop
1620
1621.Lxts_enc_short:
1622        adds    x22, x22, #0x70
1623        bmi     .Lxts_enc_done
1624
1625        ldr     q8, .Lxts_magic
1626        sshr    v1.2d, v11.2d, #63
1627        add     v2.2d, v11.2d, v11.2d
1628        ldr     q9, .Lxts_magic+16
1629        subs    x22, x22, #0x10
1630        ldr     q0, [x20], #16
1631        and     v1.16b, v1.16b, v8.16b
1632        cmtst   v3.2d, v11.2d, v9.2d
1633        ext     v1.16b, v1.16b, v1.16b, #8
1634        and     v3.16b, v3.16b, v8.16b
1635        eor     v12.16b, v2.16b, v1.16b
1636        ext     v1.16b, v3.16b, v3.16b, #8
1637        add     v2.2d, v12.2d, v12.2d
1638        cmtst   v3.2d, v12.2d, v9.2d
1639        eor     v13.16b, v2.16b, v1.16b
1640        and     v22.16b, v3.16b, v8.16b
1641        bmi     .Lxts_enc_1
1642
1643        ext     v2.16b, v22.16b, v22.16b, #8
1644        add     v3.2d, v13.2d, v13.2d
1645        ldr     q1, [x20], #16
1646        cmtst   v4.2d, v13.2d, v9.2d
1647        subs    x22, x22, #0x10
1648        eor     v14.16b, v3.16b, v2.16b
1649        and     v23.16b, v4.16b, v8.16b
1650        bmi     .Lxts_enc_2
1651
1652        ext     v3.16b, v23.16b, v23.16b, #8
1653        add     v4.2d, v14.2d, v14.2d
1654        ldr     q2, [x20], #16
1655        cmtst   v5.2d, v14.2d, v9.2d
1656        eor     v0.16b, v0.16b, v11.16b
1657        subs    x22, x22, #0x10
1658        eor     v15.16b, v4.16b, v3.16b
1659        and     v24.16b, v5.16b, v8.16b
1660        bmi     .Lxts_enc_3
1661
1662        ext     v4.16b, v24.16b, v24.16b, #8
1663        add     v5.2d, v15.2d, v15.2d
1664        ldr     q3, [x20], #16
1665        cmtst   v6.2d, v15.2d, v9.2d
1666        eor     v1.16b, v1.16b, v12.16b
1667        subs    x22, x22, #0x10
1668        eor     v16.16b, v5.16b, v4.16b
1669        and     v25.16b, v6.16b, v8.16b
1670        bmi     .Lxts_enc_4
1671
1672        ext     v5.16b, v25.16b, v25.16b, #8
1673        add     v6.2d, v16.2d, v16.2d
1674        add     x0, x19, #16
1675        cmtst   v7.2d, v16.2d, v9.2d
1676        ldr     q4, [x20], #16
1677        eor     v2.16b, v2.16b, v13.16b
1678        str     q16, [x0], #16
1679        subs    x22, x22, #0x10
1680        eor     v17.16b, v6.16b, v5.16b
1681        and     v26.16b, v7.16b, v8.16b
1682        bmi     .Lxts_enc_5
1683
1684        ext     v7.16b, v26.16b, v26.16b, #8
1685        add     v18.2d, v17.2d, v17.2d
1686        ldr     q5, [x20], #16
1687        eor     v3.16b, v3.16b, v14.16b
1688        str     q17, [x0], #16
1689        subs    x22, x22, #0x10
1690        eor     v18.16b, v18.16b, v7.16b
1691        bmi     .Lxts_enc_6
1692
1693        ldr     q6, [x20], #16
1694        eor     v4.16b, v4.16b, v15.16b
1695        eor     v5.16b, v5.16b, v16.16b
1696        str     q18, [x0]                   // next round tweak
1697        mov     x9, sp                      // pass key schedule
1698        mov     x10, x1
1699        add     x0, x19, #16
1700        sub     x22, x22, #0x10
1701        eor     v6.16b, v6.16b, v17.16b
1702
1703        bl      _bsaes_encrypt8
1704
1705        ldr     q16, [x0], #16
1706        eor     v0.16b, v0.16b, v11.16b
1707        eor     v1.16b, v1.16b, v12.16b
1708        ldr     q17, [x0], #16
1709        eor     v4.16b, v4.16b, v13.16b
1710        eor     v6.16b, v6.16b, v14.16b
1711        eor     v3.16b, v3.16b, v15.16b
1712        ldr     q11, [x0]                   // next round tweak
1713        str     q0, [x21], #16
1714        str     q1, [x21], #16
1715        eor     v0.16b, v7.16b, v16.16b
1716        eor     v1.16b, v2.16b, v17.16b
1717        str     q4, [x21], #16
1718        str     q6, [x21], #16
1719        str     q3, [x21], #16
1720        str     q0, [x21], #16
1721        str     q1, [x21], #16
1722        b       .Lxts_enc_done
1723
1724.align  4
1725.Lxts_enc_6:
1726        eor     v4.16b, v4.16b, v15.16b
1727        eor     v5.16b, v5.16b, v16.16b
1728        mov     x9, sp                      // pass key schedule
1729        mov     x10, x1                     // pass rounds
1730        add     x0, x19, #16
1731
1732        bl      _bsaes_encrypt8
1733
1734        ldr     q16, [x0], #16
1735        eor     v0.16b, v0.16b, v11.16b
1736        eor     v1.16b, v1.16b, v12.16b
1737        eor     v4.16b, v4.16b, v13.16b
1738        eor     v6.16b, v6.16b, v14.16b
1739        ldr     q11, [x0]                   // next round tweak
1740        eor     v3.16b, v3.16b, v15.16b
1741        str     q0, [x21], #16
1742        str     q1, [x21], #16
1743        eor     v0.16b, v7.16b, v16.16b
1744        str     q4, [x21], #16
1745        str     q6, [x21], #16
1746        str     q3, [x21], #16
1747        str     q0, [x21], #16
1748        b       .Lxts_enc_done
1749
1750.align  4
1751.Lxts_enc_5:
1752        eor     v3.16b, v3.16b, v14.16b
1753        eor     v4.16b, v4.16b, v15.16b
1754        mov     x9, sp                      // pass key schedule
1755        mov     x10, x1                     // pass rounds
1756        add     x0, x19, #16
1757
1758        bl      _bsaes_encrypt8
1759
1760        eor     v0.16b, v0.16b, v11.16b
1761        eor     v1.16b, v1.16b, v12.16b
1762        ldr     q11, [x0]                   // next round tweak
1763        eor     v4.16b, v4.16b, v13.16b
1764        eor     v6.16b, v6.16b, v14.16b
1765        eor     v3.16b, v3.16b, v15.16b
1766        str     q0, [x21], #16
1767        str     q1, [x21], #16
1768        str     q4, [x21], #16
1769        str     q6, [x21], #16
1770        str     q3, [x21], #16
1771        b       .Lxts_enc_done
1772
1773.align  4
1774.Lxts_enc_4:
1775        eor     v2.16b, v2.16b, v13.16b
1776        eor     v3.16b, v3.16b, v14.16b
1777        mov     x9, sp                      // pass key schedule
1778        mov     x10, x1                     // pass rounds
1779        add     x0, x19, #16
1780
1781        bl      _bsaes_encrypt8
1782
1783        eor     v0.16b, v0.16b, v11.16b
1784        eor     v1.16b, v1.16b, v12.16b
1785        eor     v4.16b, v4.16b, v13.16b
1786        eor     v6.16b, v6.16b, v14.16b
1787        mov     v11.16b, v15.16b            // next round tweak
1788        str     q0, [x21], #16
1789        str     q1, [x21], #16
1790        str     q4, [x21], #16
1791        str     q6, [x21], #16
1792        b       .Lxts_enc_done
1793
1794.align  4
1795.Lxts_enc_3:
1796        eor     v1.16b, v1.16b, v12.16b
1797        eor     v2.16b, v2.16b, v13.16b
1798        mov     x9, sp                      // pass key schedule
1799        mov     x10, x1                     // pass rounds
1800        add     x0, x19, #16
1801
1802        bl      _bsaes_encrypt8
1803
1804        eor     v0.16b, v0.16b, v11.16b
1805        eor     v1.16b, v1.16b, v12.16b
1806        eor     v4.16b, v4.16b, v13.16b
1807        mov     v11.16b, v14.16b            // next round tweak
1808        str     q0, [x21], #16
1809        str     q1, [x21], #16
1810        str     q4, [x21], #16
1811        b       .Lxts_enc_done
1812
1813.align  4
1814.Lxts_enc_2:
1815        eor     v0.16b, v0.16b, v11.16b
1816        eor     v1.16b, v1.16b, v12.16b
1817        mov     x9, sp                      // pass key schedule
1818        mov     x10, x1                     // pass rounds
1819        add     x0, x19, #16
1820
1821        bl      _bsaes_encrypt8
1822
1823        eor     v0.16b, v0.16b, v11.16b
1824        eor     v1.16b, v1.16b, v12.16b
1825        mov     v11.16b, v13.16b            // next round tweak
1826        str     q0, [x21], #16
1827        str     q1, [x21], #16
1828        b       .Lxts_enc_done
1829
1830.align  4
1831.Lxts_enc_1:
1832        eor     v0.16b, v0.16b, v11.16b
1833        sub     x0, sp, #16
1834        sub     x1, sp, #16
1835        mov     x2, x23
1836        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
1837        mov     v14.d[0], v12.d[1]
1838        str     q0, [sp, #-16]!
1839
1840        bl      AES_encrypt
1841
1842        ldr     q0, [sp], #16
1843        trn1    v13.2d, v11.2d, v13.2d
1844        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
1845        eor     v0.16b, v0.16b, v13.16b
1846        str     q0, [x21], #16
1847
1848.Lxts_enc_done:
1849        adds    x22, x22, #0x10
1850        beq     .Lxts_enc_ret
1851
1852        sub     x6, x21, #0x10
1853        // Penultimate plaintext block produces final ciphertext part-block
1854        // plus remaining part of final plaintext block. Move ciphertext part
1855        // to final position and re-use penultimate ciphertext block buffer to
1856        // construct final plaintext block
1857.Lxts_enc_steal:
1858        ldrb    w0, [x20], #1
1859        ldrb    w1, [x21, #-0x10]
1860        strb    w0, [x21, #-0x10]
1861        strb    w1, [x21], #1
1862
1863        subs    x22, x22, #1
1864        bhi     .Lxts_enc_steal
1865
1866        // Finally encrypt the penultimate ciphertext block using the
1867        // last tweak
1868        ldr     q0, [x6]
1869        eor     v0.16b, v0.16b, v11.16b
1870        str     q0, [sp, #-16]!
1871        mov     x0, sp
1872        mov     x1, sp
1873        mov     x2, x23
1874        mov     x21, x6
1875        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
1876
1877        bl      AES_encrypt
1878
1879        trn1    v11.2d, v11.2d, v13.2d
1880        ldr     q0, [sp], #16
1881        eor     v0.16b, v0.16b, v11.16b
1882        str     q0, [x21]
1883
1884.Lxts_enc_ret:
1885
1886        movi    v0.16b, #0
1887        movi    v1.16b, #0
1888.Lxts_enc_bzero: // wipe key schedule
1889        stp     q0, q1, [sp], #32
1890        cmp     sp, x19
1891        bne     .Lxts_enc_bzero
1892
1893        ldp     x19, x20, [sp, #80]
1894        ldp     x21, x22, [sp, #96]
1895        ldr     x23, [sp, #112]
1896        ldp     d8, d9, [sp, #128]
1897        ldp     d10, d11, [sp, #144]
1898        ldp     d12, d13, [sp, #160]
1899        ldp     d14, d15, [sp, #176]
1900        ldp     x29, x30, [sp], #192
1901        ret
1902.size   ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
1903
// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
// Constants for the XTS tweak update (GF(2^128) doubling):
//   first pair  {1, 0x87}: per-lane masks AND-ed with the sign-extended top
//     bits when doubling a 128-bit tweak; 0x87 encodes the XTS feedback
//     polynomial x^7 + x^2 + x + 1 folded in when bit 127 carries out
//   second pair {1<<62, 1<<62}: used with cmtst to test bit 62 of each
//     64-bit lane — i.e. the bit that becomes the carry once the value has
//     been doubled. NOTE(review): this appears to let the carry for the
//     *next* doubling be computed one step ahead in the pipelined tweak
//     sequences; confirm against the .Lxts_enc_loop/.Lxts_dec_loop code.
.align  5
.Lxts_magic:
.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000
1909
.globl  ossl_bsaes_xts_decrypt
.type   ossl_bsaes_xts_decrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 -> length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_decrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
        stp     x19, x20, [sp, #80]
        stp     x21, x22, [sp, #96]
        str     x23, [sp, #112]
        stp     d8, d9, [sp, #128]
        stp     d10, d11, [sp, #144]
        stp     d12, d13, [sp, #160]
        stp     d14, d15, [sp, #176]

        // Stash arguments in callee-saved registers: x19 = frame base,
        // x20 = input ptr, x21 = output ptr, x22 = byte count, x23 = key1
        mov     x19, sp
        mov     x20, x0
        mov     x21, x1
        mov     x22, x2
        mov     x23, x3

        // generate initial tweak
        sub     sp, sp, #16
        mov     x0, x5                      // iv[]
        mov     x1, sp
        mov     x2, x4                      // key2
        bl      AES_encrypt
        ldr     q11, [sp], #16              // v11 = initial tweak, kept live throughout

        ldr     w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x23                     // pass key
        mov     x10, x1                     // pass # of rounds
        mov     sp, x17
        bl      _bsaes_key_convert
        ldr     q6,  [sp]
        str     q15, [x17]                  // save last round key
        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        // If the length is not a multiple of 16, reserve the final partial
        // block (and the full block before it) for ciphertext stealing.
        // x30 (lr) is reused as scratch here; the real return address was
        // saved in the frame record above.
        sub     x30, x22, #0x10
        tst     x22, #0xf                   // if not multiple of 16
        csel    x22, x30, x22, ne           // subtract another 16 bytes
        subs    x22, x22, #0x80

        blo     .Lxts_dec_short
        b       .Lxts_dec_loop

// Main loop: decrypts 8 blocks per iteration while at least 0x80 bytes
// remain. The eight tweaks are derived from v11 by repeated GF(2^128)
// doubling, interleaved with the input loads and input-XOR-tweak steps;
// tweaks 6..8 plus the next iteration's tweak are spilled to the 4*16-byte
// area at [x19, #16] (x2) because _bsaes_decrypt8 uses those registers.
.align  4
.Lxts_dec_loop:
        ldr     q8, .Lxts_magic             // v8 = {1, 0x87} doubling masks
        mov     x10, x1                     // pass rounds
        add     x2, x19, #16                // x2 -> tweak spill area
        ldr     q0, [x20], #16
        sshr    v1.2d, v11.2d, #63          // per-lane sign -> carry masks
        mov     x9, sp                      // pass key schedule
        ldr     q6, .Lxts_magic+16          // v6 = {1<<62, 1<<62} carry-ahead test mask
        add     v2.2d, v11.2d, v11.2d       // double both lanes of the tweak
        cmtst   v3.2d, v11.2d, v6.2d
        and     v1.16b, v1.16b, v8.16b
        ext     v1.16b, v1.16b, v1.16b, #8  // swap lanes: route each carry to the other lane
        and     v3.16b, v3.16b, v8.16b
        ldr     q4, [x20], #16
        eor     v12.16b, v2.16b, v1.16b     // v12 = tweak 2
        eor     v1.16b, v4.16b, v12.16b
        eor     v0.16b, v0.16b, v11.16b
        cmtst   v2.2d, v12.2d, v6.2d
        add     v4.2d, v12.2d, v12.2d
        add     x0, x19, #16
        ext     v3.16b, v3.16b, v3.16b, #8
        and     v2.16b, v2.16b, v8.16b
        eor     v13.16b, v4.16b, v3.16b     // v13 = tweak 3
        ldr     q3, [x20], #16
        ext     v4.16b, v2.16b, v2.16b, #8
        eor     v2.16b, v3.16b, v13.16b
        ldr     q3, [x20], #16
        add     v5.2d, v13.2d, v13.2d
        cmtst   v7.2d, v13.2d, v6.2d
        and     v7.16b, v7.16b, v8.16b
        ldr     q9, [x20], #16
        ext     v7.16b, v7.16b, v7.16b, #8
        ldr     q10, [x20], #16
        eor     v14.16b, v5.16b, v4.16b     // v14 = tweak 4
        ldr     q16, [x20], #16
        add     v4.2d, v14.2d, v14.2d
        eor     v3.16b, v3.16b, v14.16b
        eor     v15.16b, v4.16b, v7.16b     // v15 = tweak 5
        add     v5.2d, v15.2d, v15.2d
        ldr     q7, [x20], #16
        cmtst   v4.2d, v14.2d, v6.2d
        and     v17.16b, v4.16b, v8.16b
        cmtst   v18.2d, v15.2d, v6.2d
        eor     v4.16b, v9.16b, v15.16b
        ext     v9.16b, v17.16b, v17.16b, #8
        eor     v9.16b, v5.16b, v9.16b      // v9 = tweak 6
        add     v17.2d, v9.2d, v9.2d
        and     v18.16b, v18.16b, v8.16b
        eor     v5.16b, v10.16b, v9.16b
        str     q9, [x2], #16               // spill tweak 6
        ext     v10.16b, v18.16b, v18.16b, #8
        cmtst   v9.2d, v9.2d, v6.2d
        and     v9.16b, v9.16b, v8.16b
        eor     v10.16b, v17.16b, v10.16b   // v10 = tweak 7
        cmtst   v17.2d, v10.2d, v6.2d
        eor     v6.16b, v16.16b, v10.16b
        str     q10, [x2], #16              // spill tweak 7
        ext     v9.16b, v9.16b, v9.16b, #8
        add     v10.2d, v10.2d, v10.2d
        eor     v9.16b, v10.16b, v9.16b     // v9 = tweak 8
        str     q9, [x2], #16               // spill tweak 8
        eor     v7.16b, v7.16b, v9.16b
        add     v9.2d, v9.2d, v9.2d
        and     v8.16b, v17.16b, v8.16b
        ext     v8.16b, v8.16b, v8.16b, #8
        eor     v8.16b, v9.16b, v8.16b
        str     q8, [x2]                    // next round tweak

        bl      _bsaes_decrypt8

        // XOR the eight decrypted blocks with their tweaks (reloading the
        // spilled tweaks 6..8 from [x0]) and store, un-permuting the block
        // order produced by _bsaes_decrypt8.
        eor     v6.16b, v6.16b, v13.16b
        eor     v0.16b, v0.16b, v11.16b
        ldr     q8, [x0], #16
        eor     v7.16b, v7.16b, v8.16b
        str     q0, [x21], #16
        eor     v0.16b, v1.16b, v12.16b
        ldr     q1, [x0], #16
        eor     v1.16b, v3.16b, v1.16b
        subs    x22, x22, #0x80
        eor     v2.16b, v2.16b, v15.16b
        eor     v3.16b, v4.16b, v14.16b
        ldr     q4, [x0], #16
        str     q0, [x21], #16
        ldr     q11, [x0]                   // next round tweak
        eor     v0.16b, v5.16b, v4.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q2, [x21], #16
        str     q7, [x21], #16
        str     q1, [x21], #16
        str     q0, [x21], #16
        bpl     .Lxts_dec_loop

// Tail: fewer than 8 whole blocks remain. Tweaks are generated one block
// ahead of each load; each `bmi` exits as soon as the count runs out, to a
// handler that decrypts exactly the blocks loaded so far.
.Lxts_dec_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_dec_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b     // v12 = tweak 2
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b     // v13 = tweak 3
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_dec_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b     // v14 = tweak 4
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_dec_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b     // v15 = tweak 5
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_dec_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b     // v16 = tweak 6
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_dec_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16              // spill tweak 6 (v16 needed by _bsaes_decrypt8)
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b     // v17 = tweak 7
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_dec_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16              // spill tweak 7
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b    // v18 = tweak 8 (= next round tweak here)
        bmi     .Lxts_dec_6

        // Exactly 7 blocks remain
        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]                   // next round tweak
        mov     x9, sp                      // pass key schedule
        mov     x10, x1
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        ldr     q11, [x0]                   // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v3.16b, v17.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

// 6 blocks remain
.align  4
.Lxts_dec_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_dec_done

// 5 blocks remain
.align  4
.Lxts_dec_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        b       .Lxts_dec_done

// 4 blocks remain; tweak 5 (v15) becomes the next round tweak
.align  4
.Lxts_dec_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        mov     v11.16b, v15.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_dec_done

// 3 blocks remain; tweak 4 (v14) becomes the next round tweak
.align  4
.Lxts_dec_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        mov     v11.16b, v14.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_dec_done

// 2 blocks remain; tweak 3 (v13) becomes the next round tweak
.align  4
.Lxts_dec_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

// 1 block remains: decrypt it with plain AES_decrypt rather than the
// bitsliced path (which only pays off for 8 blocks at a time)
.align  4
.Lxts_dec_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!

        bl      AES_decrypt

        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d      // reassemble full tweak from saved halves
        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_dec_done:
        adds    x22, x22, #0x10             // non-zero iff ciphertext stealing needed
        beq     .Lxts_dec_ret

        // calculate one round of extra tweak for the stolen ciphertext
        ldr     q8, .Lxts_magic
        sshr    v6.2d, v11.2d, #63
        and     v6.16b, v6.16b, v8.16b
        add     v12.2d, v11.2d, v11.2d
        ext     v6.16b, v6.16b, v6.16b, #8
        eor     v12.16b, v12.16b, v6.16b

        // perform the final decryption with the last tweak value
        ldr     q0, [x20], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]

        bl      AES_decrypt

        trn1    v12.2d, v12.2d, v14.2d      // reassemble both tweaks from saved halves
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [x21]

        mov     x6, x21
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and re-use penultimate plaintext block buffer to
        // construct final ciphertext block
.Lxts_dec_steal:
        ldrb    w1, [x21]
        ldrb    w0, [x20], #1
        strb    w1, [x21, #0x10]
        strb    w0, [x21], #1

        subs    x22, x22, #1
        bhi     .Lxts_dec_steal

        // Finally decrypt the penultimate plaintext block using the
        // penultimate tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6

        bl      AES_decrypt

        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_dec_ret:

        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_dec_bzero

        // Restore callee-saved registers and deallocate the frame
        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size   ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
2379