#!/usr/bin/env perl
# Copyright 2020-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2024 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.
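
// The routines below keep eight AES blocks bit-sliced across v0-v7:
// the ushr/eor/and/shl runs in _bsaes_encrypt8 and _bsaes_decrypt8 are
// an in-register bit-matrix transpose (the swapmove operation of the
// AArch32 source) using the masks 0x55, 0x33 and 0x0f, after which the
// S-box can be evaluated as boolean logic on whole vectors.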

#include "crypto/arm_arch.h"

.text

.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt

.type _bsaes_decrypt8,%function
.align 4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_decrypt8:
        ldr q8, [x9], #16
        adr x11, .LM0ISR
        movi v9.16b, #0x55
        ldr q10, [x11], #16
        movi v16.16b, #0x33
        movi v17.16b, #0x0f
        sub x10, x10, #1
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v8.16b
        eor v2.16b, v2.16b, v8.16b
        eor v4.16b, v4.16b, v8.16b
        eor v3.16b, v3.16b, v8.16b
        eor v5.16b, v5.16b, v8.16b
        tbl v0.16b, {v0.16b}, v10.16b
        tbl v1.16b, {v1.16b}, v10.16b
        tbl v2.16b, {v2.16b}, v10.16b
        tbl v4.16b, {v4.16b}, v10.16b
        eor v6.16b, v6.16b, v8.16b
        eor v7.16b, v7.16b, v8.16b
        tbl v3.16b, {v3.16b}, v10.16b
        tbl v5.16b, {v5.16b}, v10.16b
        tbl v6.16b, {v6.16b}, v10.16b
        ushr v8.2d, v0.2d, #1
        tbl v7.16b, {v7.16b}, v10.16b
        ushr v10.2d, v4.2d, #1
        ushr v18.2d, v2.2d, #1
        eor v8.16b, v8.16b, v1.16b
        ushr v19.2d, v6.2d, #1
        eor v10.16b, v10.16b, v5.16b
        eor v18.16b, v18.16b, v3.16b
        and v8.16b, v8.16b, v9.16b
        eor v19.16b, v19.16b, v7.16b
        and v10.16b, v10.16b, v9.16b
        and v18.16b, v18.16b, v9.16b
        eor v1.16b, v1.16b, v8.16b
        shl v8.2d, v8.2d, #1
        and v9.16b, v19.16b, v9.16b
        eor v5.16b, v5.16b, v10.16b
        shl v10.2d, v10.2d, #1
        eor v3.16b, v3.16b, v18.16b
        shl v18.2d, v18.2d, #1
        eor v0.16b, v0.16b, v8.16b
        shl v8.2d, v9.2d, #1
        eor v7.16b, v7.16b, v9.16b
        eor v4.16b, v4.16b, v10.16b
        eor v2.16b, v2.16b, v18.16b
        ushr v9.2d, v1.2d, #2
        eor v6.16b, v6.16b, v8.16b
        ushr v8.2d, v0.2d, #2
        ushr v10.2d, v5.2d, #2
        ushr v18.2d, v4.2d, #2
        eor v9.16b, v9.16b, v3.16b
        eor v8.16b, v8.16b, v2.16b
        eor v10.16b, v10.16b, v7.16b
        eor v18.16b, v18.16b, v6.16b
        and v9.16b, v9.16b, v16.16b
        and v8.16b, v8.16b, v16.16b
        and v10.16b, v10.16b, v16.16b
        and v16.16b, v18.16b, v16.16b
        eor v3.16b, v3.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v2.16b, v2.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v7.16b, v7.16b, v10.16b
        shl v10.2d, v10.2d, #2
        eor v6.16b, v6.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v1.16b, v1.16b, v9.16b
        eor v0.16b, v0.16b, v8.16b
        eor v5.16b, v5.16b, v10.16b
        eor v4.16b, v4.16b, v16.16b
        ushr v8.2d, v3.2d, #4
        ushr v9.2d, v2.2d, #4
        ushr v10.2d, v1.2d, #4
        ushr v16.2d, v0.2d, #4
        eor v8.16b, v8.16b, v7.16b
        eor v9.16b, v9.16b, v6.16b
        eor v10.16b, v10.16b, v5.16b
        eor v16.16b, v16.16b, v4.16b
        and v8.16b, v8.16b, v17.16b
        and v9.16b, v9.16b, v17.16b
        and v10.16b, v10.16b, v17.16b
        and v16.16b, v16.16b, v17.16b
        eor v7.16b, v7.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v6.16b, v6.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v5.16b, v5.16b, v10.16b
        shl v10.2d, v10.2d, #4
        eor v4.16b, v4.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v3.16b, v3.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v16.16b
        b .Ldec_sbox
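
// Each iteration of .Ldec_loop XORs in the next bit-sliced round key
// (eight quadwords) and applies the InvShiftRows permutation held in
// v28 via tbl, then falls through into the inverse S-box.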
.align 4
.Ldec_loop:
        ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp q8, q9, [x9], #32
        eor v0.16b, v16.16b, v0.16b
        ldr q10, [x9], #16
        eor v1.16b, v17.16b, v1.16b
        ldr q16, [x9], #16
        eor v2.16b, v18.16b, v2.16b
        eor v3.16b, v19.16b, v3.16b
        eor v4.16b, v8.16b, v4.16b
        eor v5.16b, v9.16b, v5.16b
        eor v6.16b, v10.16b, v6.16b
        eor v7.16b, v16.16b, v7.16b
        tbl v0.16b, {v0.16b}, v28.16b
        tbl v1.16b, {v1.16b}, v28.16b
        tbl v2.16b, {v2.16b}, v28.16b
        tbl v3.16b, {v3.16b}, v28.16b
        tbl v4.16b, {v4.16b}, v28.16b
        tbl v5.16b, {v5.16b}, v28.16b
        tbl v6.16b, {v6.16b}, v28.16b
        tbl v7.16b, {v7.16b}, v28.16b
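
// Inverse AES S-box, evaluated as a boolean circuit (eor/and/orr/bsl)
// on the bit-sliced state: all eight blocks are substituted at once,
// with no data-dependent memory accesses.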
.Ldec_sbox:
        eor v1.16b, v1.16b, v4.16b
        eor v3.16b, v3.16b, v4.16b
        subs x10, x10, #1
        eor v4.16b, v4.16b, v7.16b
        eor v2.16b, v2.16b, v7.16b
        eor v1.16b, v1.16b, v6.16b
        eor v6.16b, v6.16b, v4.16b
        eor v2.16b, v2.16b, v5.16b
        eor v0.16b, v0.16b, v1.16b
        eor v7.16b, v7.16b, v6.16b
        eor v8.16b, v6.16b, v2.16b
        and v9.16b, v4.16b, v6.16b
        eor v10.16b, v2.16b, v6.16b
        eor v3.16b, v3.16b, v0.16b
        eor v5.16b, v5.16b, v0.16b
        eor v16.16b, v7.16b, v4.16b
        eor v17.16b, v4.16b, v0.16b
        and v18.16b, v0.16b, v2.16b
        eor v19.16b, v7.16b, v4.16b
        eor v1.16b, v1.16b, v3.16b
        eor v20.16b, v3.16b, v0.16b
        eor v21.16b, v5.16b, v2.16b
        eor v22.16b, v3.16b, v7.16b
        and v8.16b, v17.16b, v8.16b
        orr v17.16b, v3.16b, v5.16b
        eor v23.16b, v1.16b, v6.16b
        eor v24.16b, v20.16b, v16.16b
        eor v25.16b, v1.16b, v5.16b
        orr v26.16b, v20.16b, v21.16b
        and v20.16b, v20.16b, v21.16b
        and v27.16b, v7.16b, v1.16b
        eor v21.16b, v21.16b, v23.16b
        orr v28.16b, v16.16b, v23.16b
        orr v29.16b, v22.16b, v25.16b
        eor v26.16b, v26.16b, v8.16b
        and v16.16b, v16.16b, v23.16b
        and v22.16b, v22.16b, v25.16b
        and v21.16b, v24.16b, v21.16b
        eor v8.16b, v28.16b, v8.16b
        eor v23.16b, v5.16b, v2.16b
        eor v24.16b, v1.16b, v6.16b
        eor v16.16b, v16.16b, v22.16b
        eor v22.16b, v3.16b, v0.16b
        eor v25.16b, v29.16b, v21.16b
        eor v21.16b, v26.16b, v21.16b
        eor v8.16b, v8.16b, v20.16b
        eor v26.16b, v23.16b, v24.16b
        eor v16.16b, v16.16b, v20.16b
        eor v28.16b, v22.16b, v19.16b
        eor v20.16b, v25.16b, v20.16b
        eor v9.16b, v21.16b, v9.16b
        eor v8.16b, v8.16b, v18.16b
        eor v18.16b, v5.16b, v1.16b
        eor v21.16b, v16.16b, v17.16b
        eor v16.16b, v16.16b, v17.16b
        eor v17.16b, v20.16b, v27.16b
        eor v20.16b, v3.16b, v7.16b
        eor v25.16b, v9.16b, v8.16b
        eor v27.16b, v0.16b, v4.16b
        and v29.16b, v9.16b, v17.16b
        eor v30.16b, v8.16b, v29.16b
        eor v31.16b, v21.16b, v29.16b
        eor v29.16b, v21.16b, v29.16b
        bsl v30.16b, v17.16b, v21.16b
        bsl v31.16b, v9.16b, v8.16b
        bsl v16.16b, v30.16b, v29.16b
        bsl v21.16b, v29.16b, v30.16b
        eor v8.16b, v31.16b, v30.16b
        and v1.16b, v1.16b, v31.16b
        and v9.16b, v16.16b, v31.16b
        and v6.16b, v6.16b, v30.16b
        eor v16.16b, v17.16b, v21.16b
        and v4.16b, v4.16b, v30.16b
        eor v17.16b, v8.16b, v30.16b
        and v21.16b, v24.16b, v8.16b
        eor v9.16b, v9.16b, v25.16b
        and v19.16b, v19.16b, v8.16b
        eor v24.16b, v30.16b, v16.16b
        eor v25.16b, v30.16b, v16.16b
        and v7.16b, v7.16b, v17.16b
        and v10.16b, v10.16b, v16.16b
        eor v29.16b, v9.16b, v16.16b
        eor v30.16b, v31.16b, v9.16b
        and v0.16b, v24.16b, v0.16b
        and v9.16b, v18.16b, v9.16b
        and v2.16b, v25.16b, v2.16b
        eor v10.16b, v10.16b, v6.16b
        eor v18.16b, v29.16b, v16.16b
        and v5.16b, v30.16b, v5.16b
        eor v24.16b, v8.16b, v29.16b
        and v25.16b, v26.16b, v29.16b
        and v26.16b, v28.16b, v29.16b
        eor v8.16b, v8.16b, v29.16b
        eor v17.16b, v17.16b, v18.16b
        eor v5.16b, v1.16b, v5.16b
        and v23.16b, v24.16b, v23.16b
        eor v21.16b, v21.16b, v25.16b
        eor v19.16b, v19.16b, v26.16b
        eor v0.16b, v4.16b, v0.16b
        and v3.16b, v17.16b, v3.16b
        eor v1.16b, v9.16b, v1.16b
        eor v9.16b, v25.16b, v23.16b
        eor v5.16b, v5.16b, v21.16b
        eor v2.16b, v6.16b, v2.16b
        and v6.16b, v8.16b, v22.16b
        eor v3.16b, v7.16b, v3.16b
        and v8.16b, v20.16b, v18.16b
        eor v10.16b, v10.16b, v9.16b
        eor v0.16b, v0.16b, v19.16b
        eor v9.16b, v1.16b, v9.16b
        eor v1.16b, v2.16b, v21.16b
        eor v3.16b, v3.16b, v19.16b
        and v16.16b, v27.16b, v16.16b
        eor v17.16b, v26.16b, v6.16b
        eor v6.16b, v8.16b, v7.16b
        eor v7.16b, v1.16b, v9.16b
        eor v1.16b, v5.16b, v3.16b
        eor v2.16b, v10.16b, v3.16b
        eor v4.16b, v16.16b, v4.16b
        eor v8.16b, v6.16b, v17.16b
        eor v5.16b, v9.16b, v3.16b
        eor v9.16b, v0.16b, v1.16b
        eor v6.16b, v7.16b, v1.16b
        eor v0.16b, v4.16b, v17.16b
        eor v4.16b, v8.16b, v7.16b
        eor v7.16b, v9.16b, v2.16b
        eor v8.16b, v3.16b, v0.16b
        eor v7.16b, v7.16b, v5.16b
        eor v3.16b, v4.16b, v7.16b
        eor v4.16b, v7.16b, v0.16b
        eor v7.16b, v8.16b, v3.16b
        bcc .Ldec_done
        ext v8.16b, v0.16b, v0.16b, #8
        ext v9.16b, v1.16b, v1.16b, #8
        ldr q28, [x11] // load from .LISR in common case (x10 > 0)
        ext v10.16b, v6.16b, v6.16b, #8
        ext v16.16b, v3.16b, v3.16b, #8
        ext v17.16b, v5.16b, v5.16b, #8
        ext v18.16b, v4.16b, v4.16b, #8
        eor v8.16b, v8.16b, v0.16b
        eor v9.16b, v9.16b, v1.16b
        eor v10.16b, v10.16b, v6.16b
        eor v16.16b, v16.16b, v3.16b
        eor v17.16b, v17.16b, v5.16b
        ext v19.16b, v2.16b, v2.16b, #8
        ext v20.16b, v7.16b, v7.16b, #8
        eor v18.16b, v18.16b, v4.16b
        eor v6.16b, v6.16b, v8.16b
        eor v8.16b, v2.16b, v10.16b
        eor v4.16b, v4.16b, v9.16b
        eor v2.16b, v19.16b, v2.16b
        eor v9.16b, v20.16b, v7.16b
        eor v0.16b, v0.16b, v16.16b
        eor v1.16b, v1.16b, v16.16b
        eor v6.16b, v6.16b, v17.16b
        eor v8.16b, v8.16b, v16.16b
        eor v7.16b, v7.16b, v18.16b
        eor v4.16b, v4.16b, v16.16b
        eor v2.16b, v3.16b, v2.16b
        eor v1.16b, v1.16b, v17.16b
        eor v3.16b, v5.16b, v9.16b
        eor v5.16b, v8.16b, v17.16b
        eor v7.16b, v7.16b, v17.16b
        ext v8.16b, v0.16b, v0.16b, #12
        ext v9.16b, v6.16b, v6.16b, #12
        ext v10.16b, v4.16b, v4.16b, #12
        ext v16.16b, v1.16b, v1.16b, #12
        ext v17.16b, v5.16b, v5.16b, #12
        ext v18.16b, v7.16b, v7.16b, #12
        eor v0.16b, v0.16b, v8.16b
        eor v6.16b, v6.16b, v9.16b
        eor v4.16b, v4.16b, v10.16b
        ext v19.16b, v2.16b, v2.16b, #12
        ext v20.16b, v3.16b, v3.16b, #12
        eor v1.16b, v1.16b, v16.16b
        eor v5.16b, v5.16b, v17.16b
        eor v7.16b, v7.16b, v18.16b
        eor v2.16b, v2.16b, v19.16b
        eor v16.16b, v16.16b, v0.16b
        eor v3.16b, v3.16b, v20.16b
        eor v17.16b, v17.16b, v4.16b
        eor v10.16b, v10.16b, v6.16b
        ext v0.16b, v0.16b, v0.16b, #8
        eor v9.16b, v9.16b, v1.16b
        ext v1.16b, v1.16b, v1.16b, #8
        eor v8.16b, v8.16b, v3.16b
        eor v16.16b, v16.16b, v3.16b
        eor v18.16b, v18.16b, v5.16b
        eor v19.16b, v19.16b, v7.16b
        ext v21.16b, v5.16b, v5.16b, #8
        ext v5.16b, v7.16b, v7.16b, #8
        eor v7.16b, v20.16b, v2.16b
        ext v4.16b, v4.16b, v4.16b, #8
        ext v20.16b, v3.16b, v3.16b, #8
        eor v17.16b, v17.16b, v3.16b
        ext v2.16b, v2.16b, v2.16b, #8
        eor v3.16b, v10.16b, v3.16b
        ext v10.16b, v6.16b, v6.16b, #8
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v16.16b
        eor v5.16b, v5.16b, v18.16b
        eor v3.16b, v3.16b, v4.16b
        eor v7.16b, v20.16b, v7.16b
        eor v6.16b, v2.16b, v19.16b
        eor v4.16b, v21.16b, v17.16b
        eor v2.16b, v10.16b, v9.16b
        bne .Ldec_loop
        ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0)
        b .Ldec_loop
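
// All rounds done: undo the bit-slicing transpose and XOR in the last
// round key (loaded from [x9], where the caller saved it at the end of
// the key schedule).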
.align 4
.Ldec_done:
        ushr v8.2d, v0.2d, #1
        movi v9.16b, #0x55
        ldr q10, [x9]
        ushr v16.2d, v2.2d, #1
        movi v17.16b, #0x33
        ushr v18.2d, v6.2d, #1
        movi v19.16b, #0x0f
        eor v8.16b, v8.16b, v1.16b
        ushr v20.2d, v3.2d, #1
        eor v16.16b, v16.16b, v7.16b
        eor v18.16b, v18.16b, v4.16b
        and v8.16b, v8.16b, v9.16b
        eor v20.16b, v20.16b, v5.16b
        and v16.16b, v16.16b, v9.16b
        and v18.16b, v18.16b, v9.16b
        shl v21.2d, v8.2d, #1
        eor v1.16b, v1.16b, v8.16b
        and v8.16b, v20.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        shl v9.2d, v16.2d, #1
        eor v4.16b, v4.16b, v18.16b
        shl v16.2d, v18.2d, #1
        eor v0.16b, v0.16b, v21.16b
        shl v18.2d, v8.2d, #1
        eor v5.16b, v5.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        eor v6.16b, v6.16b, v16.16b
        ushr v8.2d, v1.2d, #2
        eor v3.16b, v3.16b, v18.16b
        ushr v9.2d, v0.2d, #2
        ushr v16.2d, v7.2d, #2
        ushr v18.2d, v2.2d, #2
        eor v8.16b, v8.16b, v4.16b
        eor v9.16b, v9.16b, v6.16b
        eor v16.16b, v16.16b, v5.16b
        eor v18.16b, v18.16b, v3.16b
        and v8.16b, v8.16b, v17.16b
        and v9.16b, v9.16b, v17.16b
        and v16.16b, v16.16b, v17.16b
        and v17.16b, v18.16b, v17.16b
        eor v4.16b, v4.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v6.16b, v6.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v5.16b, v5.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v3.16b, v3.16b, v17.16b
        shl v17.2d, v17.2d, #2
        eor v1.16b, v1.16b, v8.16b
        eor v0.16b, v0.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        eor v2.16b, v2.16b, v17.16b
        ushr v8.2d, v4.2d, #4
        ushr v9.2d, v6.2d, #4
        ushr v16.2d, v1.2d, #4
        ushr v17.2d, v0.2d, #4
        eor v8.16b, v8.16b, v5.16b
        eor v9.16b, v9.16b, v3.16b
        eor v16.16b, v16.16b, v7.16b
        eor v17.16b, v17.16b, v2.16b
        and v8.16b, v8.16b, v19.16b
        and v9.16b, v9.16b, v19.16b
        and v16.16b, v16.16b, v19.16b
        and v17.16b, v17.16b, v19.16b
        eor v5.16b, v5.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v3.16b, v3.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v7.16b, v7.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v2.16b, v2.16b, v17.16b
        shl v17.2d, v17.2d, #4
        eor v4.16b, v4.16b, v8.16b
        eor v6.16b, v6.16b, v9.16b
        eor v7.16b, v7.16b, v10.16b
        eor v1.16b, v1.16b, v16.16b
        eor v2.16b, v2.16b, v10.16b
        eor v0.16b, v0.16b, v17.16b
        eor v4.16b, v4.16b, v10.16b
        eor v6.16b, v6.16b, v10.16b
        eor v3.16b, v3.16b, v10.16b
        eor v5.16b, v5.16b, v10.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v10.16b
        ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8
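
// The .quad pairs below are byte-permutation index vectors consumed by
// tbl: .LISR/.LSR apply InvShiftRows/ShiftRows between rounds, while
// the round-0 and final-round variants also fold in the reordering
// between the memory byte order and the bit-sliced register layout.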
.type _bsaes_const,%object
.align 6
_bsaes_const:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad 0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad 0x090d01050c000408, 0x03070b0f060a0e02

.align 6
.size _bsaes_const,.-_bsaes_const
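
// _bsaes_encrypt8 mirrors _bsaes_decrypt8. The extra entry point
// _bsaes_encrypt8_alt skips the round-0 key and .LM0SR loads, so a
// caller can substitute its own round-0 permutation;
// ossl_bsaes_ctr32_encrypt_blocks uses it with .LREVM0SR in v9.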
.type _bsaes_encrypt8,%function
.align 4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_encrypt8:
        ldr q8, [x9], #16
        adr x11, .LM0SR
        ldr q9, [x11], #16
_bsaes_encrypt8_alt:
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v8.16b
        sub x10, x10, #1
        eor v2.16b, v2.16b, v8.16b
        eor v4.16b, v4.16b, v8.16b
        eor v3.16b, v3.16b, v8.16b
        eor v5.16b, v5.16b, v8.16b
        tbl v0.16b, {v0.16b}, v9.16b
        tbl v1.16b, {v1.16b}, v9.16b
        tbl v2.16b, {v2.16b}, v9.16b
        tbl v4.16b, {v4.16b}, v9.16b
        eor v6.16b, v6.16b, v8.16b
        eor v7.16b, v7.16b, v8.16b
        tbl v3.16b, {v3.16b}, v9.16b
        tbl v5.16b, {v5.16b}, v9.16b
        tbl v6.16b, {v6.16b}, v9.16b
        ushr v8.2d, v0.2d, #1
        movi v10.16b, #0x55
        tbl v7.16b, {v7.16b}, v9.16b
        ushr v9.2d, v4.2d, #1
        movi v16.16b, #0x33
        ushr v17.2d, v2.2d, #1
        eor v8.16b, v8.16b, v1.16b
        movi v18.16b, #0x0f
        ushr v19.2d, v6.2d, #1
        eor v9.16b, v9.16b, v5.16b
        eor v17.16b, v17.16b, v3.16b
        and v8.16b, v8.16b, v10.16b
        eor v19.16b, v19.16b, v7.16b
        and v9.16b, v9.16b, v10.16b
        and v17.16b, v17.16b, v10.16b
        eor v1.16b, v1.16b, v8.16b
        shl v8.2d, v8.2d, #1
        and v10.16b, v19.16b, v10.16b
        eor v5.16b, v5.16b, v9.16b
        shl v9.2d, v9.2d, #1
        eor v3.16b, v3.16b, v17.16b
        shl v17.2d, v17.2d, #1
        eor v0.16b, v0.16b, v8.16b
        shl v8.2d, v10.2d, #1
        eor v7.16b, v7.16b, v10.16b
        eor v4.16b, v4.16b, v9.16b
        eor v2.16b, v2.16b, v17.16b
        ushr v9.2d, v1.2d, #2
        eor v6.16b, v6.16b, v8.16b
        ushr v8.2d, v0.2d, #2
        ushr v10.2d, v5.2d, #2
        ushr v17.2d, v4.2d, #2
        eor v9.16b, v9.16b, v3.16b
        eor v8.16b, v8.16b, v2.16b
        eor v10.16b, v10.16b, v7.16b
        eor v17.16b, v17.16b, v6.16b
        and v9.16b, v9.16b, v16.16b
        and v8.16b, v8.16b, v16.16b
        and v10.16b, v10.16b, v16.16b
        and v16.16b, v17.16b, v16.16b
        eor v3.16b, v3.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v2.16b, v2.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v7.16b, v7.16b, v10.16b
        shl v10.2d, v10.2d, #2
        eor v6.16b, v6.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v1.16b, v1.16b, v9.16b
        eor v0.16b, v0.16b, v8.16b
        eor v5.16b, v5.16b, v10.16b
        eor v4.16b, v4.16b, v16.16b
        ushr v8.2d, v3.2d, #4
        ushr v9.2d, v2.2d, #4
        ushr v10.2d, v1.2d, #4
        ushr v16.2d, v0.2d, #4
        eor v8.16b, v8.16b, v7.16b
        eor v9.16b, v9.16b, v6.16b
        eor v10.16b, v10.16b, v5.16b
        eor v16.16b, v16.16b, v4.16b
        and v8.16b, v8.16b, v18.16b
        and v9.16b, v9.16b, v18.16b
        and v10.16b, v10.16b, v18.16b
        and v16.16b, v16.16b, v18.16b
        eor v7.16b, v7.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v6.16b, v6.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v5.16b, v5.16b, v10.16b
        shl v10.2d, v10.2d, #4
        eor v4.16b, v4.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v3.16b, v3.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v16.16b
        b .Lenc_sbox
.align 4
.Lenc_loop:
        ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp q8, q9, [x9], #32
        eor v0.16b, v16.16b, v0.16b
        ldr q10, [x9], #16
        eor v1.16b, v17.16b, v1.16b
        ldr q16, [x9], #16
        eor v2.16b, v18.16b, v2.16b
        eor v3.16b, v19.16b, v3.16b
        eor v4.16b, v8.16b, v4.16b
        eor v5.16b, v9.16b, v5.16b
        eor v6.16b, v10.16b, v6.16b
        eor v7.16b, v16.16b, v7.16b
        tbl v0.16b, {v0.16b}, v28.16b
        tbl v1.16b, {v1.16b}, v28.16b
        tbl v2.16b, {v2.16b}, v28.16b
        tbl v3.16b, {v3.16b}, v28.16b
        tbl v4.16b, {v4.16b}, v28.16b
        tbl v5.16b, {v5.16b}, v28.16b
        tbl v6.16b, {v6.16b}, v28.16b
        tbl v7.16b, {v7.16b}, v28.16b
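
// Forward AES S-box as a boolean circuit on the bit-sliced state,
// mirroring the inverse circuit in _bsaes_decrypt8.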
.Lenc_sbox:
        eor v5.16b, v5.16b, v6.16b
        eor v3.16b, v3.16b, v0.16b
        subs x10, x10, #1
        eor v2.16b, v2.16b, v1.16b
        eor v5.16b, v5.16b, v0.16b
        eor v8.16b, v3.16b, v7.16b
        eor v6.16b, v6.16b, v2.16b
        eor v7.16b, v7.16b, v5.16b
        eor v8.16b, v8.16b, v4.16b
        eor v3.16b, v6.16b, v3.16b
        eor v4.16b, v4.16b, v5.16b
        eor v6.16b, v1.16b, v5.16b
        eor v2.16b, v2.16b, v7.16b
        eor v1.16b, v8.16b, v1.16b
        eor v8.16b, v7.16b, v4.16b
        eor v9.16b, v3.16b, v0.16b
        eor v10.16b, v7.16b, v6.16b
        eor v16.16b, v5.16b, v3.16b
        eor v17.16b, v6.16b, v2.16b
        eor v18.16b, v5.16b, v1.16b
        eor v19.16b, v2.16b, v4.16b
        eor v20.16b, v1.16b, v0.16b
        orr v21.16b, v8.16b, v9.16b
        orr v22.16b, v10.16b, v16.16b
        eor v23.16b, v8.16b, v17.16b
        eor v24.16b, v9.16b, v18.16b
        and v19.16b, v19.16b, v20.16b
        orr v20.16b, v17.16b, v18.16b
        and v8.16b, v8.16b, v9.16b
        and v9.16b, v17.16b, v18.16b
        and v17.16b, v23.16b, v24.16b
        and v10.16b, v10.16b, v16.16b
        eor v16.16b, v21.16b, v19.16b
        eor v18.16b, v20.16b, v19.16b
        and v19.16b, v2.16b, v1.16b
        and v20.16b, v6.16b, v5.16b
        eor v21.16b, v22.16b, v17.16b
        eor v9.16b, v9.16b, v10.16b
        eor v10.16b, v16.16b, v17.16b
        eor v16.16b, v18.16b, v8.16b
        and v17.16b, v4.16b, v0.16b
        orr v18.16b, v7.16b, v3.16b
        eor v21.16b, v21.16b, v8.16b
        eor v8.16b, v9.16b, v8.16b
        eor v9.16b, v10.16b, v19.16b
        eor v10.16b, v3.16b, v0.16b
        eor v16.16b, v16.16b, v17.16b
        eor v17.16b, v5.16b, v1.16b
        eor v19.16b, v21.16b, v20.16b
        eor v20.16b, v8.16b, v18.16b
        eor v8.16b, v8.16b, v18.16b
        eor v18.16b, v7.16b, v4.16b
        eor v21.16b, v9.16b, v16.16b
        eor v22.16b, v6.16b, v2.16b
        and v23.16b, v9.16b, v19.16b
        eor v24.16b, v10.16b, v17.16b
        eor v25.16b, v0.16b, v1.16b
        eor v26.16b, v7.16b, v6.16b
        eor v27.16b, v18.16b, v22.16b
        eor v28.16b, v3.16b, v5.16b
        eor v29.16b, v16.16b, v23.16b
        eor v30.16b, v20.16b, v23.16b
        eor v23.16b, v20.16b, v23.16b
        eor v31.16b, v4.16b, v2.16b
        bsl v29.16b, v19.16b, v20.16b
        bsl v30.16b, v9.16b, v16.16b
        bsl v8.16b, v29.16b, v23.16b
        bsl v20.16b, v23.16b, v29.16b
        eor v9.16b, v30.16b, v29.16b
        and v5.16b, v5.16b, v30.16b
        and v8.16b, v8.16b, v30.16b
        and v1.16b, v1.16b, v29.16b
        eor v16.16b, v19.16b, v20.16b
        and v2.16b, v2.16b, v29.16b
        eor v19.16b, v9.16b, v29.16b
        and v17.16b, v17.16b, v9.16b
        eor v8.16b, v8.16b, v21.16b
        and v20.16b, v22.16b, v9.16b
        eor v21.16b, v29.16b, v16.16b
        eor v22.16b, v29.16b, v16.16b
        and v23.16b, v25.16b, v16.16b
        and v6.16b, v6.16b, v19.16b
        eor v25.16b, v8.16b, v16.16b
        eor v29.16b, v30.16b, v8.16b
        and v4.16b, v21.16b, v4.16b
        and v8.16b, v28.16b, v8.16b
        and v0.16b, v22.16b, v0.16b
        eor v21.16b, v23.16b, v1.16b
        eor v22.16b, v9.16b, v25.16b
        eor v9.16b, v9.16b, v25.16b
        eor v23.16b, v25.16b, v16.16b
        and v3.16b, v29.16b, v3.16b
        and v24.16b, v24.16b, v25.16b
        and v25.16b, v27.16b, v25.16b
        and v10.16b, v22.16b, v10.16b
        and v9.16b, v9.16b, v18.16b
        eor v18.16b, v19.16b, v23.16b
        and v19.16b, v26.16b, v23.16b
        eor v3.16b, v5.16b, v3.16b
        eor v17.16b, v17.16b, v24.16b
        eor v10.16b, v24.16b, v10.16b
        and v16.16b, v31.16b, v16.16b
        eor v20.16b, v20.16b, v25.16b
        eor v9.16b, v25.16b, v9.16b
        eor v4.16b, v2.16b, v4.16b
        and v7.16b, v18.16b, v7.16b
        eor v18.16b, v19.16b, v6.16b
        eor v5.16b, v8.16b, v5.16b
        eor v0.16b, v1.16b, v0.16b
        eor v1.16b, v21.16b, v10.16b
        eor v8.16b, v3.16b, v17.16b
        eor v2.16b, v16.16b, v2.16b
        eor v3.16b, v6.16b, v7.16b
        eor v6.16b, v18.16b, v9.16b
        eor v4.16b, v4.16b, v20.16b
        eor v10.16b, v5.16b, v10.16b
        eor v0.16b, v0.16b, v17.16b
        eor v9.16b, v2.16b, v9.16b
        eor v3.16b, v3.16b, v20.16b
        eor v7.16b, v6.16b, v1.16b
        eor v5.16b, v8.16b, v4.16b
        eor v6.16b, v10.16b, v1.16b
        eor v2.16b, v4.16b, v0.16b
        eor v4.16b, v3.16b, v10.16b
        eor v9.16b, v9.16b, v7.16b
        eor v3.16b, v0.16b, v5.16b
        eor v0.16b, v1.16b, v4.16b
        eor v1.16b, v4.16b, v8.16b
        eor v4.16b, v9.16b, v5.16b
        eor v6.16b, v6.16b, v3.16b
        bcc .Lenc_done
        ext v8.16b, v0.16b, v0.16b, #12
        ext v9.16b, v4.16b, v4.16b, #12
        ldr q28, [x11]
        ext v10.16b, v6.16b, v6.16b, #12
        ext v16.16b, v1.16b, v1.16b, #12
        ext v17.16b, v3.16b, v3.16b, #12
        ext v18.16b, v7.16b, v7.16b, #12
        eor v0.16b, v0.16b, v8.16b
        eor v4.16b, v4.16b, v9.16b
        eor v6.16b, v6.16b, v10.16b
        ext v19.16b, v2.16b, v2.16b, #12
        ext v20.16b, v5.16b, v5.16b, #12
        eor v1.16b, v1.16b, v16.16b
        eor v3.16b, v3.16b, v17.16b
        eor v7.16b, v7.16b, v18.16b
        eor v2.16b, v2.16b, v19.16b
        eor v16.16b, v16.16b, v0.16b
        eor v5.16b, v5.16b, v20.16b
        eor v17.16b, v17.16b, v6.16b
        eor v10.16b, v10.16b, v4.16b
        ext v0.16b, v0.16b, v0.16b, #8
        eor v9.16b, v9.16b, v1.16b
        ext v1.16b, v1.16b, v1.16b, #8
        eor v8.16b, v8.16b, v5.16b
        eor v16.16b, v16.16b, v5.16b
        eor v18.16b, v18.16b, v3.16b
        eor v19.16b, v19.16b, v7.16b
        ext v3.16b, v3.16b, v3.16b, #8
        ext v7.16b, v7.16b, v7.16b, #8
        eor v20.16b, v20.16b, v2.16b
        ext v6.16b, v6.16b, v6.16b, #8
        ext v21.16b, v5.16b, v5.16b, #8
        eor v17.16b, v17.16b, v5.16b
        ext v2.16b, v2.16b, v2.16b, #8
        eor v10.16b, v10.16b, v5.16b
        ext v22.16b, v4.16b, v4.16b, #8
        eor v0.16b, v0.16b, v8.16b
        eor v1.16b, v1.16b, v16.16b
        eor v5.16b, v7.16b, v18.16b
        eor v4.16b, v3.16b, v17.16b
        eor v3.16b, v6.16b, v10.16b
        eor v7.16b, v21.16b, v20.16b
        eor v6.16b, v2.16b, v19.16b
        eor v2.16b, v22.16b, v9.16b
        bne .Lenc_loop
        ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0)
        b .Lenc_loop
.align 4
.Lenc_done:
        ushr v8.2d, v0.2d, #1
        movi v9.16b, #0x55
        ldr q10, [x9]
        ushr v16.2d, v3.2d, #1
        movi v17.16b, #0x33
        ushr v18.2d, v4.2d, #1
        movi v19.16b, #0x0f
        eor v8.16b, v8.16b, v1.16b
        ushr v20.2d, v2.2d, #1
        eor v16.16b, v16.16b, v7.16b
        eor v18.16b, v18.16b, v6.16b
        and v8.16b, v8.16b, v9.16b
        eor v20.16b, v20.16b, v5.16b
        and v16.16b, v16.16b, v9.16b
        and v18.16b, v18.16b, v9.16b
        shl v21.2d, v8.2d, #1
        eor v1.16b, v1.16b, v8.16b
        and v8.16b, v20.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        shl v9.2d, v16.2d, #1
        eor v6.16b, v6.16b, v18.16b
        shl v16.2d, v18.2d, #1
        eor v0.16b, v0.16b, v21.16b
        shl v18.2d, v8.2d, #1
        eor v5.16b, v5.16b, v8.16b
        eor v3.16b, v3.16b, v9.16b
        eor v4.16b, v4.16b, v16.16b
        ushr v8.2d, v1.2d, #2
        eor v2.16b, v2.16b, v18.16b
        ushr v9.2d, v0.2d, #2
        ushr v16.2d, v7.2d, #2
        ushr v18.2d, v3.2d, #2
        eor v8.16b, v8.16b, v6.16b
        eor v9.16b, v9.16b, v4.16b
        eor v16.16b, v16.16b, v5.16b
        eor v18.16b, v18.16b, v2.16b
        and v8.16b, v8.16b, v17.16b
        and v9.16b, v9.16b, v17.16b
        and v16.16b, v16.16b, v17.16b
        and v17.16b, v18.16b, v17.16b
        eor v6.16b, v6.16b, v8.16b
        shl v8.2d, v8.2d, #2
        eor v4.16b, v4.16b, v9.16b
        shl v9.2d, v9.2d, #2
        eor v5.16b, v5.16b, v16.16b
        shl v16.2d, v16.2d, #2
        eor v2.16b, v2.16b, v17.16b
        shl v17.2d, v17.2d, #2
        eor v1.16b, v1.16b, v8.16b
        eor v0.16b, v0.16b, v9.16b
        eor v7.16b, v7.16b, v16.16b
        eor v3.16b, v3.16b, v17.16b
        ushr v8.2d, v6.2d, #4
        ushr v9.2d, v4.2d, #4
        ushr v16.2d, v1.2d, #4
        ushr v17.2d, v0.2d, #4
        eor v8.16b, v8.16b, v5.16b
        eor v9.16b, v9.16b, v2.16b
        eor v16.16b, v16.16b, v7.16b
        eor v17.16b, v17.16b, v3.16b
        and v8.16b, v8.16b, v19.16b
        and v9.16b, v9.16b, v19.16b
        and v16.16b, v16.16b, v19.16b
        and v17.16b, v17.16b, v19.16b
        eor v5.16b, v5.16b, v8.16b
        shl v8.2d, v8.2d, #4
        eor v2.16b, v2.16b, v9.16b
        shl v9.2d, v9.2d, #4
        eor v7.16b, v7.16b, v16.16b
        shl v16.2d, v16.2d, #4
        eor v3.16b, v3.16b, v17.16b
        shl v17.2d, v17.2d, #4
        eor v6.16b, v6.16b, v8.16b
        eor v4.16b, v4.16b, v9.16b
        eor v7.16b, v7.16b, v10.16b
        eor v1.16b, v1.16b, v16.16b
        eor v3.16b, v3.16b, v10.16b
        eor v0.16b, v0.16b, v17.16b
        eor v6.16b, v6.16b, v10.16b
        eor v4.16b, v4.16b, v10.16b
        eor v2.16b, v2.16b, v10.16b
        eor v5.16b, v5.16b, v10.16b
        eor v1.16b, v1.16b, v10.16b
        eor v0.16b, v0.16b, v10.16b
        ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8
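
// _bsaes_key_convert bit-slices the key schedule itself: each expanded
// round key is tested against the 0x01..0x80 masks with cmtst, giving
// eight all-ones/all-zeroes vectors per round, after first XORing the
// key with 0x63 (the AES S-box affine constant, which the bit-sliced
// S-box circuit does not add itself).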
.type _bsaes_key_convert,%function
.align 4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
_bsaes_key_convert:
#ifdef __AARCH64EL__
        adr x11, .LM0_littleendian
#else
        adr x11, .LM0_bigendian
#endif
        ldr q0, [x9], #16 // load round 0 key
        ldr q1, [x11] // .LM0
        ldr q15, [x9], #16 // load round 1 key

        movi v7.16b, #0x63 // compose .L63
        movi v16.16b, #0x01 // bit masks
        movi v17.16b, #0x02
        movi v18.16b, #0x04
        movi v19.16b, #0x08
        movi v20.16b, #0x10
        movi v21.16b, #0x20
        movi v22.16b, #0x40
        movi v23.16b, #0x80

#ifdef __AARCH64EL__
        rev32 v0.16b, v0.16b
#endif
        sub x10, x10, #1
        str q0, [x17], #16 // save round 0 key

.align 4
.Lkey_loop:
        tbl v0.16b, {v15.16b}, v1.16b
        ldr q15, [x9], #16 // load next round key

        eor v0.16b, v0.16b, v7.16b
        cmtst v24.16b, v0.16b, v16.16b
        cmtst v25.16b, v0.16b, v17.16b
        cmtst v26.16b, v0.16b, v18.16b
        cmtst v27.16b, v0.16b, v19.16b
        cmtst v28.16b, v0.16b, v20.16b
        cmtst v29.16b, v0.16b, v21.16b
        cmtst v30.16b, v0.16b, v22.16b
        cmtst v31.16b, v0.16b, v23.16b
        sub x10, x10, #1
        st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
        st1 {v28.16b-v31.16b}, [x17], #64
        cbnz x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
        rev32 v15.16b, v15.16b
        adr x11, .LM0_bigendian
#endif
        ret
.size _bsaes_key_convert,.-_bsaes_key_convert
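
// CBC decryption. Calls of fewer than 128 bytes are forwarded to the
// plain AES_cbc_encrypt; anything larger is decrypted eight blocks at
// a time, with each result XORed against the preceding ciphertext
// block (or the IV for the first block).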
.globl ossl_bsaes_cbc_encrypt
.type ossl_bsaes_cbc_encrypt,%function
.align 4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
        AARCH64_VALID_CALL_TARGET
        cmp x2, #128
        bhs .Lcbc_do_bsaes
        b AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

        stp x29, x30, [sp, #-48]!
        stp d8, d9, [sp, #16]
        stp d10, d15, [sp, #32]
        lsr x2, x2, #4 // len in 16 byte blocks

        ldr w15, [x3, #240] // get # of rounds
        mov x14, sp

        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x3 // pass key
        mov x10, x15 // pass # of rounds
        mov sp, x17 // sp is sp
        bl _bsaes_key_convert
        ldr q6, [sp]
        str q15, [x17] // save last round key
        eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
        str q6, [sp]

        ldr q15, [x4] // load IV
        b .Lcbc_dec_loop

.align 4
.Lcbc_dec_loop:
        subs x2, x2, #0x8
        bmi .Lcbc_dec_loop_finish

        ldr q0, [x0], #16 // load input
        mov x9, sp // pass the key
        ldr q1, [x0], #16
        mov x10, x15
        ldr q2, [x0], #16
        ldr q3, [x0], #16
        ldr q4, [x0], #16
        ldr q5, [x0], #16
        ldr q6, [x0], #16
        ldr q7, [x0], #-7*16

        bl _bsaes_decrypt8

        ldr q16, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        eor v1.16b, v1.16b, v16.16b
        str q0, [x1], #16 // write output
        ldr q0, [x0], #16
        str q1, [x1], #16
        ldr q1, [x0], #16
        eor v1.16b, v4.16b, v1.16b
        ldr q4, [x0], #16
        eor v2.16b, v2.16b, v4.16b
        eor v0.16b, v6.16b, v0.16b
        ldr q4, [x0], #16
        str q0, [x1], #16
        str q1, [x1], #16
        eor v0.16b, v7.16b, v4.16b
        ldr q1, [x0], #16
        str q2, [x1], #16
        ldr q2, [x0], #16
        ldr q15, [x0], #16
        str q0, [x1], #16
        eor v0.16b, v5.16b, v2.16b
        eor v1.16b, v3.16b, v1.16b
        str q1, [x1], #16
        str q0, [x1], #16

        b .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
        adds x2, x2, #8
        beq .Lcbc_dec_done

        ldr q0, [x0], #16 // load input
        cmp x2, #2
        blo .Lcbc_dec_one
        ldr q1, [x0], #16
        mov x9, sp // pass the key
        mov x10, x15
        beq .Lcbc_dec_two
        ldr q2, [x0], #16
        cmp x2, #4
        blo .Lcbc_dec_three
        ldr q3, [x0], #16
        beq .Lcbc_dec_four
        ldr q4, [x0], #16
        cmp x2, #6
        blo .Lcbc_dec_five
        ldr q5, [x0], #16
        beq .Lcbc_dec_six
        ldr q6, [x0], #-6*16

        bl _bsaes_decrypt8

        ldr q5, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q8, [x0], #16
        ldr q9, [x0], #16
        ldr q10, [x0], #16
        str q0, [x1], #16 // write output
        ldr q0, [x0], #16
        eor v1.16b, v1.16b, v5.16b
        ldr q5, [x0], #16
        eor v6.16b, v6.16b, v8.16b
        ldr q15, [x0]
        eor v4.16b, v4.16b, v9.16b
        eor v2.16b, v2.16b, v10.16b
        str q1, [x1], #16
        eor v0.16b, v7.16b, v0.16b
        str q6, [x1], #16
        eor v1.16b, v3.16b, v5.16b
        str q4, [x1], #16
        str q2, [x1], #16
        str q0, [x1], #16
        str q1, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_six:
        sub x0, x0, #0x60
        bl _bsaes_decrypt8
        ldr q3, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q5, [x0], #16
        ldr q8, [x0], #16
        ldr q9, [x0], #16
        str q0, [x1], #16 // write output
        ldr q0, [x0], #16
        eor v1.16b, v1.16b, v3.16b
        ldr q15, [x0]
        eor v3.16b, v6.16b, v5.16b
        eor v4.16b, v4.16b, v8.16b
        eor v2.16b, v2.16b, v9.16b
        str q1, [x1], #16
        eor v0.16b, v7.16b, v0.16b
        str q3, [x1], #16
        str q4, [x1], #16
        str q2, [x1], #16
        str q0, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_five:
        sub x0, x0, #0x50
        bl _bsaes_decrypt8
        ldr q3, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q5, [x0], #16
        ldr q7, [x0], #16
        ldr q8, [x0], #16
        str q0, [x1], #16 // write output
        ldr q15, [x0]
        eor v0.16b, v1.16b, v3.16b
        eor v1.16b, v6.16b, v5.16b
        eor v3.16b, v4.16b, v7.16b
        str q0, [x1], #16
        eor v0.16b, v2.16b, v8.16b
        str q1, [x1], #16
        str q3, [x1], #16
        str q0, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_four:
        sub x0, x0, #0x40
        bl _bsaes_decrypt8
        ldr q2, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q3, [x0], #16
        ldr q5, [x0], #16
        str q0, [x1], #16 // write output
        ldr q15, [x0]
        eor v0.16b, v1.16b, v2.16b
        eor v1.16b, v6.16b, v3.16b
        eor v2.16b, v4.16b, v5.16b
        str q0, [x1], #16
        str q1, [x1], #16
        str q2, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_three:
        sub x0, x0, #0x30
        bl _bsaes_decrypt8
        ldr q2, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q3, [x0], #16
        ldr q15, [x0]
        str q0, [x1], #16 // write output
        eor v0.16b, v1.16b, v2.16b
        eor v1.16b, v6.16b, v3.16b
        str q0, [x1], #16
        str q1, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_two:
        sub x0, x0, #0x20
        bl _bsaes_decrypt8
        ldr q2, [x0], #16 // reload input
        eor v0.16b, v0.16b, v15.16b // ^= IV
        ldr q15, [x0]
        str q0, [x1], #16 // write output
        eor v0.16b, v1.16b, v2.16b
        str q0, [x1]
        b .Lcbc_dec_done
.align 4
.Lcbc_dec_one:
        sub x0, x0, #0x10
        stp x1, x4, [sp, #-32]!
        str x14, [sp, #16]
        mov v8.16b, v15.16b
        mov v15.16b, v0.16b
        mov x2, x3
        bl AES_decrypt
        ldr x14, [sp, #16]
        ldp x1, x4, [sp], #32
        ldr q0, [x1] // load result
        eor v0.16b, v0.16b, v8.16b // ^= IV
        str q0, [x1] // write output

.align 4
.Lcbc_dec_done:
        movi v0.16b, #0
        movi v1.16b, #0
.Lcbc_dec_bzero: // wipe key schedule [if any]
        stp q0, q1, [sp], #32
        cmp sp, x14
        bne .Lcbc_dec_bzero
        str q15, [x4] // return IV
        ldp d8, d9, [sp, #16]
        ldp d10, d15, [sp, #32]
        ldp x29, x30, [sp], #48
        ret
.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
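
// CTR encryption keeps a byte-reversed copy of the counter in v15 so
// that the per-block increments are plain vector adds of the
// 1<<96..4<<96 constants composed in v11-v14; the .LREVM0SR
// permutation then folds the byte swap back into round 0 of
// _bsaes_encrypt8_alt.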
.globl ossl_bsaes_ctr32_encrypt_blocks
.type ossl_bsaes_ctr32_encrypt_blocks,%function
.align 4
// On entry:
//   x0 -> input text (whole 16-byte blocks)
//   x1 -> output text (whole 16-byte blocks)
//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
//   x3 -> key
//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
//   Output text filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:
        AARCH64_VALID_CALL_TARGET
        cmp x2, #8 // use plain AES for
        blo .Lctr_enc_short // small sizes

        stp x29, x30, [sp, #-80]!
        stp d8, d9, [sp, #16]
        stp d10, d11, [sp, #32]
        stp d12, d13, [sp, #48]
        stp d14, d15, [sp, #64]

        ldr w15, [x3, #240] // get # of rounds
        mov x14, sp

        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x3 // pass key
        mov x10, x15 // pass # of rounds
        mov sp, x17 // sp is sp
        bl _bsaes_key_convert
        eor v7.16b, v7.16b, v15.16b // fix up last round key
        str q7, [x17] // save last round key

        ldr q0, [x4] // load counter
        add x13, x11, #.LREVM0SR-.LM0_bigendian
        ldr q4, [sp] // load round0 key

        movi v8.4s, #1 // compose 1<<96
        movi v9.16b, #0
        rev32 v15.16b, v0.16b
        rev32 v0.16b, v0.16b
        ext v11.16b, v9.16b, v8.16b, #4
        rev32 v4.16b, v4.16b
        add v12.4s, v11.4s, v11.4s // compose 2<<96
        str q4, [sp] // save adjusted round0 key
        add v13.4s, v11.4s, v12.4s // compose 3<<96
        add v14.4s, v12.4s, v12.4s // compose 4<<96
        b .Lctr_enc_loop

.align 4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter

        add v1.4s, v15.4s, v11.4s // +1
        add x9, sp, #0x10 // pass next round key
        add v2.4s, v15.4s, v12.4s // +2
        ldr q9, [x13] // .LREVM0SR
        ldr q8, [sp] // load round0 key
        add v3.4s, v15.4s, v13.4s // +3
        mov x10, x15 // pass rounds
        sub x11, x13, #.LREVM0SR-.LSR // pass constants
        add v6.4s, v2.4s, v14.4s
        add v4.4s, v15.4s, v14.4s // +4
        add v7.4s, v3.4s, v14.4s
        add v15.4s, v4.4s, v14.4s // next counter
        add v5.4s, v1.4s, v14.4s

        bl _bsaes_encrypt8_alt

        subs x2, x2, #8
        blo .Lctr_enc_loop_done

        ldr q16, [x0], #16
        ldr q17, [x0], #16
        eor v1.16b, v1.16b, v17.16b
        ldr q17, [x0], #16
        eor v0.16b, v0.16b, v16.16b
        eor v4.16b, v4.16b, v17.16b
        str q0, [x1], #16
        ldr q16, [x0], #16
        str q1, [x1], #16
        mov v0.16b, v15.16b
        str q4, [x1], #16
        ldr q1, [x0], #16
        eor v4.16b, v6.16b, v16.16b
        eor v1.16b, v3.16b, v1.16b
        ldr q3, [x0], #16
        eor v3.16b, v7.16b, v3.16b
        ldr q6, [x0], #16
        eor v2.16b, v2.16b, v6.16b
        ldr q6, [x0], #16
        eor v5.16b, v5.16b, v6.16b
        str q4, [x1], #16
        str q1, [x1], #16
        str q3, [x1], #16
        str q2, [x1], #16
        str q5, [x1], #16

        bne .Lctr_enc_loop
        b .Lctr_enc_done

.align 4
.Lctr_enc_loop_done:
        add x2, x2, #8
        ldr q16, [x0], #16 // load input
        eor v0.16b, v0.16b, v16.16b
        str q0, [x1], #16 // write output
        cmp x2, #2
        blo .Lctr_enc_done
        ldr q17, [x0], #16
        eor v1.16b, v1.16b, v17.16b
        str q1, [x1], #16
        beq .Lctr_enc_done
        ldr q18, [x0], #16
        eor v4.16b, v4.16b, v18.16b
        str q4, [x1], #16
        cmp x2, #4
        blo .Lctr_enc_done
        ldr q19, [x0], #16
        eor v6.16b, v6.16b, v19.16b
        str q6, [x1], #16
        beq .Lctr_enc_done
        ldr q20, [x0], #16
        eor v3.16b, v3.16b, v20.16b
        str q3, [x1], #16
        cmp x2, #6
        blo .Lctr_enc_done
        ldr q21, [x0], #16
        eor v7.16b, v7.16b, v21.16b
        str q7, [x1], #16
        beq .Lctr_enc_done
        ldr q22, [x0]
        eor v2.16b, v2.16b, v22.16b
        str q2, [x1], #16

.Lctr_enc_done:
        movi v0.16b, #0
        movi v1.16b, #0
.Lctr_enc_bzero: // wipe key schedule [if any]
        stp q0, q1, [sp], #32
        cmp sp, x14
        bne .Lctr_enc_bzero

        ldp d8, d9, [sp, #16]
        ldp d10, d11, [sp, #32]
        ldp d12, d13, [sp, #48]
        ldp d14, d15, [sp, #64]
        ldp x29, x30, [sp], #80
        ret
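
// For fewer than eight blocks, encrypt the counter one block at a time
// with AES_encrypt on the stack and XOR the result into the input.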
.Lctr_enc_short:
        stp x29, x30, [sp, #-96]!
        stp x19, x20, [sp, #16]
        stp x21, x22, [sp, #32]
        str x23, [sp, #48]

        mov x19, x0 // copy arguments
        mov x20, x1
        mov x21, x2
        mov x22, x3
        ldr w23, [x4, #12] // load counter .LSW
        ldr q1, [x4] // load whole counter value
#ifdef __AARCH64EL__
        rev w23, w23
#endif
        str q1, [sp, #80] // copy counter value

.Lctr_enc_short_loop:
        add x0, sp, #80 // input counter value
        add x1, sp, #64 // output on the stack
        mov x2, x22 // key

        bl AES_encrypt

        ldr q0, [x19], #16 // load input
        ldr q1, [sp, #64] // load encrypted counter
        add x23, x23, #1
#ifdef __AARCH64EL__
        rev w0, w23
        str w0, [sp, #80+12] // next counter value
#else
        str w23, [sp, #80+12] // next counter value
#endif
        eor v0.16b, v0.16b, v1.16b
        str q0, [x20], #16 // store output
        subs x21, x21, #1
        bne .Lctr_enc_short_loop

        movi v0.16b, #0
        movi v1.16b, #0
        stp q0, q1, [sp, #64]

        ldr x23, [sp, #48]
        ldp x21, x22, [sp, #32]
        ldp x19, x20, [sp, #16]
        ldp x29, x30, [sp], #96
        ret
.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
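
// XTS encryption: the initial tweak is the IV encrypted under key2,
// and each subsequent tweak is the previous one multiplied by x in
// GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; the sshr/and/ext/add/eor
// sequences against .Lxts_magic below compute that doubling.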
.globl ossl_bsaes_xts_encrypt
.type ossl_bsaes_xts_encrypt,%function
.align 4
// On entry:
//   x0 -> input plaintext
//   x1 -> output ciphertext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output ciphertext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_encrypt:
        AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //   nrounds*128-96 bytes: key schedule
        // x19 ->
        //   16 bytes: frame record
        //   4*16 bytes: tweak storage across _bsaes_encrypt8
        //   6*8 bytes: storage for 5 callee-saved general-purpose registers
        //   8*8 bytes: storage for 8 callee-saved SIMD registers
        stp x29, x30, [sp, #-192]!
        stp x19, x20, [sp, #80]
        stp x21, x22, [sp, #96]
        str x23, [sp, #112]
        stp d8, d9, [sp, #128]
        stp d10, d11, [sp, #144]
        stp d12, d13, [sp, #160]
        stp d14, d15, [sp, #176]

        mov x19, sp
        mov x20, x0
        mov x21, x1
        mov x22, x2
        mov x23, x3

        // generate initial tweak
        sub sp, sp, #16
        mov x0, x5 // iv[]
        mov x1, sp
        mov x2, x4 // key2
        bl AES_encrypt
        ldr q11, [sp], #16

        ldr w1, [x23, #240] // get # of rounds
        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x23 // pass key
        mov x10, x1 // pass # of rounds
        mov sp, x17
        bl _bsaes_key_convert
        eor v15.16b, v15.16b, v7.16b // fix up last round key
        str q15, [x17] // save last round key

        subs x22, x22, #0x80
        blo .Lxts_enc_short
        b .Lxts_enc_loop

.align 4
.Lxts_enc_loop:
        ldr q8, .Lxts_magic
        mov x10, x1 // pass rounds
        add x2, x19, #16
        ldr q0, [x20], #16
        sshr v1.2d, v11.2d, #63
        mov x9, sp // pass key schedule
        ldr q6, .Lxts_magic+16
        add v2.2d, v11.2d, v11.2d
        cmtst v3.2d, v11.2d, v6.2d
        and v1.16b, v1.16b, v8.16b
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        ldr q4, [x20], #16
        eor v12.16b, v2.16b, v1.16b
        eor v1.16b, v4.16b, v12.16b
        eor v0.16b, v0.16b, v11.16b
        cmtst v2.2d, v12.2d, v6.2d
        add v4.2d, v12.2d, v12.2d
        add x0, x19, #16
        ext v3.16b, v3.16b, v3.16b, #8
        and v2.16b, v2.16b, v8.16b
        eor v13.16b, v4.16b, v3.16b
        ldr q3, [x20], #16
        ext v4.16b, v2.16b, v2.16b, #8
        eor v2.16b, v3.16b, v13.16b
        ldr q3, [x20], #16
        add v5.2d, v13.2d, v13.2d
        cmtst v7.2d, v13.2d, v6.2d
        and v7.16b, v7.16b, v8.16b
        ldr q9, [x20], #16
        ext v7.16b, v7.16b, v7.16b, #8
        ldr q10, [x20], #16
        eor v14.16b, v5.16b, v4.16b
        ldr q16, [x20], #16
        add v4.2d, v14.2d, v14.2d
        eor v3.16b, v3.16b, v14.16b
        eor v15.16b, v4.16b, v7.16b
        add v5.2d, v15.2d, v15.2d
        ldr q7, [x20], #16
        cmtst v4.2d, v14.2d, v6.2d
        and v17.16b, v4.16b, v8.16b
        cmtst v18.2d, v15.2d, v6.2d
        eor v4.16b, v9.16b, v15.16b
        ext v9.16b, v17.16b, v17.16b, #8
        eor v9.16b, v5.16b, v9.16b
        add v17.2d, v9.2d, v9.2d
        and v18.16b, v18.16b, v8.16b
        eor v5.16b, v10.16b, v9.16b
        str q9, [x2], #16
        ext v10.16b, v18.16b, v18.16b, #8
        cmtst v9.2d, v9.2d, v6.2d
        and v9.16b, v9.16b, v8.16b
        eor v10.16b, v17.16b, v10.16b
        cmtst v17.2d, v10.2d, v6.2d
        eor v6.16b, v16.16b, v10.16b
        str q10, [x2], #16
        ext v9.16b, v9.16b, v9.16b, #8
        add v10.2d, v10.2d, v10.2d
        eor v9.16b, v10.16b, v9.16b
        str q9, [x2], #16
        eor v7.16b, v7.16b, v9.16b
        add v9.2d, v9.2d, v9.2d
        and v8.16b, v17.16b, v8.16b
        ext v8.16b, v8.16b, v8.16b, #8
        eor v8.16b, v9.16b, v8.16b
        str q8, [x2] // next round tweak

        bl _bsaes_encrypt8

        ldr q8, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q9, [x0], #16
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        ldr q10, [x0], #16
        eor v3.16b, v3.16b, v15.16b
        subs x22, x22, #0x80
        str q0, [x21], #16
        ldr q11, [x0] // next round tweak
        str q1, [x21], #16
        eor v0.16b, v7.16b, v8.16b
        eor v1.16b, v2.16b, v9.16b
        str q4, [x21], #16
        eor v2.16b, v5.16b, v10.16b
        str q6, [x21], #16
        str q3, [x21], #16
        str q0, [x21], #16
        str q1, [x21], #16
        str q2, [x21], #16
        bpl .Lxts_enc_loop
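
// Tail handling: one to seven remaining blocks are dispatched by the
// subs/bmi chain below, peeling off one input block and one tweak per
// step; a single block falls through to .Lxts_enc_1, which uses
// AES_encrypt directly.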
.Lxts_enc_short:
        adds x22, x22, #0x70
        bmi .Lxts_enc_done

        ldr q8, .Lxts_magic
        sshr v1.2d, v11.2d, #63
        add v2.2d, v11.2d, v11.2d
        ldr q9, .Lxts_magic+16
        subs x22, x22, #0x10
        ldr q0, [x20], #16
        and v1.16b, v1.16b, v8.16b
        cmtst v3.2d, v11.2d, v9.2d
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        eor v12.16b, v2.16b, v1.16b
        ext v1.16b, v3.16b, v3.16b, #8
        add v2.2d, v12.2d, v12.2d
        cmtst v3.2d, v12.2d, v9.2d
        eor v13.16b, v2.16b, v1.16b
        and v22.16b, v3.16b, v8.16b
        bmi .Lxts_enc_1

        ext v2.16b, v22.16b, v22.16b, #8
        add v3.2d, v13.2d, v13.2d
        ldr q1, [x20], #16
        cmtst v4.2d, v13.2d, v9.2d
        subs x22, x22, #0x10
        eor v14.16b, v3.16b, v2.16b
        and v23.16b, v4.16b, v8.16b
        bmi .Lxts_enc_2

        ext v3.16b, v23.16b, v23.16b, #8
        add v4.2d, v14.2d, v14.2d
        ldr q2, [x20], #16
        cmtst v5.2d, v14.2d, v9.2d
        eor v0.16b, v0.16b, v11.16b
        subs x22, x22, #0x10
        eor v15.16b, v4.16b, v3.16b
        and v24.16b, v5.16b, v8.16b
        bmi .Lxts_enc_3

        ext v4.16b, v24.16b, v24.16b, #8
        add v5.2d, v15.2d, v15.2d
        ldr q3, [x20], #16
        cmtst v6.2d, v15.2d, v9.2d
        eor v1.16b, v1.16b, v12.16b
        subs x22, x22, #0x10
        eor v16.16b, v5.16b, v4.16b
        and v25.16b, v6.16b, v8.16b
        bmi .Lxts_enc_4

        ext v5.16b, v25.16b, v25.16b, #8
        add v6.2d, v16.2d, v16.2d
        add x0, x19, #16
        cmtst v7.2d, v16.2d, v9.2d
        ldr q4, [x20], #16
        eor v2.16b, v2.16b, v13.16b
        str q16, [x0], #16
        subs x22, x22, #0x10
        eor v17.16b, v6.16b, v5.16b
        and v26.16b, v7.16b, v8.16b
        bmi .Lxts_enc_5

        ext v7.16b, v26.16b, v26.16b, #8
        add v18.2d, v17.2d, v17.2d
        ldr q5, [x20], #16
        eor v3.16b, v3.16b, v14.16b
        str q17, [x0], #16
        subs x22, x22, #0x10
        eor v18.16b, v18.16b, v7.16b
        bmi .Lxts_enc_6

        ldr q6, [x20], #16
        eor v4.16b, v4.16b, v15.16b
        eor v5.16b, v5.16b, v16.16b
        str q18, [x0] // next round tweak
        mov x9, sp // pass key schedule
        mov x10, x1
        add x0, x19, #16
        sub x22, x22, #0x10
        eor v6.16b, v6.16b, v17.16b

        bl _bsaes_encrypt8

        ldr q16, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q17, [x0], #16
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        eor v3.16b, v3.16b, v15.16b
        ldr q11, [x0] // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        eor v0.16b, v7.16b, v16.16b
        eor v1.16b, v2.16b, v17.16b
        str q4, [x21], #16
        str q6, [x21], #16
        str q3, [x21], #16
        str q0, [x21], #16
        str q1, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_6:
        eor v4.16b, v4.16b, v15.16b
        eor v5.16b, v5.16b, v16.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        ldr q16, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        ldr q11, [x0] // next round tweak
        eor v3.16b, v3.16b, v15.16b
        str q0, [x21], #16
        str q1, [x21], #16
        eor v0.16b, v7.16b, v16.16b
        str q4, [x21], #16
        str q6, [x21], #16
        str q3, [x21], #16
        str q0, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_5:
        eor v3.16b, v3.16b, v14.16b
        eor v4.16b, v4.16b, v15.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q11, [x0] // next round tweak
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        eor v3.16b, v3.16b, v15.16b
        str q0, [x21], #16
        str q1, [x21], #16
        str q4, [x21], #16
        str q6, [x21], #16
        str q3, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_4:
        eor v2.16b, v2.16b, v13.16b
        eor v3.16b, v3.16b, v14.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v4.16b, v4.16b, v13.16b
        eor v6.16b, v6.16b, v14.16b
        mov v11.16b, v15.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        str q4, [x21], #16
        str q6, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_3:
        eor v1.16b, v1.16b, v12.16b
        eor v2.16b, v2.16b, v13.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v4.16b, v4.16b, v13.16b
        mov v11.16b, v14.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        str q4, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_2:
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_encrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        mov v11.16b, v13.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        b .Lxts_enc_done

.align 4
.Lxts_enc_1:
        eor v0.16b, v0.16b, v11.16b
        sub x0, sp, #16
        sub x1, sp, #16
        mov x2, x23
        mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
        mov v14.d[0], v12.d[1]
        str q0, [sp, #-16]!

        bl AES_encrypt

        ldr q0, [sp], #16
        trn1 v13.2d, v11.2d, v13.2d
        trn1 v11.2d, v12.2d, v14.2d // next round tweak
        eor v0.16b, v0.16b, v13.16b
        str q0, [x21], #16

.Lxts_enc_done:
        adds x22, x22, #0x10
        beq .Lxts_enc_ret

        sub x6, x21, #0x10
        // Penultimate plaintext block produces final ciphertext part-block
        // plus remaining part of final plaintext block. Move ciphertext part
        // to final position and reuse penultimate ciphertext block buffer to
        // construct final plaintext block
.Lxts_enc_steal:
        ldrb w0, [x20], #1
        ldrb w1, [x21, #-0x10]
        strb w0, [x21, #-0x10]
        strb w1, [x21], #1

        subs x22, x22, #1
        bhi .Lxts_enc_steal

        // Finally encrypt the penultimate ciphertext block using the
        // last tweak
        ldr q0, [x6]
        eor v0.16b, v0.16b, v11.16b
        str q0, [sp, #-16]!
        mov x0, sp
        mov x1, sp
        mov x2, x23
        mov x21, x6
        mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

        bl AES_encrypt

        trn1 v11.2d, v11.2d, v13.2d
        ldr q0, [sp], #16
        eor v0.16b, v0.16b, v11.16b
        str q0, [x21]

.Lxts_enc_ret:

        movi v0.16b, #0
        movi v1.16b, #0
.Lxts_enc_bzero: // wipe key schedule
        stp q0, q1, [sp], #32
        cmp sp, x19
        bne .Lxts_enc_bzero

        ldp x19, x20, [sp, #80]
        ldp x21, x22, [sp, #96]
        ldr x23, [sp, #112]
        ldp d8, d9, [sp, #128]
        ldp d10, d11, [sp, #144]
        ldp d12, d13, [sp, #160]
        ldp d14, d15, [sp, #176]
        ldp x29, x30, [sp], #192
        ret
.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align 5
.Lxts_magic:
.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000
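
// XTS decryption mirrors encryption, except that when the length is
// not a multiple of 16 one extra full block is held back (the csel in
// the prologue below) for the ciphertext-stealing path at
// .Lxts_dec_done, which computes one extra tweak for the stolen
// part-block.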
.globl ossl_bsaes_xts_decrypt
.type ossl_bsaes_xts_decrypt,%function
.align 4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
        AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //   nrounds*128-96 bytes: key schedule
        // x19 ->
        //   16 bytes: frame record
        //   4*16 bytes: tweak storage across _bsaes_decrypt8
        //   6*8 bytes: storage for 5 callee-saved general-purpose registers
        //   8*8 bytes: storage for 8 callee-saved SIMD registers
        stp x29, x30, [sp, #-192]!
        stp x19, x20, [sp, #80]
        stp x21, x22, [sp, #96]
        str x23, [sp, #112]
        stp d8, d9, [sp, #128]
        stp d10, d11, [sp, #144]
        stp d12, d13, [sp, #160]
        stp d14, d15, [sp, #176]

        mov x19, sp
        mov x20, x0
        mov x21, x1
        mov x22, x2
        mov x23, x3

        // generate initial tweak
        sub sp, sp, #16
        mov x0, x5 // iv[]
        mov x1, sp
        mov x2, x4 // key2
        bl AES_encrypt
        ldr q11, [sp], #16

        ldr w1, [x23, #240] // get # of rounds
        // allocate the key schedule on the stack
        add x17, sp, #96
        sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov x9, x23 // pass key
        mov x10, x1 // pass # of rounds
        mov sp, x17
        bl _bsaes_key_convert
        ldr q6, [sp]
        str q15, [x17] // save last round key
        eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
        str q6, [sp]

        sub x30, x22, #0x10
        tst x22, #0xf // if not multiple of 16
        csel x22, x30, x22, ne // subtract another 16 bytes
        subs x22, x22, #0x80

        blo .Lxts_dec_short
        b .Lxts_dec_loop

.align 4
.Lxts_dec_loop:
        ldr q8, .Lxts_magic
        mov x10, x1 // pass rounds
        add x2, x19, #16
        ldr q0, [x20], #16
        sshr v1.2d, v11.2d, #63
        mov x9, sp // pass key schedule
        ldr q6, .Lxts_magic+16
        add v2.2d, v11.2d, v11.2d
        cmtst v3.2d, v11.2d, v6.2d
        and v1.16b, v1.16b, v8.16b
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        ldr q4, [x20], #16
        eor v12.16b, v2.16b, v1.16b
        eor v1.16b, v4.16b, v12.16b
        eor v0.16b, v0.16b, v11.16b
        cmtst v2.2d, v12.2d, v6.2d
        add v4.2d, v12.2d, v12.2d
        add x0, x19, #16
        ext v3.16b, v3.16b, v3.16b, #8
        and v2.16b, v2.16b, v8.16b
        eor v13.16b, v4.16b, v3.16b
        ldr q3, [x20], #16
        ext v4.16b, v2.16b, v2.16b, #8
        eor v2.16b, v3.16b, v13.16b
        ldr q3, [x20], #16
        add v5.2d, v13.2d, v13.2d
        cmtst v7.2d, v13.2d, v6.2d
        and v7.16b, v7.16b, v8.16b
        ldr q9, [x20], #16
        ext v7.16b, v7.16b, v7.16b, #8
        ldr q10, [x20], #16
        eor v14.16b, v5.16b, v4.16b
        ldr q16, [x20], #16
        add v4.2d, v14.2d, v14.2d
        eor v3.16b, v3.16b, v14.16b
        eor v15.16b, v4.16b, v7.16b
        add v5.2d, v15.2d, v15.2d
        ldr q7, [x20], #16
        cmtst v4.2d, v14.2d, v6.2d
        and v17.16b, v4.16b, v8.16b
        cmtst v18.2d, v15.2d, v6.2d
        eor v4.16b, v9.16b, v15.16b
        ext v9.16b, v17.16b, v17.16b, #8
        eor v9.16b, v5.16b, v9.16b
        add v17.2d, v9.2d, v9.2d
        and v18.16b, v18.16b, v8.16b
        eor v5.16b, v10.16b, v9.16b
        str q9, [x2], #16
        ext v10.16b, v18.16b, v18.16b, #8
        cmtst v9.2d, v9.2d, v6.2d
        and v9.16b, v9.16b, v8.16b
        eor v10.16b, v17.16b, v10.16b
        cmtst v17.2d, v10.2d, v6.2d
        eor v6.16b, v16.16b, v10.16b
        str q10, [x2], #16
        ext v9.16b, v9.16b, v9.16b, #8
        add v10.2d, v10.2d, v10.2d
        eor v9.16b, v10.16b, v9.16b
        str q9, [x2], #16
        eor v7.16b, v7.16b, v9.16b
        add v9.2d, v9.2d, v9.2d
        and v8.16b, v17.16b, v8.16b
        ext v8.16b, v8.16b, v8.16b, #8
        eor v8.16b, v9.16b, v8.16b
        str q8, [x2] // next round tweak

        bl _bsaes_decrypt8

        eor v6.16b, v6.16b, v13.16b
        eor v0.16b, v0.16b, v11.16b
        ldr q8, [x0], #16
        eor v7.16b, v7.16b, v8.16b
        str q0, [x21], #16
        eor v0.16b, v1.16b, v12.16b
        ldr q1, [x0], #16
        eor v1.16b, v3.16b, v1.16b
        subs x22, x22, #0x80
        eor v2.16b, v2.16b, v15.16b
        eor     v3.16b, v4.16b, v14.16b
        ldr     q4, [x0], #16
        str     q0, [x21], #16
        ldr     q11, [x0]               // next round tweak
        eor     v0.16b, v5.16b, v4.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q2, [x21], #16
        str     q7, [x21], #16
        str     q1, [x21], #16
        str     q0, [x21], #16
        bpl     .Lxts_dec_loop

.Lxts_dec_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_dec_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_dec_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_dec_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_dec_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_dec_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_dec_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b
        bmi     .Lxts_dec_6

        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]               // next round tweak
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        ldr     q11, [x0]               // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v3.16b, v17.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

.align 4
.Lxts_dec_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        ldr     q11, [x0]               // next round tweak
        eor     v2.16b, v2.16b, v15.16b
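        // As in the main loop, the six outputs arrive in the permuted
        // order v0, v1, v6, v4, v2, v7; they are unmasked with tweaks
        // v11-v15 plus the one reloaded through x0, then stored in
        // input order.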
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_dec_done

.align 4
.Lxts_dec_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]               // next round tweak
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        b       .Lxts_dec_done

.align 4
.Lxts_dec_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        mov     v11.16b, v15.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_dec_done

.align 4
.Lxts_dec_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        mov     v11.16b, v14.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_dec_done

.align 4
.Lxts_dec_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

.align 4
.Lxts_dec_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]      // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!

        bl      AES_decrypt

        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d
        trn1    v11.2d, v12.2d, v14.2d  // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_dec_done:
        adds    x22, x22, #0x10
        beq     .Lxts_dec_ret

        // calculate one round of extra tweak for the stolen ciphertext
        ldr     q8, .Lxts_magic
        sshr    v6.2d, v11.2d, #63
        and     v6.16b, v6.16b, v8.16b
        add     v12.2d, v11.2d, v11.2d
        ext     v6.16b, v6.16b, v6.16b, #8
        eor     v12.16b, v12.16b, v6.16b

        // perform the final decryption with the last tweak value
        ldr     q0, [x20], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [sp, #-16]!
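        // The one-off block goes through the generic AES_decrypt routine:
        // x0 and x1 both point at the 16-byte stack buffer (an in-place
        // call) and x2 carries key1. Under the AAPCS64 only the low 64
        // bits of v8-v15 survive a call, hence the high halves of the
        // tweaks in v11/v12 are stashed in v13/v14 and knitted back with
        // trn1 afterwards.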
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     v13.d[0], v11.d[1]      // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]

        bl      AES_decrypt

        trn1    v12.2d, v12.2d, v14.2d
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [x21]

        mov     x6, x21
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and reuse penultimate plaintext block buffer to
        // construct final ciphertext block
.Lxts_dec_steal:
        ldrb    w1, [x21]
        ldrb    w0, [x20], #1
        strb    w1, [x21, #0x10]
        strb    w0, [x21], #1

        subs    x22, x22, #1
        bhi     .Lxts_dec_steal

        // Finally decrypt the reassembled ciphertext block, now sitting in
        // the penultimate plaintext block's buffer, using the penultimate
        // tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6

        bl      AES_decrypt

        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_dec_ret:

        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_dec_bzero:                        // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_dec_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
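// Note on the stealing sequence above: for a trailing partial block of
// r bytes, decrypting the penultimate ciphertext block with the *last*
// tweak yields the final r plaintext bytes (moved out to their final
// position by .Lxts_dec_steal) followed by the stolen tail of the full
// final ciphertext block; the loop splices the r remaining input bytes
// over the moved-out region, and the reassembled block is then decrypted
// with the penultimate tweak to recover the penultimate plaintext block.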