#!/usr/bin/env perl
# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.
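//
// The data path is bit-sliced: eight blocks travel in v0-v7, and the
// ushr/eor/and/shl/eor runs below (masks 0x55/0x33/0x0f with shifts
// 1/2/4) are the usual "swapmove" bit-matrix transpose steps. As a
// rough illustrative C sketch of a single swapmove step (exposition
// only, not part of the generated code):
//
//     uint64_t t = ((a >> n) ^ b) & mask;  // bits where a>>n and b differ
//     b ^= t;                              // b takes a's shifted bits
//     a ^= t << n;                         // a takes b's original bits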
57 58 59#include "crypto/arm_arch.h" 60 61.text 62 63.extern AES_cbc_encrypt 64.extern AES_encrypt 65.extern AES_decrypt 66 67.type _bsaes_decrypt8,%function 68.align 4 69// On entry: 70// x9 -> key (previously expanded using _bsaes_key_convert) 71// x10 = number of rounds 72// v0-v7 input data 73// On exit: 74// x9-x11 corrupted 75// other general-purpose registers preserved 76// v0-v7 output data 77// v11-v15 preserved 78// other SIMD registers corrupted 79_bsaes_decrypt8: 80 ldr q8, [x9], #16 81 adr x11, .LM0ISR 82 movi v9.16b, #0x55 83 ldr q10, [x11], #16 84 movi v16.16b, #0x33 85 movi v17.16b, #0x0f 86 sub x10, x10, #1 87 eor v0.16b, v0.16b, v8.16b 88 eor v1.16b, v1.16b, v8.16b 89 eor v2.16b, v2.16b, v8.16b 90 eor v4.16b, v4.16b, v8.16b 91 eor v3.16b, v3.16b, v8.16b 92 eor v5.16b, v5.16b, v8.16b 93 tbl v0.16b, {v0.16b}, v10.16b 94 tbl v1.16b, {v1.16b}, v10.16b 95 tbl v2.16b, {v2.16b}, v10.16b 96 tbl v4.16b, {v4.16b}, v10.16b 97 eor v6.16b, v6.16b, v8.16b 98 eor v7.16b, v7.16b, v8.16b 99 tbl v3.16b, {v3.16b}, v10.16b 100 tbl v5.16b, {v5.16b}, v10.16b 101 tbl v6.16b, {v6.16b}, v10.16b 102 ushr v8.2d, v0.2d, #1 103 tbl v7.16b, {v7.16b}, v10.16b 104 ushr v10.2d, v4.2d, #1 105 ushr v18.2d, v2.2d, #1 106 eor v8.16b, v8.16b, v1.16b 107 ushr v19.2d, v6.2d, #1 108 eor v10.16b, v10.16b, v5.16b 109 eor v18.16b, v18.16b, v3.16b 110 and v8.16b, v8.16b, v9.16b 111 eor v19.16b, v19.16b, v7.16b 112 and v10.16b, v10.16b, v9.16b 113 and v18.16b, v18.16b, v9.16b 114 eor v1.16b, v1.16b, v8.16b 115 shl v8.2d, v8.2d, #1 116 and v9.16b, v19.16b, v9.16b 117 eor v5.16b, v5.16b, v10.16b 118 shl v10.2d, v10.2d, #1 119 eor v3.16b, v3.16b, v18.16b 120 shl v18.2d, v18.2d, #1 121 eor v0.16b, v0.16b, v8.16b 122 shl v8.2d, v9.2d, #1 123 eor v7.16b, v7.16b, v9.16b 124 eor v4.16b, v4.16b, v10.16b 125 eor v2.16b, v2.16b, v18.16b 126 ushr v9.2d, v1.2d, #2 127 eor v6.16b, v6.16b, v8.16b 128 ushr v8.2d, v0.2d, #2 129 ushr v10.2d, v5.2d, #2 130 ushr v18.2d, v4.2d, #2 131 eor v9.16b, v9.16b, v3.16b 132 eor v8.16b, v8.16b, v2.16b 133 eor v10.16b, v10.16b, v7.16b 134 eor v18.16b, v18.16b, v6.16b 135 and v9.16b, v9.16b, v16.16b 136 and v8.16b, v8.16b, v16.16b 137 and v10.16b, v10.16b, v16.16b 138 and v16.16b, v18.16b, v16.16b 139 eor v3.16b, v3.16b, v9.16b 140 shl v9.2d, v9.2d, #2 141 eor v2.16b, v2.16b, v8.16b 142 shl v8.2d, v8.2d, #2 143 eor v7.16b, v7.16b, v10.16b 144 shl v10.2d, v10.2d, #2 145 eor v6.16b, v6.16b, v16.16b 146 shl v16.2d, v16.2d, #2 147 eor v1.16b, v1.16b, v9.16b 148 eor v0.16b, v0.16b, v8.16b 149 eor v5.16b, v5.16b, v10.16b 150 eor v4.16b, v4.16b, v16.16b 151 ushr v8.2d, v3.2d, #4 152 ushr v9.2d, v2.2d, #4 153 ushr v10.2d, v1.2d, #4 154 ushr v16.2d, v0.2d, #4 155 eor v8.16b, v8.16b, v7.16b 156 eor v9.16b, v9.16b, v6.16b 157 eor v10.16b, v10.16b, v5.16b 158 eor v16.16b, v16.16b, v4.16b 159 and v8.16b, v8.16b, v17.16b 160 and v9.16b, v9.16b, v17.16b 161 and v10.16b, v10.16b, v17.16b 162 and v16.16b, v16.16b, v17.16b 163 eor v7.16b, v7.16b, v8.16b 164 shl v8.2d, v8.2d, #4 165 eor v6.16b, v6.16b, v9.16b 166 shl v9.2d, v9.2d, #4 167 eor v5.16b, v5.16b, v10.16b 168 shl v10.2d, v10.2d, #4 169 eor v4.16b, v4.16b, v16.16b 170 shl v16.2d, v16.2d, #4 171 eor v3.16b, v3.16b, v8.16b 172 eor v2.16b, v2.16b, v9.16b 173 eor v1.16b, v1.16b, v10.16b 174 eor v0.16b, v0.16b, v16.16b 175 b .Ldec_sbox 176.align 4 177.Ldec_loop: 178 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 179 ldp q8, q9, [x9], #32 180 eor v0.16b, v16.16b, v0.16b 181 ldr q10, [x9], #16 182 eor v1.16b, v17.16b, v1.16b 183 ldr q16, [x9], #16 184 eor 
v2.16b, v18.16b, v2.16b 185 eor v3.16b, v19.16b, v3.16b 186 eor v4.16b, v8.16b, v4.16b 187 eor v5.16b, v9.16b, v5.16b 188 eor v6.16b, v10.16b, v6.16b 189 eor v7.16b, v16.16b, v7.16b 190 tbl v0.16b, {v0.16b}, v28.16b 191 tbl v1.16b, {v1.16b}, v28.16b 192 tbl v2.16b, {v2.16b}, v28.16b 193 tbl v3.16b, {v3.16b}, v28.16b 194 tbl v4.16b, {v4.16b}, v28.16b 195 tbl v5.16b, {v5.16b}, v28.16b 196 tbl v6.16b, {v6.16b}, v28.16b 197 tbl v7.16b, {v7.16b}, v28.16b 198.Ldec_sbox: 199 eor v1.16b, v1.16b, v4.16b 200 eor v3.16b, v3.16b, v4.16b 201 subs x10, x10, #1 202 eor v4.16b, v4.16b, v7.16b 203 eor v2.16b, v2.16b, v7.16b 204 eor v1.16b, v1.16b, v6.16b 205 eor v6.16b, v6.16b, v4.16b 206 eor v2.16b, v2.16b, v5.16b 207 eor v0.16b, v0.16b, v1.16b 208 eor v7.16b, v7.16b, v6.16b 209 eor v8.16b, v6.16b, v2.16b 210 and v9.16b, v4.16b, v6.16b 211 eor v10.16b, v2.16b, v6.16b 212 eor v3.16b, v3.16b, v0.16b 213 eor v5.16b, v5.16b, v0.16b 214 eor v16.16b, v7.16b, v4.16b 215 eor v17.16b, v4.16b, v0.16b 216 and v18.16b, v0.16b, v2.16b 217 eor v19.16b, v7.16b, v4.16b 218 eor v1.16b, v1.16b, v3.16b 219 eor v20.16b, v3.16b, v0.16b 220 eor v21.16b, v5.16b, v2.16b 221 eor v22.16b, v3.16b, v7.16b 222 and v8.16b, v17.16b, v8.16b 223 orr v17.16b, v3.16b, v5.16b 224 eor v23.16b, v1.16b, v6.16b 225 eor v24.16b, v20.16b, v16.16b 226 eor v25.16b, v1.16b, v5.16b 227 orr v26.16b, v20.16b, v21.16b 228 and v20.16b, v20.16b, v21.16b 229 and v27.16b, v7.16b, v1.16b 230 eor v21.16b, v21.16b, v23.16b 231 orr v28.16b, v16.16b, v23.16b 232 orr v29.16b, v22.16b, v25.16b 233 eor v26.16b, v26.16b, v8.16b 234 and v16.16b, v16.16b, v23.16b 235 and v22.16b, v22.16b, v25.16b 236 and v21.16b, v24.16b, v21.16b 237 eor v8.16b, v28.16b, v8.16b 238 eor v23.16b, v5.16b, v2.16b 239 eor v24.16b, v1.16b, v6.16b 240 eor v16.16b, v16.16b, v22.16b 241 eor v22.16b, v3.16b, v0.16b 242 eor v25.16b, v29.16b, v21.16b 243 eor v21.16b, v26.16b, v21.16b 244 eor v8.16b, v8.16b, v20.16b 245 eor v26.16b, v23.16b, v24.16b 246 eor v16.16b, v16.16b, v20.16b 247 eor v28.16b, v22.16b, v19.16b 248 eor v20.16b, v25.16b, v20.16b 249 eor v9.16b, v21.16b, v9.16b 250 eor v8.16b, v8.16b, v18.16b 251 eor v18.16b, v5.16b, v1.16b 252 eor v21.16b, v16.16b, v17.16b 253 eor v16.16b, v16.16b, v17.16b 254 eor v17.16b, v20.16b, v27.16b 255 eor v20.16b, v3.16b, v7.16b 256 eor v25.16b, v9.16b, v8.16b 257 eor v27.16b, v0.16b, v4.16b 258 and v29.16b, v9.16b, v17.16b 259 eor v30.16b, v8.16b, v29.16b 260 eor v31.16b, v21.16b, v29.16b 261 eor v29.16b, v21.16b, v29.16b 262 bsl v30.16b, v17.16b, v21.16b 263 bsl v31.16b, v9.16b, v8.16b 264 bsl v16.16b, v30.16b, v29.16b 265 bsl v21.16b, v29.16b, v30.16b 266 eor v8.16b, v31.16b, v30.16b 267 and v1.16b, v1.16b, v31.16b 268 and v9.16b, v16.16b, v31.16b 269 and v6.16b, v6.16b, v30.16b 270 eor v16.16b, v17.16b, v21.16b 271 and v4.16b, v4.16b, v30.16b 272 eor v17.16b, v8.16b, v30.16b 273 and v21.16b, v24.16b, v8.16b 274 eor v9.16b, v9.16b, v25.16b 275 and v19.16b, v19.16b, v8.16b 276 eor v24.16b, v30.16b, v16.16b 277 eor v25.16b, v30.16b, v16.16b 278 and v7.16b, v7.16b, v17.16b 279 and v10.16b, v10.16b, v16.16b 280 eor v29.16b, v9.16b, v16.16b 281 eor v30.16b, v31.16b, v9.16b 282 and v0.16b, v24.16b, v0.16b 283 and v9.16b, v18.16b, v9.16b 284 and v2.16b, v25.16b, v2.16b 285 eor v10.16b, v10.16b, v6.16b 286 eor v18.16b, v29.16b, v16.16b 287 and v5.16b, v30.16b, v5.16b 288 eor v24.16b, v8.16b, v29.16b 289 and v25.16b, v26.16b, v29.16b 290 and v26.16b, v28.16b, v29.16b 291 eor v8.16b, v8.16b, v29.16b 292 eor v17.16b, v17.16b, v18.16b 293 eor v5.16b, v1.16b, 
v5.16b 294 and v23.16b, v24.16b, v23.16b 295 eor v21.16b, v21.16b, v25.16b 296 eor v19.16b, v19.16b, v26.16b 297 eor v0.16b, v4.16b, v0.16b 298 and v3.16b, v17.16b, v3.16b 299 eor v1.16b, v9.16b, v1.16b 300 eor v9.16b, v25.16b, v23.16b 301 eor v5.16b, v5.16b, v21.16b 302 eor v2.16b, v6.16b, v2.16b 303 and v6.16b, v8.16b, v22.16b 304 eor v3.16b, v7.16b, v3.16b 305 and v8.16b, v20.16b, v18.16b 306 eor v10.16b, v10.16b, v9.16b 307 eor v0.16b, v0.16b, v19.16b 308 eor v9.16b, v1.16b, v9.16b 309 eor v1.16b, v2.16b, v21.16b 310 eor v3.16b, v3.16b, v19.16b 311 and v16.16b, v27.16b, v16.16b 312 eor v17.16b, v26.16b, v6.16b 313 eor v6.16b, v8.16b, v7.16b 314 eor v7.16b, v1.16b, v9.16b 315 eor v1.16b, v5.16b, v3.16b 316 eor v2.16b, v10.16b, v3.16b 317 eor v4.16b, v16.16b, v4.16b 318 eor v8.16b, v6.16b, v17.16b 319 eor v5.16b, v9.16b, v3.16b 320 eor v9.16b, v0.16b, v1.16b 321 eor v6.16b, v7.16b, v1.16b 322 eor v0.16b, v4.16b, v17.16b 323 eor v4.16b, v8.16b, v7.16b 324 eor v7.16b, v9.16b, v2.16b 325 eor v8.16b, v3.16b, v0.16b 326 eor v7.16b, v7.16b, v5.16b 327 eor v3.16b, v4.16b, v7.16b 328 eor v4.16b, v7.16b, v0.16b 329 eor v7.16b, v8.16b, v3.16b 330 bcc .Ldec_done 331 ext v8.16b, v0.16b, v0.16b, #8 332 ext v9.16b, v1.16b, v1.16b, #8 333 ldr q28, [x11] // load from .LISR in common case (x10 > 0) 334 ext v10.16b, v6.16b, v6.16b, #8 335 ext v16.16b, v3.16b, v3.16b, #8 336 ext v17.16b, v5.16b, v5.16b, #8 337 ext v18.16b, v4.16b, v4.16b, #8 338 eor v8.16b, v8.16b, v0.16b 339 eor v9.16b, v9.16b, v1.16b 340 eor v10.16b, v10.16b, v6.16b 341 eor v16.16b, v16.16b, v3.16b 342 eor v17.16b, v17.16b, v5.16b 343 ext v19.16b, v2.16b, v2.16b, #8 344 ext v20.16b, v7.16b, v7.16b, #8 345 eor v18.16b, v18.16b, v4.16b 346 eor v6.16b, v6.16b, v8.16b 347 eor v8.16b, v2.16b, v10.16b 348 eor v4.16b, v4.16b, v9.16b 349 eor v2.16b, v19.16b, v2.16b 350 eor v9.16b, v20.16b, v7.16b 351 eor v0.16b, v0.16b, v16.16b 352 eor v1.16b, v1.16b, v16.16b 353 eor v6.16b, v6.16b, v17.16b 354 eor v8.16b, v8.16b, v16.16b 355 eor v7.16b, v7.16b, v18.16b 356 eor v4.16b, v4.16b, v16.16b 357 eor v2.16b, v3.16b, v2.16b 358 eor v1.16b, v1.16b, v17.16b 359 eor v3.16b, v5.16b, v9.16b 360 eor v5.16b, v8.16b, v17.16b 361 eor v7.16b, v7.16b, v17.16b 362 ext v8.16b, v0.16b, v0.16b, #12 363 ext v9.16b, v6.16b, v6.16b, #12 364 ext v10.16b, v4.16b, v4.16b, #12 365 ext v16.16b, v1.16b, v1.16b, #12 366 ext v17.16b, v5.16b, v5.16b, #12 367 ext v18.16b, v7.16b, v7.16b, #12 368 eor v0.16b, v0.16b, v8.16b 369 eor v6.16b, v6.16b, v9.16b 370 eor v4.16b, v4.16b, v10.16b 371 ext v19.16b, v2.16b, v2.16b, #12 372 ext v20.16b, v3.16b, v3.16b, #12 373 eor v1.16b, v1.16b, v16.16b 374 eor v5.16b, v5.16b, v17.16b 375 eor v7.16b, v7.16b, v18.16b 376 eor v2.16b, v2.16b, v19.16b 377 eor v16.16b, v16.16b, v0.16b 378 eor v3.16b, v3.16b, v20.16b 379 eor v17.16b, v17.16b, v4.16b 380 eor v10.16b, v10.16b, v6.16b 381 ext v0.16b, v0.16b, v0.16b, #8 382 eor v9.16b, v9.16b, v1.16b 383 ext v1.16b, v1.16b, v1.16b, #8 384 eor v8.16b, v8.16b, v3.16b 385 eor v16.16b, v16.16b, v3.16b 386 eor v18.16b, v18.16b, v5.16b 387 eor v19.16b, v19.16b, v7.16b 388 ext v21.16b, v5.16b, v5.16b, #8 389 ext v5.16b, v7.16b, v7.16b, #8 390 eor v7.16b, v20.16b, v2.16b 391 ext v4.16b, v4.16b, v4.16b, #8 392 ext v20.16b, v3.16b, v3.16b, #8 393 eor v17.16b, v17.16b, v3.16b 394 ext v2.16b, v2.16b, v2.16b, #8 395 eor v3.16b, v10.16b, v3.16b 396 ext v10.16b, v6.16b, v6.16b, #8 397 eor v0.16b, v0.16b, v8.16b 398 eor v1.16b, v1.16b, v16.16b 399 eor v5.16b, v5.16b, v18.16b 400 eor v3.16b, v3.16b, v4.16b 401 eor v7.16b, 
v20.16b, v7.16b 402 eor v6.16b, v2.16b, v19.16b 403 eor v4.16b, v21.16b, v17.16b 404 eor v2.16b, v10.16b, v9.16b 405 bne .Ldec_loop 406 ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0) 407 b .Ldec_loop 408.align 4 409.Ldec_done: 410 ushr v8.2d, v0.2d, #1 411 movi v9.16b, #0x55 412 ldr q10, [x9] 413 ushr v16.2d, v2.2d, #1 414 movi v17.16b, #0x33 415 ushr v18.2d, v6.2d, #1 416 movi v19.16b, #0x0f 417 eor v8.16b, v8.16b, v1.16b 418 ushr v20.2d, v3.2d, #1 419 eor v16.16b, v16.16b, v7.16b 420 eor v18.16b, v18.16b, v4.16b 421 and v8.16b, v8.16b, v9.16b 422 eor v20.16b, v20.16b, v5.16b 423 and v16.16b, v16.16b, v9.16b 424 and v18.16b, v18.16b, v9.16b 425 shl v21.2d, v8.2d, #1 426 eor v1.16b, v1.16b, v8.16b 427 and v8.16b, v20.16b, v9.16b 428 eor v7.16b, v7.16b, v16.16b 429 shl v9.2d, v16.2d, #1 430 eor v4.16b, v4.16b, v18.16b 431 shl v16.2d, v18.2d, #1 432 eor v0.16b, v0.16b, v21.16b 433 shl v18.2d, v8.2d, #1 434 eor v5.16b, v5.16b, v8.16b 435 eor v2.16b, v2.16b, v9.16b 436 eor v6.16b, v6.16b, v16.16b 437 ushr v8.2d, v1.2d, #2 438 eor v3.16b, v3.16b, v18.16b 439 ushr v9.2d, v0.2d, #2 440 ushr v16.2d, v7.2d, #2 441 ushr v18.2d, v2.2d, #2 442 eor v8.16b, v8.16b, v4.16b 443 eor v9.16b, v9.16b, v6.16b 444 eor v16.16b, v16.16b, v5.16b 445 eor v18.16b, v18.16b, v3.16b 446 and v8.16b, v8.16b, v17.16b 447 and v9.16b, v9.16b, v17.16b 448 and v16.16b, v16.16b, v17.16b 449 and v17.16b, v18.16b, v17.16b 450 eor v4.16b, v4.16b, v8.16b 451 shl v8.2d, v8.2d, #2 452 eor v6.16b, v6.16b, v9.16b 453 shl v9.2d, v9.2d, #2 454 eor v5.16b, v5.16b, v16.16b 455 shl v16.2d, v16.2d, #2 456 eor v3.16b, v3.16b, v17.16b 457 shl v17.2d, v17.2d, #2 458 eor v1.16b, v1.16b, v8.16b 459 eor v0.16b, v0.16b, v9.16b 460 eor v7.16b, v7.16b, v16.16b 461 eor v2.16b, v2.16b, v17.16b 462 ushr v8.2d, v4.2d, #4 463 ushr v9.2d, v6.2d, #4 464 ushr v16.2d, v1.2d, #4 465 ushr v17.2d, v0.2d, #4 466 eor v8.16b, v8.16b, v5.16b 467 eor v9.16b, v9.16b, v3.16b 468 eor v16.16b, v16.16b, v7.16b 469 eor v17.16b, v17.16b, v2.16b 470 and v8.16b, v8.16b, v19.16b 471 and v9.16b, v9.16b, v19.16b 472 and v16.16b, v16.16b, v19.16b 473 and v17.16b, v17.16b, v19.16b 474 eor v5.16b, v5.16b, v8.16b 475 shl v8.2d, v8.2d, #4 476 eor v3.16b, v3.16b, v9.16b 477 shl v9.2d, v9.2d, #4 478 eor v7.16b, v7.16b, v16.16b 479 shl v16.2d, v16.2d, #4 480 eor v2.16b, v2.16b, v17.16b 481 shl v17.2d, v17.2d, #4 482 eor v4.16b, v4.16b, v8.16b 483 eor v6.16b, v6.16b, v9.16b 484 eor v7.16b, v7.16b, v10.16b 485 eor v1.16b, v1.16b, v16.16b 486 eor v2.16b, v2.16b, v10.16b 487 eor v0.16b, v0.16b, v17.16b 488 eor v4.16b, v4.16b, v10.16b 489 eor v6.16b, v6.16b, v10.16b 490 eor v3.16b, v3.16b, v10.16b 491 eor v5.16b, v5.16b, v10.16b 492 eor v1.16b, v1.16b, v10.16b 493 eor v0.16b, v0.16b, v10.16b 494 ret 495.size _bsaes_decrypt8,.-_bsaes_decrypt8 496 497.type _bsaes_const,%object 498.align 6 499_bsaes_const: 500// InvShiftRows constants 501// Used in _bsaes_decrypt8, which assumes contiguity 502// .LM0ISR used with round 0 key 503// .LISR used with middle round keys 504// .LISRM0 used with final round key 505.LM0ISR: 506.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 507.LISR: 508.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 509.LISRM0: 510.quad 0x01040b0e0205080f, 0x0306090c00070a0d 511 512// ShiftRows constants 513// Used in _bsaes_encrypt8, which assumes contiguity 514// .LM0SR used with round 0 key 515// .LSR used with middle round keys 516// .LSRM0 used with final round key 517.LM0SR: 518.quad 0x0a0e02060f03070b, 0x0004080c05090d01 519.LSR: 520.quad 0x0504070600030201, 
0x0f0e0d0c0a09080b 521.LSRM0: 522.quad 0x0304090e00050a0f, 0x01060b0c0207080d 523 524.LM0_bigendian: 525.quad 0x02060a0e03070b0f, 0x0004080c0105090d 526.LM0_littleendian: 527.quad 0x0105090d0004080c, 0x03070b0f02060a0e 528 529// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into 530// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR 531.LREVM0SR: 532.quad 0x090d01050c000408, 0x03070b0f060a0e02 533 534.align 6 535.size _bsaes_const,.-_bsaes_const 536 537.type _bsaes_encrypt8,%function 538.align 4 539// On entry: 540// x9 -> key (previously expanded using _bsaes_key_convert) 541// x10 = number of rounds 542// v0-v7 input data 543// On exit: 544// x9-x11 corrupted 545// other general-purpose registers preserved 546// v0-v7 output data 547// v11-v15 preserved 548// other SIMD registers corrupted 549_bsaes_encrypt8: 550 ldr q8, [x9], #16 551 adr x11, .LM0SR 552 ldr q9, [x11], #16 553_bsaes_encrypt8_alt: 554 eor v0.16b, v0.16b, v8.16b 555 eor v1.16b, v1.16b, v8.16b 556 sub x10, x10, #1 557 eor v2.16b, v2.16b, v8.16b 558 eor v4.16b, v4.16b, v8.16b 559 eor v3.16b, v3.16b, v8.16b 560 eor v5.16b, v5.16b, v8.16b 561 tbl v0.16b, {v0.16b}, v9.16b 562 tbl v1.16b, {v1.16b}, v9.16b 563 tbl v2.16b, {v2.16b}, v9.16b 564 tbl v4.16b, {v4.16b}, v9.16b 565 eor v6.16b, v6.16b, v8.16b 566 eor v7.16b, v7.16b, v8.16b 567 tbl v3.16b, {v3.16b}, v9.16b 568 tbl v5.16b, {v5.16b}, v9.16b 569 tbl v6.16b, {v6.16b}, v9.16b 570 ushr v8.2d, v0.2d, #1 571 movi v10.16b, #0x55 572 tbl v7.16b, {v7.16b}, v9.16b 573 ushr v9.2d, v4.2d, #1 574 movi v16.16b, #0x33 575 ushr v17.2d, v2.2d, #1 576 eor v8.16b, v8.16b, v1.16b 577 movi v18.16b, #0x0f 578 ushr v19.2d, v6.2d, #1 579 eor v9.16b, v9.16b, v5.16b 580 eor v17.16b, v17.16b, v3.16b 581 and v8.16b, v8.16b, v10.16b 582 eor v19.16b, v19.16b, v7.16b 583 and v9.16b, v9.16b, v10.16b 584 and v17.16b, v17.16b, v10.16b 585 eor v1.16b, v1.16b, v8.16b 586 shl v8.2d, v8.2d, #1 587 and v10.16b, v19.16b, v10.16b 588 eor v5.16b, v5.16b, v9.16b 589 shl v9.2d, v9.2d, #1 590 eor v3.16b, v3.16b, v17.16b 591 shl v17.2d, v17.2d, #1 592 eor v0.16b, v0.16b, v8.16b 593 shl v8.2d, v10.2d, #1 594 eor v7.16b, v7.16b, v10.16b 595 eor v4.16b, v4.16b, v9.16b 596 eor v2.16b, v2.16b, v17.16b 597 ushr v9.2d, v1.2d, #2 598 eor v6.16b, v6.16b, v8.16b 599 ushr v8.2d, v0.2d, #2 600 ushr v10.2d, v5.2d, #2 601 ushr v17.2d, v4.2d, #2 602 eor v9.16b, v9.16b, v3.16b 603 eor v8.16b, v8.16b, v2.16b 604 eor v10.16b, v10.16b, v7.16b 605 eor v17.16b, v17.16b, v6.16b 606 and v9.16b, v9.16b, v16.16b 607 and v8.16b, v8.16b, v16.16b 608 and v10.16b, v10.16b, v16.16b 609 and v16.16b, v17.16b, v16.16b 610 eor v3.16b, v3.16b, v9.16b 611 shl v9.2d, v9.2d, #2 612 eor v2.16b, v2.16b, v8.16b 613 shl v8.2d, v8.2d, #2 614 eor v7.16b, v7.16b, v10.16b 615 shl v10.2d, v10.2d, #2 616 eor v6.16b, v6.16b, v16.16b 617 shl v16.2d, v16.2d, #2 618 eor v1.16b, v1.16b, v9.16b 619 eor v0.16b, v0.16b, v8.16b 620 eor v5.16b, v5.16b, v10.16b 621 eor v4.16b, v4.16b, v16.16b 622 ushr v8.2d, v3.2d, #4 623 ushr v9.2d, v2.2d, #4 624 ushr v10.2d, v1.2d, #4 625 ushr v16.2d, v0.2d, #4 626 eor v8.16b, v8.16b, v7.16b 627 eor v9.16b, v9.16b, v6.16b 628 eor v10.16b, v10.16b, v5.16b 629 eor v16.16b, v16.16b, v4.16b 630 and v8.16b, v8.16b, v18.16b 631 and v9.16b, v9.16b, v18.16b 632 and v10.16b, v10.16b, v18.16b 633 and v16.16b, v16.16b, v18.16b 634 eor v7.16b, v7.16b, v8.16b 635 shl v8.2d, v8.2d, #4 636 eor v6.16b, v6.16b, v9.16b 637 shl v9.2d, v9.2d, #4 638 eor v5.16b, v5.16b, v10.16b 639 shl v10.2d, v10.2d, #4 640 eor v4.16b, v4.16b, v16.16b 641 
shl v16.2d, v16.2d, #4 642 eor v3.16b, v3.16b, v8.16b 643 eor v2.16b, v2.16b, v9.16b 644 eor v1.16b, v1.16b, v10.16b 645 eor v0.16b, v0.16b, v16.16b 646 b .Lenc_sbox 647.align 4 648.Lenc_loop: 649 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 650 ldp q8, q9, [x9], #32 651 eor v0.16b, v16.16b, v0.16b 652 ldr q10, [x9], #16 653 eor v1.16b, v17.16b, v1.16b 654 ldr q16, [x9], #16 655 eor v2.16b, v18.16b, v2.16b 656 eor v3.16b, v19.16b, v3.16b 657 eor v4.16b, v8.16b, v4.16b 658 eor v5.16b, v9.16b, v5.16b 659 eor v6.16b, v10.16b, v6.16b 660 eor v7.16b, v16.16b, v7.16b 661 tbl v0.16b, {v0.16b}, v28.16b 662 tbl v1.16b, {v1.16b}, v28.16b 663 tbl v2.16b, {v2.16b}, v28.16b 664 tbl v3.16b, {v3.16b}, v28.16b 665 tbl v4.16b, {v4.16b}, v28.16b 666 tbl v5.16b, {v5.16b}, v28.16b 667 tbl v6.16b, {v6.16b}, v28.16b 668 tbl v7.16b, {v7.16b}, v28.16b 669.Lenc_sbox: 670 eor v5.16b, v5.16b, v6.16b 671 eor v3.16b, v3.16b, v0.16b 672 subs x10, x10, #1 673 eor v2.16b, v2.16b, v1.16b 674 eor v5.16b, v5.16b, v0.16b 675 eor v8.16b, v3.16b, v7.16b 676 eor v6.16b, v6.16b, v2.16b 677 eor v7.16b, v7.16b, v5.16b 678 eor v8.16b, v8.16b, v4.16b 679 eor v3.16b, v6.16b, v3.16b 680 eor v4.16b, v4.16b, v5.16b 681 eor v6.16b, v1.16b, v5.16b 682 eor v2.16b, v2.16b, v7.16b 683 eor v1.16b, v8.16b, v1.16b 684 eor v8.16b, v7.16b, v4.16b 685 eor v9.16b, v3.16b, v0.16b 686 eor v10.16b, v7.16b, v6.16b 687 eor v16.16b, v5.16b, v3.16b 688 eor v17.16b, v6.16b, v2.16b 689 eor v18.16b, v5.16b, v1.16b 690 eor v19.16b, v2.16b, v4.16b 691 eor v20.16b, v1.16b, v0.16b 692 orr v21.16b, v8.16b, v9.16b 693 orr v22.16b, v10.16b, v16.16b 694 eor v23.16b, v8.16b, v17.16b 695 eor v24.16b, v9.16b, v18.16b 696 and v19.16b, v19.16b, v20.16b 697 orr v20.16b, v17.16b, v18.16b 698 and v8.16b, v8.16b, v9.16b 699 and v9.16b, v17.16b, v18.16b 700 and v17.16b, v23.16b, v24.16b 701 and v10.16b, v10.16b, v16.16b 702 eor v16.16b, v21.16b, v19.16b 703 eor v18.16b, v20.16b, v19.16b 704 and v19.16b, v2.16b, v1.16b 705 and v20.16b, v6.16b, v5.16b 706 eor v21.16b, v22.16b, v17.16b 707 eor v9.16b, v9.16b, v10.16b 708 eor v10.16b, v16.16b, v17.16b 709 eor v16.16b, v18.16b, v8.16b 710 and v17.16b, v4.16b, v0.16b 711 orr v18.16b, v7.16b, v3.16b 712 eor v21.16b, v21.16b, v8.16b 713 eor v8.16b, v9.16b, v8.16b 714 eor v9.16b, v10.16b, v19.16b 715 eor v10.16b, v3.16b, v0.16b 716 eor v16.16b, v16.16b, v17.16b 717 eor v17.16b, v5.16b, v1.16b 718 eor v19.16b, v21.16b, v20.16b 719 eor v20.16b, v8.16b, v18.16b 720 eor v8.16b, v8.16b, v18.16b 721 eor v18.16b, v7.16b, v4.16b 722 eor v21.16b, v9.16b, v16.16b 723 eor v22.16b, v6.16b, v2.16b 724 and v23.16b, v9.16b, v19.16b 725 eor v24.16b, v10.16b, v17.16b 726 eor v25.16b, v0.16b, v1.16b 727 eor v26.16b, v7.16b, v6.16b 728 eor v27.16b, v18.16b, v22.16b 729 eor v28.16b, v3.16b, v5.16b 730 eor v29.16b, v16.16b, v23.16b 731 eor v30.16b, v20.16b, v23.16b 732 eor v23.16b, v20.16b, v23.16b 733 eor v31.16b, v4.16b, v2.16b 734 bsl v29.16b, v19.16b, v20.16b 735 bsl v30.16b, v9.16b, v16.16b 736 bsl v8.16b, v29.16b, v23.16b 737 bsl v20.16b, v23.16b, v29.16b 738 eor v9.16b, v30.16b, v29.16b 739 and v5.16b, v5.16b, v30.16b 740 and v8.16b, v8.16b, v30.16b 741 and v1.16b, v1.16b, v29.16b 742 eor v16.16b, v19.16b, v20.16b 743 and v2.16b, v2.16b, v29.16b 744 eor v19.16b, v9.16b, v29.16b 745 and v17.16b, v17.16b, v9.16b 746 eor v8.16b, v8.16b, v21.16b 747 and v20.16b, v22.16b, v9.16b 748 eor v21.16b, v29.16b, v16.16b 749 eor v22.16b, v29.16b, v16.16b 750 and v23.16b, v25.16b, v16.16b 751 and v6.16b, v6.16b, v19.16b 752 eor v25.16b, v8.16b, v16.16b 753 
eor v29.16b, v30.16b, v8.16b 754 and v4.16b, v21.16b, v4.16b 755 and v8.16b, v28.16b, v8.16b 756 and v0.16b, v22.16b, v0.16b 757 eor v21.16b, v23.16b, v1.16b 758 eor v22.16b, v9.16b, v25.16b 759 eor v9.16b, v9.16b, v25.16b 760 eor v23.16b, v25.16b, v16.16b 761 and v3.16b, v29.16b, v3.16b 762 and v24.16b, v24.16b, v25.16b 763 and v25.16b, v27.16b, v25.16b 764 and v10.16b, v22.16b, v10.16b 765 and v9.16b, v9.16b, v18.16b 766 eor v18.16b, v19.16b, v23.16b 767 and v19.16b, v26.16b, v23.16b 768 eor v3.16b, v5.16b, v3.16b 769 eor v17.16b, v17.16b, v24.16b 770 eor v10.16b, v24.16b, v10.16b 771 and v16.16b, v31.16b, v16.16b 772 eor v20.16b, v20.16b, v25.16b 773 eor v9.16b, v25.16b, v9.16b 774 eor v4.16b, v2.16b, v4.16b 775 and v7.16b, v18.16b, v7.16b 776 eor v18.16b, v19.16b, v6.16b 777 eor v5.16b, v8.16b, v5.16b 778 eor v0.16b, v1.16b, v0.16b 779 eor v1.16b, v21.16b, v10.16b 780 eor v8.16b, v3.16b, v17.16b 781 eor v2.16b, v16.16b, v2.16b 782 eor v3.16b, v6.16b, v7.16b 783 eor v6.16b, v18.16b, v9.16b 784 eor v4.16b, v4.16b, v20.16b 785 eor v10.16b, v5.16b, v10.16b 786 eor v0.16b, v0.16b, v17.16b 787 eor v9.16b, v2.16b, v9.16b 788 eor v3.16b, v3.16b, v20.16b 789 eor v7.16b, v6.16b, v1.16b 790 eor v5.16b, v8.16b, v4.16b 791 eor v6.16b, v10.16b, v1.16b 792 eor v2.16b, v4.16b, v0.16b 793 eor v4.16b, v3.16b, v10.16b 794 eor v9.16b, v9.16b, v7.16b 795 eor v3.16b, v0.16b, v5.16b 796 eor v0.16b, v1.16b, v4.16b 797 eor v1.16b, v4.16b, v8.16b 798 eor v4.16b, v9.16b, v5.16b 799 eor v6.16b, v6.16b, v3.16b 800 bcc .Lenc_done 801 ext v8.16b, v0.16b, v0.16b, #12 802 ext v9.16b, v4.16b, v4.16b, #12 803 ldr q28, [x11] 804 ext v10.16b, v6.16b, v6.16b, #12 805 ext v16.16b, v1.16b, v1.16b, #12 806 ext v17.16b, v3.16b, v3.16b, #12 807 ext v18.16b, v7.16b, v7.16b, #12 808 eor v0.16b, v0.16b, v8.16b 809 eor v4.16b, v4.16b, v9.16b 810 eor v6.16b, v6.16b, v10.16b 811 ext v19.16b, v2.16b, v2.16b, #12 812 ext v20.16b, v5.16b, v5.16b, #12 813 eor v1.16b, v1.16b, v16.16b 814 eor v3.16b, v3.16b, v17.16b 815 eor v7.16b, v7.16b, v18.16b 816 eor v2.16b, v2.16b, v19.16b 817 eor v16.16b, v16.16b, v0.16b 818 eor v5.16b, v5.16b, v20.16b 819 eor v17.16b, v17.16b, v6.16b 820 eor v10.16b, v10.16b, v4.16b 821 ext v0.16b, v0.16b, v0.16b, #8 822 eor v9.16b, v9.16b, v1.16b 823 ext v1.16b, v1.16b, v1.16b, #8 824 eor v8.16b, v8.16b, v5.16b 825 eor v16.16b, v16.16b, v5.16b 826 eor v18.16b, v18.16b, v3.16b 827 eor v19.16b, v19.16b, v7.16b 828 ext v3.16b, v3.16b, v3.16b, #8 829 ext v7.16b, v7.16b, v7.16b, #8 830 eor v20.16b, v20.16b, v2.16b 831 ext v6.16b, v6.16b, v6.16b, #8 832 ext v21.16b, v5.16b, v5.16b, #8 833 eor v17.16b, v17.16b, v5.16b 834 ext v2.16b, v2.16b, v2.16b, #8 835 eor v10.16b, v10.16b, v5.16b 836 ext v22.16b, v4.16b, v4.16b, #8 837 eor v0.16b, v0.16b, v8.16b 838 eor v1.16b, v1.16b, v16.16b 839 eor v5.16b, v7.16b, v18.16b 840 eor v4.16b, v3.16b, v17.16b 841 eor v3.16b, v6.16b, v10.16b 842 eor v7.16b, v21.16b, v20.16b 843 eor v6.16b, v2.16b, v19.16b 844 eor v2.16b, v22.16b, v9.16b 845 bne .Lenc_loop 846 ldr q28, [x11, #16]! 
// load from .LSRM0 on last round (x10 == 0) 847 b .Lenc_loop 848.align 4 849.Lenc_done: 850 ushr v8.2d, v0.2d, #1 851 movi v9.16b, #0x55 852 ldr q10, [x9] 853 ushr v16.2d, v3.2d, #1 854 movi v17.16b, #0x33 855 ushr v18.2d, v4.2d, #1 856 movi v19.16b, #0x0f 857 eor v8.16b, v8.16b, v1.16b 858 ushr v20.2d, v2.2d, #1 859 eor v16.16b, v16.16b, v7.16b 860 eor v18.16b, v18.16b, v6.16b 861 and v8.16b, v8.16b, v9.16b 862 eor v20.16b, v20.16b, v5.16b 863 and v16.16b, v16.16b, v9.16b 864 and v18.16b, v18.16b, v9.16b 865 shl v21.2d, v8.2d, #1 866 eor v1.16b, v1.16b, v8.16b 867 and v8.16b, v20.16b, v9.16b 868 eor v7.16b, v7.16b, v16.16b 869 shl v9.2d, v16.2d, #1 870 eor v6.16b, v6.16b, v18.16b 871 shl v16.2d, v18.2d, #1 872 eor v0.16b, v0.16b, v21.16b 873 shl v18.2d, v8.2d, #1 874 eor v5.16b, v5.16b, v8.16b 875 eor v3.16b, v3.16b, v9.16b 876 eor v4.16b, v4.16b, v16.16b 877 ushr v8.2d, v1.2d, #2 878 eor v2.16b, v2.16b, v18.16b 879 ushr v9.2d, v0.2d, #2 880 ushr v16.2d, v7.2d, #2 881 ushr v18.2d, v3.2d, #2 882 eor v8.16b, v8.16b, v6.16b 883 eor v9.16b, v9.16b, v4.16b 884 eor v16.16b, v16.16b, v5.16b 885 eor v18.16b, v18.16b, v2.16b 886 and v8.16b, v8.16b, v17.16b 887 and v9.16b, v9.16b, v17.16b 888 and v16.16b, v16.16b, v17.16b 889 and v17.16b, v18.16b, v17.16b 890 eor v6.16b, v6.16b, v8.16b 891 shl v8.2d, v8.2d, #2 892 eor v4.16b, v4.16b, v9.16b 893 shl v9.2d, v9.2d, #2 894 eor v5.16b, v5.16b, v16.16b 895 shl v16.2d, v16.2d, #2 896 eor v2.16b, v2.16b, v17.16b 897 shl v17.2d, v17.2d, #2 898 eor v1.16b, v1.16b, v8.16b 899 eor v0.16b, v0.16b, v9.16b 900 eor v7.16b, v7.16b, v16.16b 901 eor v3.16b, v3.16b, v17.16b 902 ushr v8.2d, v6.2d, #4 903 ushr v9.2d, v4.2d, #4 904 ushr v16.2d, v1.2d, #4 905 ushr v17.2d, v0.2d, #4 906 eor v8.16b, v8.16b, v5.16b 907 eor v9.16b, v9.16b, v2.16b 908 eor v16.16b, v16.16b, v7.16b 909 eor v17.16b, v17.16b, v3.16b 910 and v8.16b, v8.16b, v19.16b 911 and v9.16b, v9.16b, v19.16b 912 and v16.16b, v16.16b, v19.16b 913 and v17.16b, v17.16b, v19.16b 914 eor v5.16b, v5.16b, v8.16b 915 shl v8.2d, v8.2d, #4 916 eor v2.16b, v2.16b, v9.16b 917 shl v9.2d, v9.2d, #4 918 eor v7.16b, v7.16b, v16.16b 919 shl v16.2d, v16.2d, #4 920 eor v3.16b, v3.16b, v17.16b 921 shl v17.2d, v17.2d, #4 922 eor v6.16b, v6.16b, v8.16b 923 eor v4.16b, v4.16b, v9.16b 924 eor v7.16b, v7.16b, v10.16b 925 eor v1.16b, v1.16b, v16.16b 926 eor v3.16b, v3.16b, v10.16b 927 eor v0.16b, v0.16b, v17.16b 928 eor v6.16b, v6.16b, v10.16b 929 eor v4.16b, v4.16b, v10.16b 930 eor v2.16b, v2.16b, v10.16b 931 eor v5.16b, v5.16b, v10.16b 932 eor v1.16b, v1.16b, v10.16b 933 eor v0.16b, v0.16b, v10.16b 934 ret 935.size _bsaes_encrypt8,.-_bsaes_encrypt8 936 937.type _bsaes_key_convert,%function 938.align 4 939// On entry: 940// x9 -> input key (big-endian) 941// x10 = number of rounds 942// x17 -> output key (native endianness) 943// On exit: 944// x9, x10 corrupted 945// x11 -> .LM0_bigendian 946// x17 -> last quadword of output key 947// other general-purpose registers preserved 948// v2-v6 preserved 949// v7.16b[] = 0x63 950// v8-v14 preserved 951// v15 = last round key (converted to native endianness) 952// other SIMD registers corrupted 953_bsaes_key_convert: 954#ifdef __AARCH64EL__ 955 adr x11, .LM0_littleendian 956#else 957 adr x11, .LM0_bigendian 958#endif 959 ldr q0, [x9], #16 // load round 0 key 960 ldr q1, [x11] // .LM0 961 ldr q15, [x9], #16 // load round 1 key 962 963 movi v7.16b, #0x63 // compose .L63 964 movi v16.16b, #0x01 // bit masks 965 movi v17.16b, #0x02 966 movi v18.16b, #0x04 967 movi v19.16b, #0x08 968 movi 
v20.16b, #0x10 969 movi v21.16b, #0x20 970 movi v22.16b, #0x40 971 movi v23.16b, #0x80 972 973#ifdef __AARCH64EL__ 974 rev32 v0.16b, v0.16b 975#endif 976 sub x10, x10, #1 977 str q0, [x17], #16 // save round 0 key 978 979.align 4 980.Lkey_loop: 981 tbl v0.16b, {v15.16b}, v1.16b 982 ldr q15, [x9], #16 // load next round key 983 984 eor v0.16b, v0.16b, v7.16b 985 cmtst v24.16b, v0.16b, v16.16b 986 cmtst v25.16b, v0.16b, v17.16b 987 cmtst v26.16b, v0.16b, v18.16b 988 cmtst v27.16b, v0.16b, v19.16b 989 cmtst v28.16b, v0.16b, v20.16b 990 cmtst v29.16b, v0.16b, v21.16b 991 cmtst v30.16b, v0.16b, v22.16b 992 cmtst v31.16b, v0.16b, v23.16b 993 sub x10, x10, #1 994 st1 {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key 995 st1 {v28.16b-v31.16b}, [x17], #64 996 cbnz x10, .Lkey_loop 997 998 // don't save last round key 999#ifdef __AARCH64EL__ 1000 rev32 v15.16b, v15.16b 1001 adr x11, .LM0_bigendian 1002#endif 1003 ret 1004.size _bsaes_key_convert,.-_bsaes_key_convert 1005 1006.globl ossl_bsaes_cbc_encrypt 1007.type ossl_bsaes_cbc_encrypt,%function 1008.align 4 1009// On entry: 1010// x0 -> input ciphertext 1011// x1 -> output plaintext 1012// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) 1013// x3 -> key 1014// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) 1015// w5 must be == 0 1016// On exit: 1017// Output plaintext filled in 1018// Initialisation vector overwritten with last quadword of ciphertext 1019// No output registers, usual AAPCS64 register preservation 1020ossl_bsaes_cbc_encrypt: 1021 cmp x2, #128 1022 bhs .Lcbc_do_bsaes 1023 b AES_cbc_encrypt 1024.Lcbc_do_bsaes: 1025 1026 // it is up to the caller to make sure we are called with enc == 0 1027 1028 stp x29, x30, [sp, #-48]! 
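	// AAPCS64 requires the low 64 bits of v8-v15 to be preserved; this
	// path clobbers v8-v10 and keeps the IV in v15 across
	// _bsaes_decrypt8 (which preserves v11-v15), so only d8-d10 and
	// d15 are saved here.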
1029 stp d8, d9, [sp, #16] 1030 stp d10, d15, [sp, #32] 1031 lsr x2, x2, #4 // len in 16 byte blocks 1032 1033 ldr w15, [x3, #240] // get # of rounds 1034 mov x14, sp 1035 1036 // allocate the key schedule on the stack 1037 add x17, sp, #96 1038 sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes 1039 1040 // populate the key schedule 1041 mov x9, x3 // pass key 1042 mov x10, x15 // pass # of rounds 1043 mov sp, x17 // sp is sp 1044 bl _bsaes_key_convert 1045 ldr q6, [sp] 1046 str q15, [x17] // save last round key 1047 eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) 1048 str q6, [sp] 1049 1050 ldr q15, [x4] // load IV 1051 b .Lcbc_dec_loop 1052 1053.align 4 1054.Lcbc_dec_loop: 1055 subs x2, x2, #0x8 1056 bmi .Lcbc_dec_loop_finish 1057 1058 ldr q0, [x0], #16 // load input 1059 mov x9, sp // pass the key 1060 ldr q1, [x0], #16 1061 mov x10, x15 1062 ldr q2, [x0], #16 1063 ldr q3, [x0], #16 1064 ldr q4, [x0], #16 1065 ldr q5, [x0], #16 1066 ldr q6, [x0], #16 1067 ldr q7, [x0], #-7*16 1068 1069 bl _bsaes_decrypt8 1070 1071 ldr q16, [x0], #16 // reload input 1072 eor v0.16b, v0.16b, v15.16b // ^= IV 1073 eor v1.16b, v1.16b, v16.16b 1074 str q0, [x1], #16 // write output 1075 ldr q0, [x0], #16 1076 str q1, [x1], #16 1077 ldr q1, [x0], #16 1078 eor v1.16b, v4.16b, v1.16b 1079 ldr q4, [x0], #16 1080 eor v2.16b, v2.16b, v4.16b 1081 eor v0.16b, v6.16b, v0.16b 1082 ldr q4, [x0], #16 1083 str q0, [x1], #16 1084 str q1, [x1], #16 1085 eor v0.16b, v7.16b, v4.16b 1086 ldr q1, [x0], #16 1087 str q2, [x1], #16 1088 ldr q2, [x0], #16 1089 ldr q15, [x0], #16 1090 str q0, [x1], #16 1091 eor v0.16b, v5.16b, v2.16b 1092 eor v1.16b, v3.16b, v1.16b 1093 str q1, [x1], #16 1094 str q0, [x1], #16 1095 1096 b .Lcbc_dec_loop 1097 1098.Lcbc_dec_loop_finish: 1099 adds x2, x2, #8 1100 beq .Lcbc_dec_done 1101 1102 ldr q0, [x0], #16 // load input 1103 cmp x2, #2 1104 blo .Lcbc_dec_one 1105 ldr q1, [x0], #16 1106 mov x9, sp // pass the key 1107 mov x10, x15 1108 beq .Lcbc_dec_two 1109 ldr q2, [x0], #16 1110 cmp x2, #4 1111 blo .Lcbc_dec_three 1112 ldr q3, [x0], #16 1113 beq .Lcbc_dec_four 1114 ldr q4, [x0], #16 1115 cmp x2, #6 1116 blo .Lcbc_dec_five 1117 ldr q5, [x0], #16 1118 beq .Lcbc_dec_six 1119 ldr q6, [x0], #-6*16 1120 1121 bl _bsaes_decrypt8 1122 1123 ldr q5, [x0], #16 // reload input 1124 eor v0.16b, v0.16b, v15.16b // ^= IV 1125 ldr q8, [x0], #16 1126 ldr q9, [x0], #16 1127 ldr q10, [x0], #16 1128 str q0, [x1], #16 // write output 1129 ldr q0, [x0], #16 1130 eor v1.16b, v1.16b, v5.16b 1131 ldr q5, [x0], #16 1132 eor v6.16b, v6.16b, v8.16b 1133 ldr q15, [x0] 1134 eor v4.16b, v4.16b, v9.16b 1135 eor v2.16b, v2.16b, v10.16b 1136 str q1, [x1], #16 1137 eor v0.16b, v7.16b, v0.16b 1138 str q6, [x1], #16 1139 eor v1.16b, v3.16b, v5.16b 1140 str q4, [x1], #16 1141 str q2, [x1], #16 1142 str q0, [x1], #16 1143 str q1, [x1] 1144 b .Lcbc_dec_done 1145.align 4 1146.Lcbc_dec_six: 1147 sub x0, x0, #0x60 1148 bl _bsaes_decrypt8 1149 ldr q3, [x0], #16 // reload input 1150 eor v0.16b, v0.16b, v15.16b // ^= IV 1151 ldr q5, [x0], #16 1152 ldr q8, [x0], #16 1153 ldr q9, [x0], #16 1154 str q0, [x1], #16 // write output 1155 ldr q0, [x0], #16 1156 eor v1.16b, v1.16b, v3.16b 1157 ldr q15, [x0] 1158 eor v3.16b, v6.16b, v5.16b 1159 eor v4.16b, v4.16b, v8.16b 1160 eor v2.16b, v2.16b, v9.16b 1161 str q1, [x1], #16 1162 eor v0.16b, v7.16b, v0.16b 1163 str q3, [x1], #16 1164 str q4, [x1], #16 1165 str q2, [x1], #16 1166 str q0, [x1] 1167 b .Lcbc_dec_done 1168.align 4 1169.Lcbc_dec_five: 1170 sub 
x0, x0, #0x50 1171 bl _bsaes_decrypt8 1172 ldr q3, [x0], #16 // reload input 1173 eor v0.16b, v0.16b, v15.16b // ^= IV 1174 ldr q5, [x0], #16 1175 ldr q7, [x0], #16 1176 ldr q8, [x0], #16 1177 str q0, [x1], #16 // write output 1178 ldr q15, [x0] 1179 eor v0.16b, v1.16b, v3.16b 1180 eor v1.16b, v6.16b, v5.16b 1181 eor v3.16b, v4.16b, v7.16b 1182 str q0, [x1], #16 1183 eor v0.16b, v2.16b, v8.16b 1184 str q1, [x1], #16 1185 str q3, [x1], #16 1186 str q0, [x1] 1187 b .Lcbc_dec_done 1188.align 4 1189.Lcbc_dec_four: 1190 sub x0, x0, #0x40 1191 bl _bsaes_decrypt8 1192 ldr q2, [x0], #16 // reload input 1193 eor v0.16b, v0.16b, v15.16b // ^= IV 1194 ldr q3, [x0], #16 1195 ldr q5, [x0], #16 1196 str q0, [x1], #16 // write output 1197 ldr q15, [x0] 1198 eor v0.16b, v1.16b, v2.16b 1199 eor v1.16b, v6.16b, v3.16b 1200 eor v2.16b, v4.16b, v5.16b 1201 str q0, [x1], #16 1202 str q1, [x1], #16 1203 str q2, [x1] 1204 b .Lcbc_dec_done 1205.align 4 1206.Lcbc_dec_three: 1207 sub x0, x0, #0x30 1208 bl _bsaes_decrypt8 1209 ldr q2, [x0], #16 // reload input 1210 eor v0.16b, v0.16b, v15.16b // ^= IV 1211 ldr q3, [x0], #16 1212 ldr q15, [x0] 1213 str q0, [x1], #16 // write output 1214 eor v0.16b, v1.16b, v2.16b 1215 eor v1.16b, v6.16b, v3.16b 1216 str q0, [x1], #16 1217 str q1, [x1] 1218 b .Lcbc_dec_done 1219.align 4 1220.Lcbc_dec_two: 1221 sub x0, x0, #0x20 1222 bl _bsaes_decrypt8 1223 ldr q2, [x0], #16 // reload input 1224 eor v0.16b, v0.16b, v15.16b // ^= IV 1225 ldr q15, [x0] 1226 str q0, [x1], #16 // write output 1227 eor v0.16b, v1.16b, v2.16b 1228 str q0, [x1] 1229 b .Lcbc_dec_done 1230.align 4 1231.Lcbc_dec_one: 1232 sub x0, x0, #0x10 1233 stp x1, x4, [sp, #-32]! 1234 str x14, [sp, #16] 1235 mov v8.16b, v15.16b 1236 mov v15.16b, v0.16b 1237 mov x2, x3 1238 bl AES_decrypt 1239 ldr x14, [sp, #16] 1240 ldp x1, x4, [sp], #32 1241 ldr q0, [x1] // load result 1242 eor v0.16b, v0.16b, v8.16b // ^= IV 1243 str q0, [x1] // write output 1244 1245.align 4 1246.Lcbc_dec_done: 1247 movi v0.16b, #0 1248 movi v1.16b, #0 1249.Lcbc_dec_bzero:// wipe key schedule [if any] 1250 stp q0, q1, [sp], #32 1251 cmp sp, x14 1252 bne .Lcbc_dec_bzero 1253 str q15, [x4] // return IV 1254 ldp d8, d9, [sp, #16] 1255 ldp d10, d15, [sp, #32] 1256 ldp x29, x30, [sp], #48 1257 ret 1258.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt 1259 1260.globl ossl_bsaes_ctr32_encrypt_blocks 1261.type ossl_bsaes_ctr32_encrypt_blocks,%function 1262.align 4 1263// On entry: 1264// x0 -> input text (whole 16-byte blocks) 1265// x1 -> output text (whole 16-byte blocks) 1266// x2 = number of 16-byte blocks to encrypt/decrypt (> 0) 1267// x3 -> key 1268// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block 1269// On exit: 1270// Output text filled in 1271// No output registers, usual AAPCS64 register preservation 1272ossl_bsaes_ctr32_encrypt_blocks: 1273 1274 cmp x2, #8 // use plain AES for 1275 blo .Lctr_enc_short // small sizes 1276 1277 stp x29, x30, [sp, #-80]! 
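	// The CTR path keeps the running (byte-reversed) counter in v15 and
	// the 1<<96..4<<96 counter increments in v11-v14 across calls to
	// _bsaes_encrypt8_alt, and AAPCS64 requires the low 64 bits of
	// v8-v15 to be preserved, so d8-d15 are all saved here.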
1278 stp d8, d9, [sp, #16] 1279 stp d10, d11, [sp, #32] 1280 stp d12, d13, [sp, #48] 1281 stp d14, d15, [sp, #64] 1282 1283 ldr w15, [x3, #240] // get # of rounds 1284 mov x14, sp 1285 1286 // allocate the key schedule on the stack 1287 add x17, sp, #96 1288 sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes 1289 1290 // populate the key schedule 1291 mov x9, x3 // pass key 1292 mov x10, x15 // pass # of rounds 1293 mov sp, x17 // sp is sp 1294 bl _bsaes_key_convert 1295 eor v7.16b, v7.16b, v15.16b // fix up last round key 1296 str q7, [x17] // save last round key 1297 1298 ldr q0, [x4] // load counter 1299 add x13, x11, #.LREVM0SR-.LM0_bigendian 1300 ldr q4, [sp] // load round0 key 1301 1302 movi v8.4s, #1 // compose 1<<96 1303 movi v9.16b, #0 1304 rev32 v15.16b, v0.16b 1305 rev32 v0.16b, v0.16b 1306 ext v11.16b, v9.16b, v8.16b, #4 1307 rev32 v4.16b, v4.16b 1308 add v12.4s, v11.4s, v11.4s // compose 2<<96 1309 str q4, [sp] // save adjusted round0 key 1310 add v13.4s, v11.4s, v12.4s // compose 3<<96 1311 add v14.4s, v12.4s, v12.4s // compose 4<<96 1312 b .Lctr_enc_loop 1313 1314.align 4 1315.Lctr_enc_loop: 1316 // Intermix prologue from _bsaes_encrypt8 to use the opportunity 1317 // to flip byte order in 32-bit counter 1318 1319 add v1.4s, v15.4s, v11.4s // +1 1320 add x9, sp, #0x10 // pass next round key 1321 add v2.4s, v15.4s, v12.4s // +2 1322 ldr q9, [x13] // .LREVM0SR 1323 ldr q8, [sp] // load round0 key 1324 add v3.4s, v15.4s, v13.4s // +3 1325 mov x10, x15 // pass rounds 1326 sub x11, x13, #.LREVM0SR-.LSR // pass constants 1327 add v6.4s, v2.4s, v14.4s 1328 add v4.4s, v15.4s, v14.4s // +4 1329 add v7.4s, v3.4s, v14.4s 1330 add v15.4s, v4.4s, v14.4s // next counter 1331 add v5.4s, v1.4s, v14.4s 1332 1333 bl _bsaes_encrypt8_alt 1334 1335 subs x2, x2, #8 1336 blo .Lctr_enc_loop_done 1337 1338 ldr q16, [x0], #16 1339 ldr q17, [x0], #16 1340 eor v1.16b, v1.16b, v17.16b 1341 ldr q17, [x0], #16 1342 eor v0.16b, v0.16b, v16.16b 1343 eor v4.16b, v4.16b, v17.16b 1344 str q0, [x1], #16 1345 ldr q16, [x0], #16 1346 str q1, [x1], #16 1347 mov v0.16b, v15.16b 1348 str q4, [x1], #16 1349 ldr q1, [x0], #16 1350 eor v4.16b, v6.16b, v16.16b 1351 eor v1.16b, v3.16b, v1.16b 1352 ldr q3, [x0], #16 1353 eor v3.16b, v7.16b, v3.16b 1354 ldr q6, [x0], #16 1355 eor v2.16b, v2.16b, v6.16b 1356 ldr q6, [x0], #16 1357 eor v5.16b, v5.16b, v6.16b 1358 str q4, [x1], #16 1359 str q1, [x1], #16 1360 str q3, [x1], #16 1361 str q2, [x1], #16 1362 str q5, [x1], #16 1363 1364 bne .Lctr_enc_loop 1365 b .Lctr_enc_done 1366 1367.align 4 1368.Lctr_enc_loop_done: 1369 add x2, x2, #8 1370 ldr q16, [x0], #16 // load input 1371 eor v0.16b, v0.16b, v16.16b 1372 str q0, [x1], #16 // write output 1373 cmp x2, #2 1374 blo .Lctr_enc_done 1375 ldr q17, [x0], #16 1376 eor v1.16b, v1.16b, v17.16b 1377 str q1, [x1], #16 1378 beq .Lctr_enc_done 1379 ldr q18, [x0], #16 1380 eor v4.16b, v4.16b, v18.16b 1381 str q4, [x1], #16 1382 cmp x2, #4 1383 blo .Lctr_enc_done 1384 ldr q19, [x0], #16 1385 eor v6.16b, v6.16b, v19.16b 1386 str q6, [x1], #16 1387 beq .Lctr_enc_done 1388 ldr q20, [x0], #16 1389 eor v3.16b, v3.16b, v20.16b 1390 str q3, [x1], #16 1391 cmp x2, #6 1392 blo .Lctr_enc_done 1393 ldr q21, [x0], #16 1394 eor v7.16b, v7.16b, v21.16b 1395 str q7, [x1], #16 1396 beq .Lctr_enc_done 1397 ldr q22, [x0] 1398 eor v2.16b, v2.16b, v22.16b 1399 str q2, [x1], #16 1400 1401.Lctr_enc_done: 1402 movi v0.16b, #0 1403 movi v1.16b, #0 1404.Lctr_enc_bzero: // wipe key schedule [if any] 1405 stp q0, q1, [sp], #32 1406 cmp sp, 
x14 1407 bne .Lctr_enc_bzero 1408 1409 ldp d8, d9, [sp, #16] 1410 ldp d10, d11, [sp, #32] 1411 ldp d12, d13, [sp, #48] 1412 ldp d14, d15, [sp, #64] 1413 ldp x29, x30, [sp], #80 1414 ret 1415 1416.Lctr_enc_short: 1417 stp x29, x30, [sp, #-96]! 1418 stp x19, x20, [sp, #16] 1419 stp x21, x22, [sp, #32] 1420 str x23, [sp, #48] 1421 1422 mov x19, x0 // copy arguments 1423 mov x20, x1 1424 mov x21, x2 1425 mov x22, x3 1426 ldr w23, [x4, #12] // load counter .LSW 1427 ldr q1, [x4] // load whole counter value 1428#ifdef __AARCH64EL__ 1429 rev w23, w23 1430#endif 1431 str q1, [sp, #80] // copy counter value 1432 1433.Lctr_enc_short_loop: 1434 add x0, sp, #80 // input counter value 1435 add x1, sp, #64 // output on the stack 1436 mov x2, x22 // key 1437 1438 bl AES_encrypt 1439 1440 ldr q0, [x19], #16 // load input 1441 ldr q1, [sp, #64] // load encrypted counter 1442 add x23, x23, #1 1443#ifdef __AARCH64EL__ 1444 rev w0, w23 1445 str w0, [sp, #80+12] // next counter value 1446#else 1447 str w23, [sp, #80+12] // next counter value 1448#endif 1449 eor v0.16b, v0.16b, v1.16b 1450 str q0, [x20], #16 // store output 1451 subs x21, x21, #1 1452 bne .Lctr_enc_short_loop 1453 1454 movi v0.16b, #0 1455 movi v1.16b, #0 1456 stp q0, q1, [sp, #64] 1457 1458 ldr x23, [sp, #48] 1459 ldp x21, x22, [sp, #32] 1460 ldp x19, x20, [sp, #16] 1461 ldp x29, x30, [sp], #96 1462 ret 1463.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks 1464 1465.globl ossl_bsaes_xts_encrypt 1466.type ossl_bsaes_xts_encrypt,%function 1467.align 4 1468// On entry: 1469// x0 -> input plaintext 1470// x1 -> output ciphertext 1471// x2 -> length of text in bytes (must be at least 16) 1472// x3 -> key1 (used to encrypt the XORed plaintext blocks) 1473// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) 1474// x5 -> 16-byte initial vector (typically, sector number) 1475// On exit: 1476// Output ciphertext filled in 1477// No output registers, usual AAPCS64 register preservation 1478ossl_bsaes_xts_encrypt: 1479 // Stack layout: 1480 // sp -> 1481 // nrounds*128-96 bytes: key schedule 1482 // x19 -> 1483 // 16 bytes: frame record 1484 // 4*16 bytes: tweak storage across _bsaes_encrypt8 1485 // 6*8 bytes: storage for 5 callee-saved general-purpose registers 1486 // 8*8 bytes: storage for 8 callee-saved SIMD registers 1487 stp x29, x30, [sp, #-192]! 
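	// In the loops below each successive tweak is obtained by
	// multiplying the previous one by x in GF(2^128) under the XTS
	// reduction polynomial x^128 + x^7 + x^2 + x + 1; the
	// sshr/cmtst/add/ext/and/eor sequences driven by the .Lxts_magic
	// constants (1, 0x87, 1<<62) implement this. A rough C sketch of
	// one doubling step (illustrative only, tweak held as two
	// little-endian 64-bit halves):
	//
	//     uint64_t carry = hi >> 63;        // bit that falls off the top
	//     hi = (hi << 1) | (lo >> 63);
	//     lo = (lo << 1) ^ (carry * 0x87);  // reduce modulo the polynomial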
1488 stp x19, x20, [sp, #80] 1489 stp x21, x22, [sp, #96] 1490 str x23, [sp, #112] 1491 stp d8, d9, [sp, #128] 1492 stp d10, d11, [sp, #144] 1493 stp d12, d13, [sp, #160] 1494 stp d14, d15, [sp, #176] 1495 1496 mov x19, sp 1497 mov x20, x0 1498 mov x21, x1 1499 mov x22, x2 1500 mov x23, x3 1501 1502 // generate initial tweak 1503 sub sp, sp, #16 1504 mov x0, x5 // iv[] 1505 mov x1, sp 1506 mov x2, x4 // key2 1507 bl AES_encrypt 1508 ldr q11, [sp], #16 1509 1510 ldr w1, [x23, #240] // get # of rounds 1511 // allocate the key schedule on the stack 1512 add x17, sp, #96 1513 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes 1514 1515 // populate the key schedule 1516 mov x9, x23 // pass key 1517 mov x10, x1 // pass # of rounds 1518 mov sp, x17 1519 bl _bsaes_key_convert 1520 eor v15.16b, v15.16b, v7.16b // fix up last round key 1521 str q15, [x17] // save last round key 1522 1523 subs x22, x22, #0x80 1524 blo .Lxts_enc_short 1525 b .Lxts_enc_loop 1526 1527.align 4 1528.Lxts_enc_loop: 1529 ldr q8, .Lxts_magic 1530 mov x10, x1 // pass rounds 1531 add x2, x19, #16 1532 ldr q0, [x20], #16 1533 sshr v1.2d, v11.2d, #63 1534 mov x9, sp // pass key schedule 1535 ldr q6, .Lxts_magic+16 1536 add v2.2d, v11.2d, v11.2d 1537 cmtst v3.2d, v11.2d, v6.2d 1538 and v1.16b, v1.16b, v8.16b 1539 ext v1.16b, v1.16b, v1.16b, #8 1540 and v3.16b, v3.16b, v8.16b 1541 ldr q4, [x20], #16 1542 eor v12.16b, v2.16b, v1.16b 1543 eor v1.16b, v4.16b, v12.16b 1544 eor v0.16b, v0.16b, v11.16b 1545 cmtst v2.2d, v12.2d, v6.2d 1546 add v4.2d, v12.2d, v12.2d 1547 add x0, x19, #16 1548 ext v3.16b, v3.16b, v3.16b, #8 1549 and v2.16b, v2.16b, v8.16b 1550 eor v13.16b, v4.16b, v3.16b 1551 ldr q3, [x20], #16 1552 ext v4.16b, v2.16b, v2.16b, #8 1553 eor v2.16b, v3.16b, v13.16b 1554 ldr q3, [x20], #16 1555 add v5.2d, v13.2d, v13.2d 1556 cmtst v7.2d, v13.2d, v6.2d 1557 and v7.16b, v7.16b, v8.16b 1558 ldr q9, [x20], #16 1559 ext v7.16b, v7.16b, v7.16b, #8 1560 ldr q10, [x20], #16 1561 eor v14.16b, v5.16b, v4.16b 1562 ldr q16, [x20], #16 1563 add v4.2d, v14.2d, v14.2d 1564 eor v3.16b, v3.16b, v14.16b 1565 eor v15.16b, v4.16b, v7.16b 1566 add v5.2d, v15.2d, v15.2d 1567 ldr q7, [x20], #16 1568 cmtst v4.2d, v14.2d, v6.2d 1569 and v17.16b, v4.16b, v8.16b 1570 cmtst v18.2d, v15.2d, v6.2d 1571 eor v4.16b, v9.16b, v15.16b 1572 ext v9.16b, v17.16b, v17.16b, #8 1573 eor v9.16b, v5.16b, v9.16b 1574 add v17.2d, v9.2d, v9.2d 1575 and v18.16b, v18.16b, v8.16b 1576 eor v5.16b, v10.16b, v9.16b 1577 str q9, [x2], #16 1578 ext v10.16b, v18.16b, v18.16b, #8 1579 cmtst v9.2d, v9.2d, v6.2d 1580 and v9.16b, v9.16b, v8.16b 1581 eor v10.16b, v17.16b, v10.16b 1582 cmtst v17.2d, v10.2d, v6.2d 1583 eor v6.16b, v16.16b, v10.16b 1584 str q10, [x2], #16 1585 ext v9.16b, v9.16b, v9.16b, #8 1586 add v10.2d, v10.2d, v10.2d 1587 eor v9.16b, v10.16b, v9.16b 1588 str q9, [x2], #16 1589 eor v7.16b, v7.16b, v9.16b 1590 add v9.2d, v9.2d, v9.2d 1591 and v8.16b, v17.16b, v8.16b 1592 ext v8.16b, v8.16b, v8.16b, #8 1593 eor v8.16b, v9.16b, v8.16b 1594 str q8, [x2] // next round tweak 1595 1596 bl _bsaes_encrypt8 1597 1598 ldr q8, [x0], #16 1599 eor v0.16b, v0.16b, v11.16b 1600 eor v1.16b, v1.16b, v12.16b 1601 ldr q9, [x0], #16 1602 eor v4.16b, v4.16b, v13.16b 1603 eor v6.16b, v6.16b, v14.16b 1604 ldr q10, [x0], #16 1605 eor v3.16b, v3.16b, v15.16b 1606 subs x22, x22, #0x80 1607 str q0, [x21], #16 1608 ldr q11, [x0] // next round tweak 1609 str q1, [x21], #16 1610 eor v0.16b, v7.16b, v8.16b 1611 eor v1.16b, v2.16b, v9.16b 1612 str q4, [x21], #16 1613 eor v2.16b, 
v5.16b, v10.16b 1614 str q6, [x21], #16 1615 str q3, [x21], #16 1616 str q0, [x21], #16 1617 str q1, [x21], #16 1618 str q2, [x21], #16 1619 bpl .Lxts_enc_loop 1620 1621.Lxts_enc_short: 1622 adds x22, x22, #0x70 1623 bmi .Lxts_enc_done 1624 1625 ldr q8, .Lxts_magic 1626 sshr v1.2d, v11.2d, #63 1627 add v2.2d, v11.2d, v11.2d 1628 ldr q9, .Lxts_magic+16 1629 subs x22, x22, #0x10 1630 ldr q0, [x20], #16 1631 and v1.16b, v1.16b, v8.16b 1632 cmtst v3.2d, v11.2d, v9.2d 1633 ext v1.16b, v1.16b, v1.16b, #8 1634 and v3.16b, v3.16b, v8.16b 1635 eor v12.16b, v2.16b, v1.16b 1636 ext v1.16b, v3.16b, v3.16b, #8 1637 add v2.2d, v12.2d, v12.2d 1638 cmtst v3.2d, v12.2d, v9.2d 1639 eor v13.16b, v2.16b, v1.16b 1640 and v22.16b, v3.16b, v8.16b 1641 bmi .Lxts_enc_1 1642 1643 ext v2.16b, v22.16b, v22.16b, #8 1644 add v3.2d, v13.2d, v13.2d 1645 ldr q1, [x20], #16 1646 cmtst v4.2d, v13.2d, v9.2d 1647 subs x22, x22, #0x10 1648 eor v14.16b, v3.16b, v2.16b 1649 and v23.16b, v4.16b, v8.16b 1650 bmi .Lxts_enc_2 1651 1652 ext v3.16b, v23.16b, v23.16b, #8 1653 add v4.2d, v14.2d, v14.2d 1654 ldr q2, [x20], #16 1655 cmtst v5.2d, v14.2d, v9.2d 1656 eor v0.16b, v0.16b, v11.16b 1657 subs x22, x22, #0x10 1658 eor v15.16b, v4.16b, v3.16b 1659 and v24.16b, v5.16b, v8.16b 1660 bmi .Lxts_enc_3 1661 1662 ext v4.16b, v24.16b, v24.16b, #8 1663 add v5.2d, v15.2d, v15.2d 1664 ldr q3, [x20], #16 1665 cmtst v6.2d, v15.2d, v9.2d 1666 eor v1.16b, v1.16b, v12.16b 1667 subs x22, x22, #0x10 1668 eor v16.16b, v5.16b, v4.16b 1669 and v25.16b, v6.16b, v8.16b 1670 bmi .Lxts_enc_4 1671 1672 ext v5.16b, v25.16b, v25.16b, #8 1673 add v6.2d, v16.2d, v16.2d 1674 add x0, x19, #16 1675 cmtst v7.2d, v16.2d, v9.2d 1676 ldr q4, [x20], #16 1677 eor v2.16b, v2.16b, v13.16b 1678 str q16, [x0], #16 1679 subs x22, x22, #0x10 1680 eor v17.16b, v6.16b, v5.16b 1681 and v26.16b, v7.16b, v8.16b 1682 bmi .Lxts_enc_5 1683 1684 ext v7.16b, v26.16b, v26.16b, #8 1685 add v18.2d, v17.2d, v17.2d 1686 ldr q5, [x20], #16 1687 eor v3.16b, v3.16b, v14.16b 1688 str q17, [x0], #16 1689 subs x22, x22, #0x10 1690 eor v18.16b, v18.16b, v7.16b 1691 bmi .Lxts_enc_6 1692 1693 ldr q6, [x20], #16 1694 eor v4.16b, v4.16b, v15.16b 1695 eor v5.16b, v5.16b, v16.16b 1696 str q18, [x0] // next round tweak 1697 mov x9, sp // pass key schedule 1698 mov x10, x1 1699 add x0, x19, #16 1700 sub x22, x22, #0x10 1701 eor v6.16b, v6.16b, v17.16b 1702 1703 bl _bsaes_encrypt8 1704 1705 ldr q16, [x0], #16 1706 eor v0.16b, v0.16b, v11.16b 1707 eor v1.16b, v1.16b, v12.16b 1708 ldr q17, [x0], #16 1709 eor v4.16b, v4.16b, v13.16b 1710 eor v6.16b, v6.16b, v14.16b 1711 eor v3.16b, v3.16b, v15.16b 1712 ldr q11, [x0] // next round tweak 1713 str q0, [x21], #16 1714 str q1, [x21], #16 1715 eor v0.16b, v7.16b, v16.16b 1716 eor v1.16b, v2.16b, v17.16b 1717 str q4, [x21], #16 1718 str q6, [x21], #16 1719 str q3, [x21], #16 1720 str q0, [x21], #16 1721 str q1, [x21], #16 1722 b .Lxts_enc_done 1723 1724.align 4 1725.Lxts_enc_6: 1726 eor v4.16b, v4.16b, v15.16b 1727 eor v5.16b, v5.16b, v16.16b 1728 mov x9, sp // pass key schedule 1729 mov x10, x1 // pass rounds 1730 add x0, x19, #16 1731 1732 bl _bsaes_encrypt8 1733 1734 ldr q16, [x0], #16 1735 eor v0.16b, v0.16b, v11.16b 1736 eor v1.16b, v1.16b, v12.16b 1737 eor v4.16b, v4.16b, v13.16b 1738 eor v6.16b, v6.16b, v14.16b 1739 ldr q11, [x0] // next round tweak 1740 eor v3.16b, v3.16b, v15.16b 1741 str q0, [x21], #16 1742 str q1, [x21], #16 1743 eor v0.16b, v7.16b, v16.16b 1744 str q4, [x21], #16 1745 str q6, [x21], #16 1746 str q3, [x21], #16 1747 str q0, [x21], #16 1748 
b .Lxts_enc_done 1749 1750.align 4 1751.Lxts_enc_5: 1752 eor v3.16b, v3.16b, v14.16b 1753 eor v4.16b, v4.16b, v15.16b 1754 mov x9, sp // pass key schedule 1755 mov x10, x1 // pass rounds 1756 add x0, x19, #16 1757 1758 bl _bsaes_encrypt8 1759 1760 eor v0.16b, v0.16b, v11.16b 1761 eor v1.16b, v1.16b, v12.16b 1762 ldr q11, [x0] // next round tweak 1763 eor v4.16b, v4.16b, v13.16b 1764 eor v6.16b, v6.16b, v14.16b 1765 eor v3.16b, v3.16b, v15.16b 1766 str q0, [x21], #16 1767 str q1, [x21], #16 1768 str q4, [x21], #16 1769 str q6, [x21], #16 1770 str q3, [x21], #16 1771 b .Lxts_enc_done 1772 1773.align 4 1774.Lxts_enc_4: 1775 eor v2.16b, v2.16b, v13.16b 1776 eor v3.16b, v3.16b, v14.16b 1777 mov x9, sp // pass key schedule 1778 mov x10, x1 // pass rounds 1779 add x0, x19, #16 1780 1781 bl _bsaes_encrypt8 1782 1783 eor v0.16b, v0.16b, v11.16b 1784 eor v1.16b, v1.16b, v12.16b 1785 eor v4.16b, v4.16b, v13.16b 1786 eor v6.16b, v6.16b, v14.16b 1787 mov v11.16b, v15.16b // next round tweak 1788 str q0, [x21], #16 1789 str q1, [x21], #16 1790 str q4, [x21], #16 1791 str q6, [x21], #16 1792 b .Lxts_enc_done 1793 1794.align 4 1795.Lxts_enc_3: 1796 eor v1.16b, v1.16b, v12.16b 1797 eor v2.16b, v2.16b, v13.16b 1798 mov x9, sp // pass key schedule 1799 mov x10, x1 // pass rounds 1800 add x0, x19, #16 1801 1802 bl _bsaes_encrypt8 1803 1804 eor v0.16b, v0.16b, v11.16b 1805 eor v1.16b, v1.16b, v12.16b 1806 eor v4.16b, v4.16b, v13.16b 1807 mov v11.16b, v14.16b // next round tweak 1808 str q0, [x21], #16 1809 str q1, [x21], #16 1810 str q4, [x21], #16 1811 b .Lxts_enc_done 1812 1813.align 4 1814.Lxts_enc_2: 1815 eor v0.16b, v0.16b, v11.16b 1816 eor v1.16b, v1.16b, v12.16b 1817 mov x9, sp // pass key schedule 1818 mov x10, x1 // pass rounds 1819 add x0, x19, #16 1820 1821 bl _bsaes_encrypt8 1822 1823 eor v0.16b, v0.16b, v11.16b 1824 eor v1.16b, v1.16b, v12.16b 1825 mov v11.16b, v13.16b // next round tweak 1826 str q0, [x21], #16 1827 str q1, [x21], #16 1828 b .Lxts_enc_done 1829 1830.align 4 1831.Lxts_enc_1: 1832 eor v0.16b, v0.16b, v11.16b 1833 sub x0, sp, #16 1834 sub x1, sp, #16 1835 mov x2, x23 1836 mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers 1837 mov v14.d[0], v12.d[1] 1838 str q0, [sp, #-16]! 1839 1840 bl AES_encrypt 1841 1842 ldr q0, [sp], #16 1843 trn1 v13.2d, v11.2d, v13.2d 1844 trn1 v11.2d, v12.2d, v14.2d // next round tweak 1845 eor v0.16b, v0.16b, v13.16b 1846 str q0, [x21], #16 1847 1848.Lxts_enc_done: 1849 adds x22, x22, #0x10 1850 beq .Lxts_enc_ret 1851 1852 sub x6, x21, #0x10 1853 // Penultimate plaintext block produces final ciphertext part-block 1854 // plus remaining part of final plaintext block. Move ciphertext part 1855 // to final position and re-use penultimate ciphertext block buffer to 1856 // construct final plaintext block 1857.Lxts_enc_steal: 1858 ldrb w0, [x20], #1 1859 ldrb w1, [x21, #-0x10] 1860 strb w0, [x21, #-0x10] 1861 strb w1, [x21], #1 1862 1863 subs x22, x22, #1 1864 bhi .Lxts_enc_steal 1865 1866 // Finally encrypt the penultimate ciphertext block using the 1867 // last tweak 1868 ldr q0, [x6] 1869 eor v0.16b, v0.16b, v11.16b 1870 str q0, [sp, #-16]! 
1871 mov x0, sp 1872 mov x1, sp 1873 mov x2, x23 1874 mov x21, x6 1875 mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers 1876 1877 bl AES_encrypt 1878 1879 trn1 v11.2d, v11.2d, v13.2d 1880 ldr q0, [sp], #16 1881 eor v0.16b, v0.16b, v11.16b 1882 str q0, [x21] 1883 1884.Lxts_enc_ret: 1885 1886 movi v0.16b, #0 1887 movi v1.16b, #0 1888.Lxts_enc_bzero: // wipe key schedule 1889 stp q0, q1, [sp], #32 1890 cmp sp, x19 1891 bne .Lxts_enc_bzero 1892 1893 ldp x19, x20, [sp, #80] 1894 ldp x21, x22, [sp, #96] 1895 ldr x23, [sp, #112] 1896 ldp d8, d9, [sp, #128] 1897 ldp d10, d11, [sp, #144] 1898 ldp d12, d13, [sp, #160] 1899 ldp d14, d15, [sp, #176] 1900 ldp x29, x30, [sp], #192 1901 ret 1902.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt 1903 1904// The assembler doesn't seem capable of de-duplicating these when expressed 1905// using `ldr qd,=` syntax, so assign a symbolic address 1906.align 5 1907.Lxts_magic: 1908.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 1909 1910.globl ossl_bsaes_xts_decrypt 1911.type ossl_bsaes_xts_decrypt,%function 1912.align 4 1913// On entry: 1914// x0 -> input ciphertext 1915// x1 -> output plaintext 1916// x2 -> length of text in bytes (must be at least 16) 1917// x3 -> key1 (used to decrypt the XORed ciphertext blocks) 1918// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) 1919// x5 -> 16-byte initial vector (typically, sector number) 1920// On exit: 1921// Output plaintext filled in 1922// No output registers, usual AAPCS64 register preservation 1923ossl_bsaes_xts_decrypt: 1924 // Stack layout: 1925 // sp -> 1926 // nrounds*128-96 bytes: key schedule 1927 // x19 -> 1928 // 16 bytes: frame record 1929 // 4*16 bytes: tweak storage across _bsaes_decrypt8 1930 // 6*8 bytes: storage for 5 callee-saved general-purpose registers 1931 // 8*8 bytes: storage for 8 callee-saved SIMD registers 1932 stp x29, x30, [sp, #-192]! 
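	// The decrypt path differs from the encrypt path in one respect:
	// when the total length is not a multiple of 16, one full block is
	// held back from the bulk loops so that ciphertext stealing can be
	// applied to it (see .Lxts_dec_done below), which requires one
	// extra tweak doubling.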
1933 stp x19, x20, [sp, #80] 1934 stp x21, x22, [sp, #96] 1935 str x23, [sp, #112] 1936 stp d8, d9, [sp, #128] 1937 stp d10, d11, [sp, #144] 1938 stp d12, d13, [sp, #160] 1939 stp d14, d15, [sp, #176] 1940 1941 mov x19, sp 1942 mov x20, x0 1943 mov x21, x1 1944 mov x22, x2 1945 mov x23, x3 1946 1947 // generate initial tweak 1948 sub sp, sp, #16 1949 mov x0, x5 // iv[] 1950 mov x1, sp 1951 mov x2, x4 // key2 1952 bl AES_encrypt 1953 ldr q11, [sp], #16 1954 1955 ldr w1, [x23, #240] // get # of rounds 1956 // allocate the key schedule on the stack 1957 add x17, sp, #96 1958 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes 1959 1960 // populate the key schedule 1961 mov x9, x23 // pass key 1962 mov x10, x1 // pass # of rounds 1963 mov sp, x17 1964 bl _bsaes_key_convert 1965 ldr q6, [sp] 1966 str q15, [x17] // save last round key 1967 eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) 1968 str q6, [sp] 1969 1970 sub x30, x22, #0x10 1971 tst x22, #0xf // if not multiple of 16 1972 csel x22, x30, x22, ne // subtract another 16 bytes 1973 subs x22, x22, #0x80 1974 1975 blo .Lxts_dec_short 1976 b .Lxts_dec_loop 1977 1978.align 4 1979.Lxts_dec_loop: 1980 ldr q8, .Lxts_magic 1981 mov x10, x1 // pass rounds 1982 add x2, x19, #16 1983 ldr q0, [x20], #16 1984 sshr v1.2d, v11.2d, #63 1985 mov x9, sp // pass key schedule 1986 ldr q6, .Lxts_magic+16 1987 add v2.2d, v11.2d, v11.2d 1988 cmtst v3.2d, v11.2d, v6.2d 1989 and v1.16b, v1.16b, v8.16b 1990 ext v1.16b, v1.16b, v1.16b, #8 1991 and v3.16b, v3.16b, v8.16b 1992 ldr q4, [x20], #16 1993 eor v12.16b, v2.16b, v1.16b 1994 eor v1.16b, v4.16b, v12.16b 1995 eor v0.16b, v0.16b, v11.16b 1996 cmtst v2.2d, v12.2d, v6.2d 1997 add v4.2d, v12.2d, v12.2d 1998 add x0, x19, #16 1999 ext v3.16b, v3.16b, v3.16b, #8 2000 and v2.16b, v2.16b, v8.16b 2001 eor v13.16b, v4.16b, v3.16b 2002 ldr q3, [x20], #16 2003 ext v4.16b, v2.16b, v2.16b, #8 2004 eor v2.16b, v3.16b, v13.16b 2005 ldr q3, [x20], #16 2006 add v5.2d, v13.2d, v13.2d 2007 cmtst v7.2d, v13.2d, v6.2d 2008 and v7.16b, v7.16b, v8.16b 2009 ldr q9, [x20], #16 2010 ext v7.16b, v7.16b, v7.16b, #8 2011 ldr q10, [x20], #16 2012 eor v14.16b, v5.16b, v4.16b 2013 ldr q16, [x20], #16 2014 add v4.2d, v14.2d, v14.2d 2015 eor v3.16b, v3.16b, v14.16b 2016 eor v15.16b, v4.16b, v7.16b 2017 add v5.2d, v15.2d, v15.2d 2018 ldr q7, [x20], #16 2019 cmtst v4.2d, v14.2d, v6.2d 2020 and v17.16b, v4.16b, v8.16b 2021 cmtst v18.2d, v15.2d, v6.2d 2022 eor v4.16b, v9.16b, v15.16b 2023 ext v9.16b, v17.16b, v17.16b, #8 2024 eor v9.16b, v5.16b, v9.16b 2025 add v17.2d, v9.2d, v9.2d 2026 and v18.16b, v18.16b, v8.16b 2027 eor v5.16b, v10.16b, v9.16b 2028 str q9, [x2], #16 2029 ext v10.16b, v18.16b, v18.16b, #8 2030 cmtst v9.2d, v9.2d, v6.2d 2031 and v9.16b, v9.16b, v8.16b 2032 eor v10.16b, v17.16b, v10.16b 2033 cmtst v17.2d, v10.2d, v6.2d 2034 eor v6.16b, v16.16b, v10.16b 2035 str q10, [x2], #16 2036 ext v9.16b, v9.16b, v9.16b, #8 2037 add v10.2d, v10.2d, v10.2d 2038 eor v9.16b, v10.16b, v9.16b 2039 str q9, [x2], #16 2040 eor v7.16b, v7.16b, v9.16b 2041 add v9.2d, v9.2d, v9.2d 2042 and v8.16b, v17.16b, v8.16b 2043 ext v8.16b, v8.16b, v8.16b, #8 2044 eor v8.16b, v9.16b, v8.16b 2045 str q8, [x2] // next round tweak 2046 2047 bl _bsaes_decrypt8 2048 2049 eor v6.16b, v6.16b, v13.16b 2050 eor v0.16b, v0.16b, v11.16b 2051 ldr q8, [x0], #16 2052 eor v7.16b, v7.16b, v8.16b 2053 str q0, [x21], #16 2054 eor v0.16b, v1.16b, v12.16b 2055 ldr q1, [x0], #16 2056 eor v1.16b, v3.16b, v1.16b 2057 subs x22, x22, #0x80 2058 eor 
.align 4
.Lxts_dec_loop:
        ldr q8, .Lxts_magic
        mov x10, x1 // pass rounds
        add x2, x19, #16
        ldr q0, [x20], #16
        sshr v1.2d, v11.2d, #63
        mov x9, sp // pass key schedule
        ldr q6, .Lxts_magic+16
        add v2.2d, v11.2d, v11.2d
        cmtst v3.2d, v11.2d, v6.2d
        and v1.16b, v1.16b, v8.16b
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        ldr q4, [x20], #16
        eor v12.16b, v2.16b, v1.16b
        eor v1.16b, v4.16b, v12.16b
        eor v0.16b, v0.16b, v11.16b
        cmtst v2.2d, v12.2d, v6.2d
        add v4.2d, v12.2d, v12.2d
        add x0, x19, #16
        ext v3.16b, v3.16b, v3.16b, #8
        and v2.16b, v2.16b, v8.16b
        eor v13.16b, v4.16b, v3.16b
        ldr q3, [x20], #16
        ext v4.16b, v2.16b, v2.16b, #8
        eor v2.16b, v3.16b, v13.16b
        ldr q3, [x20], #16
        add v5.2d, v13.2d, v13.2d
        cmtst v7.2d, v13.2d, v6.2d
        and v7.16b, v7.16b, v8.16b
        ldr q9, [x20], #16
        ext v7.16b, v7.16b, v7.16b, #8
        ldr q10, [x20], #16
        eor v14.16b, v5.16b, v4.16b
        ldr q16, [x20], #16
        add v4.2d, v14.2d, v14.2d
        eor v3.16b, v3.16b, v14.16b
        eor v15.16b, v4.16b, v7.16b
        add v5.2d, v15.2d, v15.2d
        ldr q7, [x20], #16
        cmtst v4.2d, v14.2d, v6.2d
        and v17.16b, v4.16b, v8.16b
        cmtst v18.2d, v15.2d, v6.2d
        eor v4.16b, v9.16b, v15.16b
        ext v9.16b, v17.16b, v17.16b, #8
        eor v9.16b, v5.16b, v9.16b
        add v17.2d, v9.2d, v9.2d
        and v18.16b, v18.16b, v8.16b
        eor v5.16b, v10.16b, v9.16b
        str q9, [x2], #16
        ext v10.16b, v18.16b, v18.16b, #8
        cmtst v9.2d, v9.2d, v6.2d
        and v9.16b, v9.16b, v8.16b
        eor v10.16b, v17.16b, v10.16b
        cmtst v17.2d, v10.2d, v6.2d
        eor v6.16b, v16.16b, v10.16b
        str q10, [x2], #16
        ext v9.16b, v9.16b, v9.16b, #8
        add v10.2d, v10.2d, v10.2d
        eor v9.16b, v10.16b, v9.16b
        str q9, [x2], #16
        eor v7.16b, v7.16b, v9.16b
        add v9.2d, v9.2d, v9.2d
        and v8.16b, v17.16b, v8.16b
        ext v8.16b, v8.16b, v8.16b, #8
        eor v8.16b, v9.16b, v8.16b
        str q8, [x2] // next round tweak

        bl _bsaes_decrypt8

        eor v6.16b, v6.16b, v13.16b
        eor v0.16b, v0.16b, v11.16b
        ldr q8, [x0], #16
        eor v7.16b, v7.16b, v8.16b
        str q0, [x21], #16
        eor v0.16b, v1.16b, v12.16b
        ldr q1, [x0], #16
        eor v1.16b, v3.16b, v1.16b
        subs x22, x22, #0x80
        eor v2.16b, v2.16b, v15.16b
        eor v3.16b, v4.16b, v14.16b
        ldr q4, [x0], #16
        str q0, [x21], #16
        ldr q11, [x0] // next round tweak
        eor v0.16b, v5.16b, v4.16b
        str q6, [x21], #16
        str q3, [x21], #16
        str q2, [x21], #16
        str q7, [x21], #16
        str q1, [x21], #16
        str q0, [x21], #16
        bpl .Lxts_dec_loop

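        // Fewer than eight blocks remain. Adding 0x70 back leaves x22 equal
        // to the number of bytes still to be processed less one block, so
        // the bmi below skips straight to .Lxts_dec_done when only the final
        // partial block (if any) is left. Otherwise tweaks are generated one
        // at a time and the code branches out to .Lxts_dec_1 ... .Lxts_dec_6
        // as soon as the input is exhausted; falling all the way through
        // handles a seven-block tail. Any trailing partial block is dealt
        // with after .Lxts_dec_done by ciphertext stealing.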
.Lxts_dec_short:
        adds x22, x22, #0x70
        bmi .Lxts_dec_done

        ldr q8, .Lxts_magic
        sshr v1.2d, v11.2d, #63
        add v2.2d, v11.2d, v11.2d
        ldr q9, .Lxts_magic+16
        subs x22, x22, #0x10
        ldr q0, [x20], #16
        and v1.16b, v1.16b, v8.16b
        cmtst v3.2d, v11.2d, v9.2d
        ext v1.16b, v1.16b, v1.16b, #8
        and v3.16b, v3.16b, v8.16b
        eor v12.16b, v2.16b, v1.16b
        ext v1.16b, v3.16b, v3.16b, #8
        add v2.2d, v12.2d, v12.2d
        cmtst v3.2d, v12.2d, v9.2d
        eor v13.16b, v2.16b, v1.16b
        and v22.16b, v3.16b, v8.16b
        bmi .Lxts_dec_1

        ext v2.16b, v22.16b, v22.16b, #8
        add v3.2d, v13.2d, v13.2d
        ldr q1, [x20], #16
        cmtst v4.2d, v13.2d, v9.2d
        subs x22, x22, #0x10
        eor v14.16b, v3.16b, v2.16b
        and v23.16b, v4.16b, v8.16b
        bmi .Lxts_dec_2

        ext v3.16b, v23.16b, v23.16b, #8
        add v4.2d, v14.2d, v14.2d
        ldr q2, [x20], #16
        cmtst v5.2d, v14.2d, v9.2d
        eor v0.16b, v0.16b, v11.16b
        subs x22, x22, #0x10
        eor v15.16b, v4.16b, v3.16b
        and v24.16b, v5.16b, v8.16b
        bmi .Lxts_dec_3

        ext v4.16b, v24.16b, v24.16b, #8
        add v5.2d, v15.2d, v15.2d
        ldr q3, [x20], #16
        cmtst v6.2d, v15.2d, v9.2d
        eor v1.16b, v1.16b, v12.16b
        subs x22, x22, #0x10
        eor v16.16b, v5.16b, v4.16b
        and v25.16b, v6.16b, v8.16b
        bmi .Lxts_dec_4

        ext v5.16b, v25.16b, v25.16b, #8
        add v6.2d, v16.2d, v16.2d
        add x0, x19, #16
        cmtst v7.2d, v16.2d, v9.2d
        ldr q4, [x20], #16
        eor v2.16b, v2.16b, v13.16b
        str q16, [x0], #16
        subs x22, x22, #0x10
        eor v17.16b, v6.16b, v5.16b
        and v26.16b, v7.16b, v8.16b
        bmi .Lxts_dec_5

        ext v7.16b, v26.16b, v26.16b, #8
        add v18.2d, v17.2d, v17.2d
        ldr q5, [x20], #16
        eor v3.16b, v3.16b, v14.16b
        str q17, [x0], #16
        subs x22, x22, #0x10
        eor v18.16b, v18.16b, v7.16b
        bmi .Lxts_dec_6

        ldr q6, [x20], #16
        eor v4.16b, v4.16b, v15.16b
        eor v5.16b, v5.16b, v16.16b
        str q18, [x0] // next round tweak
        mov x9, sp // pass key schedule
        mov x10, x1
        add x0, x19, #16
        sub x22, x22, #0x10
        eor v6.16b, v6.16b, v17.16b

        bl _bsaes_decrypt8

        ldr q16, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q17, [x0], #16
        eor v6.16b, v6.16b, v13.16b
        eor v4.16b, v4.16b, v14.16b
        eor v2.16b, v2.16b, v15.16b
        ldr q11, [x0] // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        eor v0.16b, v7.16b, v16.16b
        eor v1.16b, v3.16b, v17.16b
        str q6, [x21], #16
        str q4, [x21], #16
        str q2, [x21], #16
        str q0, [x21], #16
        str q1, [x21], #16
        b .Lxts_dec_done

.align 4
.Lxts_dec_6:
        eor v4.16b, v4.16b, v15.16b
        eor v5.16b, v5.16b, v16.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_decrypt8

        ldr q16, [x0], #16
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v6.16b, v6.16b, v13.16b
        eor v4.16b, v4.16b, v14.16b
        ldr q11, [x0] // next round tweak
        eor v2.16b, v2.16b, v15.16b
        str q0, [x21], #16
        str q1, [x21], #16
        eor v0.16b, v7.16b, v16.16b
        str q6, [x21], #16
        str q4, [x21], #16
        str q2, [x21], #16
        str q0, [x21], #16
        b .Lxts_dec_done

.align 4
.Lxts_dec_5:
        eor v3.16b, v3.16b, v14.16b
        eor v4.16b, v4.16b, v15.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_decrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        ldr q11, [x0] // next round tweak
        eor v6.16b, v6.16b, v13.16b
        eor v4.16b, v4.16b, v14.16b
        eor v2.16b, v2.16b, v15.16b
        str q0, [x21], #16
        str q1, [x21], #16
        str q6, [x21], #16
        str q4, [x21], #16
        str q2, [x21], #16
        b .Lxts_dec_done

.align 4
.Lxts_dec_4:
        eor v2.16b, v2.16b, v13.16b
        eor v3.16b, v3.16b, v14.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_decrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v6.16b, v6.16b, v13.16b
        eor v4.16b, v4.16b, v14.16b
        mov v11.16b, v15.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        str q6, [x21], #16
        str q4, [x21], #16
        b .Lxts_dec_done

.align 4
.Lxts_dec_3:
        eor v1.16b, v1.16b, v12.16b
        eor v2.16b, v2.16b, v13.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_decrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        eor v6.16b, v6.16b, v13.16b
        mov v11.16b, v14.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        str q6, [x21], #16
        b .Lxts_dec_done

.align 4
.Lxts_dec_2:
        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        mov x9, sp // pass key schedule
        mov x10, x1 // pass rounds
        add x0, x19, #16

        bl _bsaes_decrypt8

        eor v0.16b, v0.16b, v11.16b
        eor v1.16b, v1.16b, v12.16b
        mov v11.16b, v13.16b // next round tweak
        str q0, [x21], #16
        str q1, [x21], #16
        b .Lxts_dec_done

.align 4
.Lxts_dec_1:
        eor v0.16b, v0.16b, v11.16b
        sub x0, sp, #16
        sub x1, sp, #16
        mov x2, x23
        mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov v14.d[0], v12.d[1]
        str q0, [sp, #-16]!

        bl AES_decrypt

        ldr q0, [sp], #16
        trn1 v13.2d, v11.2d, v13.2d
        trn1 v11.2d, v12.2d, v14.2d // next round tweak
        eor v0.16b, v0.16b, v13.16b
        str q0, [x21], #16

.Lxts_dec_done:
        adds x22, x22, #0x10
        beq .Lxts_dec_ret

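        // Ciphertext stealing: the total length was not a multiple of 16, so
        // x22 now holds the size of the trailing partial block. The last
        // complete ciphertext block is decrypted with one extra tweak
        // (v12 = v11 * x); that yields the final partial plaintext plus the
        // bytes needed to pad the trailing partial ciphertext block out to a
        // full 16 bytes. The .Lxts_dec_steal loop swaps those bytes into
        // place, and the reassembled block is then decrypted with the
        // preceding tweak (v11) to produce the penultimate plaintext block.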
        // calculate one round of extra tweak for the stolen ciphertext
        ldr q8, .Lxts_magic
        sshr v6.2d, v11.2d, #63
        and v6.16b, v6.16b, v8.16b
        add v12.2d, v11.2d, v11.2d
        ext v6.16b, v6.16b, v6.16b, #8
        eor v12.16b, v12.16b, v6.16b

        // perform the final decryption with the last tweak value
        ldr q0, [x20], #16
        eor v0.16b, v0.16b, v12.16b
        str q0, [sp, #-16]!
        mov x0, sp
        mov x1, sp
        mov x2, x23
        mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov v14.d[0], v12.d[1]

        bl AES_decrypt

        trn1 v12.2d, v12.2d, v14.2d
        trn1 v11.2d, v11.2d, v13.2d
        ldr q0, [sp], #16
        eor v0.16b, v0.16b, v12.16b
        str q0, [x21]

        mov x6, x21
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and re-use penultimate plaintext block buffer to
        // construct final ciphertext block
.Lxts_dec_steal:
        ldrb w1, [x21]
        ldrb w0, [x20], #1
        strb w1, [x21, #0x10]
        strb w0, [x21], #1

        subs x22, x22, #1
        bhi .Lxts_dec_steal

        // Finally decrypt the penultimate plaintext block using the
        // penultimate tweak
        ldr q0, [x6]
        eor v0.16b, v0.16b, v11.16b
        str q0, [sp, #-16]!
        mov x0, sp
        mov x1, sp
        mov x2, x23
        mov x21, x6

        bl AES_decrypt

        trn1 v11.2d, v11.2d, v13.2d
        ldr q0, [sp], #16
        eor v0.16b, v0.16b, v11.16b
        str q0, [x21]

.Lxts_dec_ret:

        movi v0.16b, #0
        movi v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
        stp q0, q1, [sp], #32
        cmp sp, x19
        bne .Lxts_dec_bzero

        ldp x19, x20, [sp, #80]
        ldp x21, x22, [sp, #96]
        ldr x23, [sp, #112]
        ldp d8, d9, [sp, #128]
        ldp d10, d11, [sp, #144]
        ldp d12, d13, [sp, #160]
        ldp d14, d15, [sp, #176]
        ldp x29, x30, [sp], #192
        ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt