#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for this effort is that there is at least one popular
# SoC based on Cortex-A53 that doesn't have crypto extensions.
#
#                  CBC enc      ECB enc/dec(*)   [bit-sliced enc/dec]
# Cortex-A53       21.5         18.1/20.6        [17.5/19.8         ]
# Cortex-A57       36.0(**)     20.4/24.9(**)    [14.4/16.6         ]
# X-Gene           45.9(**)     45.8/57.7(**)    [33.1/37.6(**)     ]
# Denver(***)      16.6(**)     15.1/17.8(**)    [8.80/9.93         ]
# Apple A7(***)    22.7(**)     10.9/14.3        [8.45/10.0         ]
# Mongoose(***)    26.3(**)     21.0/25.0(**)    [13.3/16.8         ]
# ThunderX2(***)   39.4(**)     33.8/48.6(**)
#
# (*)	ECB denotes approximate result for parallelizable modes
#	such as CBC decrypt, CTR, etc.;
# (**)	these results are worse than scalar compiler-generated
#	code, but it's constant-time and therefore preferred;
# (***)	presented for reference/comparison purposes;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code.=<<___;
#include "arm_arch.h"

.text

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz	"Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
.align	6
___

{
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));

$code.=<<___;
//
//  _aes_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below.
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
//
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
					// vmovdqa .Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
					// vmovdqa .Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	b	.Lenc_entry

.align 4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	tbl	v5.16b, {$sb2t}, v2.16b		// vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and \$0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
					// vmovdqa -0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					// vmovdqa -0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt

.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
					// vmovdqa .Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	tbl	v9.16b, {$iptlo}, v9.16b
					// vmovdqa .Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	tbl	v10.16b, {$ipthi}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align 4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	tbl	v12.16b, {$sb1t}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	tbl	v8.16b, {$sb1u}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {$sb2t}, v2.16b		// vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	tbl	v13.16b, {$sb2t}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	tbl	v10.16b, {$sb2u}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and \$0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {$invhi},v1.16b		// vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	tbl	v13.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b		// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b		// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {$invlo},v3.16b		// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b		// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
					// vmovdqa -0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					// vmovdqa -0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	tbl	v8.16b, {$sbot}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb %xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
//  Decryption core
//
//  Same API as encryption core.
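//  As with the encryption core, this ARMv8 port takes the input block
//  in v7 and returns the result in v0; the %xmm/%r* comments carried
//  along below refer to the original SSSE3 implementation.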
//
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds

					// vmovdqa .Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30			// xor \$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and \$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
					// vmovdqa .Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor %xmm4, %xmm2, %xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	b	.Ldec_entry

.align 4
.Ldec_loop:
//
//  Inverse mix columns
//
					// vmovdqa -0x20(%r10),%xmm4		# 4 : sb9u
					// vmovdqa -0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor %xmm4, %xmm0, %xmm0
					// vmovdqa 0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					// vmovdqa 0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
					// vmovdqa 0x20(%r10), %xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					// vmovdqa 0x30(%r10), %xmm1		# 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
					// vmovdqa 0x40(%r10), %xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					// vmovdqa 0x50(%r10), %xmm1		# 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	sub	w8, w8, #1			// sub \$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
					// vmovdqa 0x60(%r10), %xmm4		# 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
					// vmovdqa 0x70(%r10), %xmm1		# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds

					// vmovdqa .Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30			// xor \$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and \$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {$iptlo},v1.16b		// vpshufb %xmm1, %xmm2, %xmm2
	tbl	v10.16b, {$iptlo},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
					// vmovdqa .Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {$ipthi},v0.16b		// vpshufb %xmm0, %xmm1, %xmm0
	tbl	v8.16b, {$ipthi},v8.16b
	eor	v2.16b, v2.16b, v16.16b		// vpxor %xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align 4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
					// vmovdqa -0x20(%r10),%xmm4		# 4 : sb9u
					// vmovdqa -0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v12.16b, {$sb9u}, v10.16b
	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	tbl	v9.16b, {$sb9t}, v11.16b
	eor	v0.16b, v4.16b, v16.16b		// vpxor %xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
					// vmovdqa 0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					// vmovdqa 0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v12.16b, {$sbdu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	tbl	v9.16b, {$sbdt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
					// vmovdqa 0x20(%r10), %xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
					// vmovdqa 0x30(%r10), %xmm1		# 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v12.16b, {$sbbu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	tbl	v9.16b, {$sbbt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
					// vmovdqa 0x40(%r10), %xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
					// vmovdqa 0x50(%r10), %xmm1		# 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v12.16b, {$sbeu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	tbl	v9.16b, {$sbet}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1			// sub \$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {$invhi},v1.16b		// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	tbl	v10.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b		// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b		// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {$invlo},v3.16b		// vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b		// vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
					// vmovdqa 0x60(%r10), %xmm4		# 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
					// vmovdqa 0x70(%r10), %xmm1		# 0 : sbot
	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	tbl	v9.16b, {$sbot}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b		// vpshufb %xmm2, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
___
}
{
my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

$code.=<<___;
////////////////////////////////////////////////////////
//                                                      //
//                  AES key schedule                    //
//                                                      //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [$inp],#16		// vmovdqu (%rdi), %xmm0	# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa %xmm0, %xmm7

	adr	x10, .Lk_sr			// lea .Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	$dir, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
	eor	x8, x8, #0x30			// xor \$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192			// cmp \$192, %esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
//  .schedule_128
//
//  128-bit specific part of key schedule.
//
//  This schedule is really simple, because all its parts
//  are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10			// mov \$10, %esi

.Loop_schedule_128:
	sub	$inp, $inp, #1			// dec %esi
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	.Loop_schedule_128

//
//  .aes_schedule_192
//
//  192-bit specific part of key schedule.
//
//  The main body of this schedule is the same as the 128-bit
//  schedule, but with more smearing.  The long, high side is
//  stored in %xmm7 as before, and the short, low side is in
//  the high bits of %xmm6.
//
//  This schedule is somewhat nastier, however, because each
//  round produces 192 bits of key material, or 1.5 round keys.
//  Therefore, on each cycle we do 2 rounds and produce 3 round
//  keys.
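//
//  (Concretely: AES-192 needs 13 round keys.  The zeroth key is
//  written before the loop below; each of the four passes through
//  the loop then emits three more, the last of them coming from
//  .Lschedule_mangle_last rather than _vpaes_schedule_mangle.)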
//
.align	4
.Lschedule_192:
	sub	$inp, $inp, #8
	ld1	{v0.16b}, [$inp]		// vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps %xmm4, %xmm6, %xmm6	# clobber low side with zeros
	mov	$inp, #4			// mov \$4, %esi

.Loop_schedule_192:
	sub	$inp, $inp, #1			// dec %esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
//  .aes_schedule_256
//
//  256-bit specific part of key schedule.
//
//  The structure here is very similar to the 128-bit
//  schedule, but with an additional "low side" in
//  %xmm6.  The low side's rounds are the same as the
//  high side's, except no rcon and no rotation.
//
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [$inp]		// vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	$inp, #7			// mov \$7, %esi

.Loop_schedule_256:
	sub	$inp, $inp, #1			// dec %esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa %xmm7, %xmm5
	mov	v7.16b, v6.16b			// vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa %xmm5, %xmm7

	b	.Loop_schedule_256

//
//  .aes_schedule_mangle_last
//
//  Mangler for last round of key schedule
//  Mangles %xmm0
//    when encrypting, outputs out(%xmm0) ^ 63
//    when decrypting, outputs unskew(%xmm0)
//
//  Always called right before return... jumps to cleanup and exits
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew			// lea .Lk_deskew(%rip),%r11	# prepare to deskew
	cbnz	$dir, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10),%xmm1
	adr	x11, .Lk_opt			// lea .Lk_opt(%rip), %r11	# prepare to output transform
	add	$out, $out, #32			// add \$32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0	# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
	sub	$out, $out, #16			// add \$-16, %rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)	# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor %xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor %xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor %xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor %xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor %xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor %xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
//  .aes_schedule_192_smear
//
//  Smear the short, low side in the 192-bit key schedule.
//
//  Inputs:
//    %xmm7: high side, b a x y
//    %xmm6: low side, d c 0 0
//    %xmm13: 0
//
//  Outputs:
//    %xmm6: b+c+d b+c 0 0
//    %xmm0: b+c+d b+c b a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa %xmm6, %xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps %xmm1, %xmm6, %xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

//
//  .aes_schedule_round
//
//  Runs one main round of the key schedule on %xmm0, %xmm7
//
//  Specifically, runs subbytes on the high dword of %xmm0
//  then rotates it by one byte and xors into the low dword of
//  %xmm7.
//
//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
//  next rcon.
//
//  Smears the dwords of %xmm7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in %xmm7 = %xmm0.
//  Clobbers %xmm1-%xmm4, %r11.
//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor %xmm4, %xmm4, %xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr \$15, %xmm8, %xmm4, %xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr \$15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr \$1, %xmm0, %xmm0, %xmm0

	// fall through...
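	// (_vpaes_schedule_low_round below is also entered directly via
	// bl from the 256-bit schedule's low rounds)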

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq \$8, %xmm7, %xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor %xmm4, %xmm7, %xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor %xmm1, %xmm3, %xmm3	# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor %xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa %xmm0, %xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

//
//  .aes_schedule_transform
//
//  Linear-transform %xmm0 according to tables at (%r11)
//
//  Requires that %xmm9 = 0x0F0F... as in preheat
//  Output in %xmm0
//  Clobbers %xmm1, %xmm2
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
					// vmovdqa (%r11), %xmm2	# lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
					// vmovdqa 16(%r11), %xmm1	# hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

//
//  .aes_schedule_mangle
//
//  Mangle xmm0 from (basis-transformed) standard version
//  to our version.
//
//  On encrypt,
//    xor with 0x63
//    multiply by circulant 0,1,1,1
//    apply shiftrows transform
//
//  On decrypt,
//    xor with 0x63
//    multiply by "inverse mixcolumns" circulant E,B,D,9
//    deskew
//    apply shiftrows transform
//
//
//  Writes out to (%rdx), and increments or decrements it
//  Keeps track of round number mod 4 in %r8
//  Preserves xmm0
//  Clobbers xmm1-xmm5
//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa %xmm0, %xmm4	# save xmm0 for later
					// vmovdqa .Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add	$out, $out, #16			// add \$16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor %xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
					// lea .Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand %xmm9, %xmm4, %xmm4	# 4 = lo

					// vmovdqa 0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
					// vmovdqa 0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

					// vmovdqa 0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
					// vmovdqa 0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

					// vmovdqa 0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
					// vmovdqa 0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3

					// vmovdqa 0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
					// vmovdqa 0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm3

	sub	$out, $out, #16			// add \$-16, %rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	add	x8, x8, #64-16			// add \$-16, %r8
	and	x8, x8, #~(1<<6)		// and \$0x30, %r8
	st1	{v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr \$5,%eax
	add	w9, w9, #5		// \$5,%eax
	str	w9, [$out,#240]		// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	$dir, #0		// mov \$0,%ecx
	mov	x8, #0x30		// mov \$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr \$5,%eax
	add	w9, w9, #5		// \$5,%eax
	str	w9, [$out,#240]		// mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl \$4,%eax
	add	$out, $out, #16		// lea 16(%rdx,%rax),%rdx
	add	$out, $out, x9

	mov	$dir, #1		// mov \$1,%ecx
	lsr	w8, $bits, #1		// shr \$1,%r8d
	and	x8, x8, #32		// and \$32,%r8d
	eor	x8, x8, #32		// xor \$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
___
}
{
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));

$code.=<<___;
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	$len, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign

	ld1	{v0.16b}, [$ivec]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [$inp],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [$ivec]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!
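	// (d8-d15 are callee-saved by the AArch64 procedure call standard;
	// they are spilled here because _vpaes_decrypt_2x uses v8-v15 as
	// working registers)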

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	ld1	{v6.16b}, [$ivec]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [$inp], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [$out], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [$ivec]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
___
if (1) {
$code.=<<___;
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
___
}	}
print $code;

close STDOUT or die "error closing STDOUT: $!";