1#! /usr/bin/env perl 2# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10###################################################################### 11## Constant-time SSSE3 AES core implementation. 12## version 0.1 13## 14## By Mike Hamburg (Stanford University), 2009 15## Public domain. 16## 17## For details see http://shiftleft.org/papers/vector_aes/ and 18## http://crypto.stanford.edu/vpaes/. 19 20# CBC encrypt/decrypt performance in cycles per byte processed with 21# 128-bit key. 22# 23# aes-ppc.pl this 24# PPC74x0/G4e 35.5/52.1/(23.8) 11.9(*)/15.4 25# PPC970/G5 37.9/55.0/(28.5) 22.2/28.5 26# POWER6 42.7/54.3/(28.2) 63.0/92.8(**) 27# POWER7 32.3/42.9/(18.4) 18.5/23.3 28# 29# (*) This is ~10% worse than reported in paper. The reason is 30# twofold. This module doesn't make any assumption about 31# key schedule (or data for that matter) alignment and handles 32# it in-line. Secondly it, being transliterated from 33# vpaes-x86_64.pl, relies on "nested inversion" better suited 34# for Intel CPUs. 35# (**) Inadequate POWER6 performance is due to astronomic AltiVec 36# latency, 9 cycles per simple logical operation. 37 38# $output is the last argument if it looks like a file (it has an extension) 39# $flavour is the first argument if it doesn't look like a file 40$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 41$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 42 43if ($flavour =~ /64/) { 44 $SIZE_T =8; 45 $LRSAVE =2*$SIZE_T; 46 $STU ="stdu"; 47 $POP ="ld"; 48 $PUSH ="std"; 49 $UCMP ="cmpld"; 50} elsif ($flavour =~ /32/) { 51 $SIZE_T =4; 52 $LRSAVE =$SIZE_T; 53 $STU ="stwu"; 54 $POP ="lwz"; 55 $PUSH ="stw"; 56 $UCMP ="cmplw"; 57} else { die "nonsense $flavour"; } 58 59$sp="r1"; 60$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload 61 62$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 63( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 64( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 65die "can't locate ppc-xlate.pl"; 66 67open STDOUT,"| $^X $xlate $flavour \"$output\"" 68 || die "can't call $xlate: $!"; 69 70$code.=<<___; 71.machine "any" 72 73.text 74 75.align 7 # totally strategic alignment 76_vpaes_consts: 77Lk_mc_forward: # mc_forward 78 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv 79 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv 80 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv 81 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv 82Lk_mc_backward: # mc_backward 83 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv 84 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv 85 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv 86 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv 87Lk_sr: # sr 88 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv 89 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv 90 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv 91 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv 92 93## 94## "Hot" constants 95## 96Lk_inv: # inv, inva 97 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev 98 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev 99Lk_ipt: # input transform (lo, hi) 100 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev 101 
.long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev 102Lk_sbo: # sbou, sbot 103 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev 104 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev 105Lk_sb1: # sb1u, sb1t 106 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev 107 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev 108Lk_sb2: # sb2u, sb2t 109 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev 110 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev 111 112## 113## Decryption stuff 114## 115Lk_dipt: # decryption input transform 116 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev 117 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev 118Lk_dsbo: # decryption sbox final output 119 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev 120 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev 121Lk_dsb9: # decryption sbox output *9*u, *9*t 122 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev 123 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev 124Lk_dsbd: # decryption sbox output *D*u, *D*t 125 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev 126 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev 127Lk_dsbb: # decryption sbox output *B*u, *B*t 128 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev 129 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev 130Lk_dsbe: # decryption sbox output *E*u, *E*t 131 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev 132 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev 133 134## 135## Key schedule constants 136## 137Lk_dksd: # decryption key schedule: invskew x*D 138 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev 139 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev 140Lk_dksb: # decryption key schedule: invskew x*B 141 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev 142 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev 143Lk_dkse: # decryption key schedule: invskew x*E + 0x63 144 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev 145 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev 146Lk_dks9: # decryption key schedule: invskew x*9 147 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev 148 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev 149 150Lk_rcon: # rcon 151 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis 152Lk_s63: 153 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis 154 155Lk_opt: # output transform 156 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev 157 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev 158Lk_deskew: # deskew tables: inverts the sbox's "skew" 159 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev 160 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev 161.align 5 162Lconsts: 163 mflr r0 164 bcl 20,31,\$+4 165 mflr r12 #vvvvv "distance between . 
and _vpaes_consts 166 addi r12,r12,-0x308 167 mtlr r0 168 blr 169 .long 0 170 .byte 0,12,0x14,0,0,0,0,0 171.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)" 172.align 6 173___ 174 175my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31)); 176{ 177my ($inp,$out,$key) = map("r$_",(3..5)); 178 179my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15)); 180my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19)); 181my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23)); 182 183$code.=<<___; 184## 185## _aes_preheat 186## 187## Fills register %r10 -> .aes_consts (so you can -fPIC) 188## and %xmm9-%xmm15 as specified below. 189## 190.align 4 191_vpaes_encrypt_preheat: 192 mflr r8 193 bl Lconsts 194 mtlr r8 195 li r11, 0xc0 # Lk_inv 196 li r10, 0xd0 197 li r9, 0xe0 # Lk_ipt 198 li r8, 0xf0 199 vxor v7, v7, v7 # 0x00..00 200 vspltisb v8,4 # 0x04..04 201 vspltisb v9,0x0f # 0x0f..0f 202 lvx $invlo, r12, r11 203 li r11, 0x100 204 lvx $invhi, r12, r10 205 li r10, 0x110 206 lvx $iptlo, r12, r9 207 li r9, 0x120 208 lvx $ipthi, r12, r8 209 li r8, 0x130 210 lvx $sbou, r12, r11 211 li r11, 0x140 212 lvx $sbot, r12, r10 213 li r10, 0x150 214 lvx $sb1u, r12, r9 215 lvx $sb1t, r12, r8 216 lvx $sb2u, r12, r11 217 lvx $sb2t, r12, r10 218 blr 219 .long 0 220 .byte 0,12,0x14,0,0,0,0,0 221 222## 223## _aes_encrypt_core 224## 225## AES-encrypt %xmm0. 226## 227## Inputs: 228## %xmm0 = input 229## %xmm9-%xmm15 as in _vpaes_preheat 230## (%rdx) = scheduled keys 231## 232## Output in %xmm0 233## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax 234## 235## 236.align 5 237_vpaes_encrypt_core: 238 lwz r8, 240($key) # pull rounds 239 li r9, 16 240 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key 241 li r11, 0x10 242 lvx v6, r9, $key 243 addi r9, r9, 16 244 ?vperm v5, v5, v6, $keyperm # align round key 245 addi r10, r11, 0x40 246 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 247 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1 248 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2 249 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0 250 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0 251 mtctr r8 252 b Lenc_entry 253 254.align 4 255Lenc_loop: 256 # middle of middle round 257 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 258 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 259 addi r11, r11, 16 260 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 261 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 262 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... 
mod 4 263 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 264 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A 265 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 266 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 267 addi r10, r11, 0x40 268 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 269 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 270 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 271 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 272 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 273 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 274 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 275 276Lenc_entry: 277 # top of round 278 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 279 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 280 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 281 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 282 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 283 vand v0, v0, v9 284 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 285 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 286 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 287 vmr v5, v6 288 lvx v6, r9, $key # vmovdqu (%r9), %xmm5 289 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 290 addi r9, r9, 16 291 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io 292 ?vperm v5, v5, v6, $keyperm # align round key 293 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 294 bdnz Lenc_loop 295 296 # middle of last round 297 addi r10, r11, 0x80 298 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 299 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 300 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 301 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 302 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 303 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 304 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A 305 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0 306 blr 307 .long 0 308 .byte 0,12,0x14,0,0,0,0,0 309 310.globl .vpaes_encrypt 311.align 5 312.vpaes_encrypt: 313 $STU $sp,-$FRAME($sp) 314 li r10,`15+6*$SIZE_T` 315 li r11,`31+6*$SIZE_T` 316 mflr r6 317 mfspr r7, 256 # save vrsave 318 stvx v20,r10,$sp 319 addi r10,r10,32 320 stvx v21,r11,$sp 321 addi r11,r11,32 322 stvx v22,r10,$sp 323 addi r10,r10,32 324 stvx v23,r11,$sp 325 addi r11,r11,32 326 stvx v24,r10,$sp 327 addi r10,r10,32 328 stvx v25,r11,$sp 329 addi r11,r11,32 330 stvx v26,r10,$sp 331 addi r10,r10,32 332 stvx v27,r11,$sp 333 addi r11,r11,32 334 stvx v28,r10,$sp 335 addi r10,r10,32 336 stvx v29,r11,$sp 337 addi r11,r11,32 338 stvx v30,r10,$sp 339 stvx v31,r11,$sp 340 stw r7,`$FRAME-4`($sp) # save vrsave 341 li r0, -1 342 $PUSH r6,`$FRAME+$LRSAVE`($sp) 343 mtspr 256, r0 # preserve all AltiVec registers 344 345 bl _vpaes_encrypt_preheat 346 347 ?lvsl $inpperm, 0, $inp # prepare for unaligned access 348 lvx v0, 0, $inp 349 addi $inp, $inp, 15 # 15 is not a typo 350 ?lvsr $outperm, 0, $out 351 ?lvsl $keyperm, 0, $key # prepare for unaligned access 352 lvx $inptail, 0, $inp # redundant in aligned case 353 ?vperm v0, v0, $inptail, $inpperm 354 355 bl _vpaes_encrypt_core 356 357 andi. 
r8, $out, 15 358 li r9, 16 359 beq Lenc_out_aligned 360 361 vperm v0, v0, v0, $outperm # rotate right/left 362 mtctr r9 363Lenc_out_unaligned: 364 stvebx v0, 0, $out 365 addi $out, $out, 1 366 bdnz Lenc_out_unaligned 367 b Lenc_done 368 369.align 4 370Lenc_out_aligned: 371 stvx v0, 0, $out 372Lenc_done: 373 374 li r10,`15+6*$SIZE_T` 375 li r11,`31+6*$SIZE_T` 376 mtlr r6 377 mtspr 256, r7 # restore vrsave 378 lvx v20,r10,$sp 379 addi r10,r10,32 380 lvx v21,r11,$sp 381 addi r11,r11,32 382 lvx v22,r10,$sp 383 addi r10,r10,32 384 lvx v23,r11,$sp 385 addi r11,r11,32 386 lvx v24,r10,$sp 387 addi r10,r10,32 388 lvx v25,r11,$sp 389 addi r11,r11,32 390 lvx v26,r10,$sp 391 addi r10,r10,32 392 lvx v27,r11,$sp 393 addi r11,r11,32 394 lvx v28,r10,$sp 395 addi r10,r10,32 396 lvx v29,r11,$sp 397 addi r11,r11,32 398 lvx v30,r10,$sp 399 lvx v31,r11,$sp 400 addi $sp,$sp,$FRAME 401 blr 402 .long 0 403 .byte 0,12,0x04,1,0x80,0,3,0 404 .long 0 405.size .vpaes_encrypt,.-.vpaes_encrypt 406 407.align 4 408_vpaes_decrypt_preheat: 409 mflr r8 410 bl Lconsts 411 mtlr r8 412 li r11, 0xc0 # Lk_inv 413 li r10, 0xd0 414 li r9, 0x160 # Ldipt 415 li r8, 0x170 416 vxor v7, v7, v7 # 0x00..00 417 vspltisb v8,4 # 0x04..04 418 vspltisb v9,0x0f # 0x0f..0f 419 lvx $invlo, r12, r11 420 li r11, 0x180 421 lvx $invhi, r12, r10 422 li r10, 0x190 423 lvx $iptlo, r12, r9 424 li r9, 0x1a0 425 lvx $ipthi, r12, r8 426 li r8, 0x1b0 427 lvx $sbou, r12, r11 428 li r11, 0x1c0 429 lvx $sbot, r12, r10 430 li r10, 0x1d0 431 lvx $sb9u, r12, r9 432 li r9, 0x1e0 433 lvx $sb9t, r12, r8 434 li r8, 0x1f0 435 lvx $sbdu, r12, r11 436 li r11, 0x200 437 lvx $sbdt, r12, r10 438 li r10, 0x210 439 lvx $sbbu, r12, r9 440 lvx $sbbt, r12, r8 441 lvx $sbeu, r12, r11 442 lvx $sbet, r12, r10 443 blr 444 .long 0 445 .byte 0,12,0x14,0,0,0,0,0 446 447## 448## Decryption core 449## 450## Same API as encryption core. 451## 452.align 4 453_vpaes_decrypt_core: 454 lwz r8, 240($key) # pull rounds 455 li r9, 16 456 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key 457 li r11, 0x30 458 lvx v6, r9, $key 459 addi r9, r9, 16 460 ?vperm v5, v5, v6, $keyperm # align round key 461 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 462 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 463 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0 464 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2 465 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0 466 mtctr r8 467 b Ldec_entry 468 469.align 4 470Ldec_loop: 471# 472# Inverse mix columns 473# 474 lvx v0, r12, r11 # v5 and v0 are flipped 475 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 476 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 477 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 478 subi r11, r11, 16 479 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 480 andi. 
r11, r11, 0x30 481 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 482 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 483 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 484 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 485 486 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 487 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch 488 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 489 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 490 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 491 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 492 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 493 494 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 495 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch 496 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 497 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 498 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 499 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 500 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 501 502 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 503 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch 504 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 505 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 506 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 507 508Ldec_entry: 509 # top of round 510 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 511 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 512 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 513 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 514 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 515 vand v0, v0, v9 516 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 517 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 518 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 519 vmr v5, v6 520 lvx v6, r9, $key # vmovdqu (%r9), %xmm0 521 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 522 addi r9, r9, 16 523 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io 524 ?vperm v5, v5, v6, $keyperm # align round key 525 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 526 bdnz Ldec_loop 527 528 # middle of last round 529 addi r10, r11, 0x80 530 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 531 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 532 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 533 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 534 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 535 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 536 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A 537 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0 538 blr 539 .long 0 540 .byte 0,12,0x14,0,0,0,0,0 541 542.globl .vpaes_decrypt 543.align 5 544.vpaes_decrypt: 545 $STU $sp,-$FRAME($sp) 546 li r10,`15+6*$SIZE_T` 547 li r11,`31+6*$SIZE_T` 548 mflr r6 549 mfspr r7, 256 # save vrsave 550 stvx v20,r10,$sp 551 addi r10,r10,32 552 stvx v21,r11,$sp 553 addi r11,r11,32 554 stvx v22,r10,$sp 555 addi r10,r10,32 556 stvx v23,r11,$sp 557 addi r11,r11,32 558 stvx v24,r10,$sp 559 addi r10,r10,32 560 stvx v25,r11,$sp 561 addi r11,r11,32 562 stvx v26,r10,$sp 563 addi r10,r10,32 564 stvx v27,r11,$sp 565 addi r11,r11,32 566 stvx v28,r10,$sp 567 addi r10,r10,32 568 stvx v29,r11,$sp 569 addi r11,r11,32 570 stvx v30,r10,$sp 571 stvx v31,r11,$sp 572 stw r7,`$FRAME-4`($sp) # save vrsave 573 li r0, -1 574 $PUSH 
r6,`$FRAME+$LRSAVE`($sp) 575 mtspr 256, r0 # preserve all AltiVec registers 576 577 bl _vpaes_decrypt_preheat 578 579 ?lvsl $inpperm, 0, $inp # prepare for unaligned access 580 lvx v0, 0, $inp 581 addi $inp, $inp, 15 # 15 is not a typo 582 ?lvsr $outperm, 0, $out 583 ?lvsl $keyperm, 0, $key 584 lvx $inptail, 0, $inp # redundant in aligned case 585 ?vperm v0, v0, $inptail, $inpperm 586 587 bl _vpaes_decrypt_core 588 589 andi. r8, $out, 15 590 li r9, 16 591 beq Ldec_out_aligned 592 593 vperm v0, v0, v0, $outperm # rotate right/left 594 mtctr r9 595Ldec_out_unaligned: 596 stvebx v0, 0, $out 597 addi $out, $out, 1 598 bdnz Ldec_out_unaligned 599 b Ldec_done 600 601.align 4 602Ldec_out_aligned: 603 stvx v0, 0, $out 604Ldec_done: 605 606 li r10,`15+6*$SIZE_T` 607 li r11,`31+6*$SIZE_T` 608 mtlr r6 609 mtspr 256, r7 # restore vrsave 610 lvx v20,r10,$sp 611 addi r10,r10,32 612 lvx v21,r11,$sp 613 addi r11,r11,32 614 lvx v22,r10,$sp 615 addi r10,r10,32 616 lvx v23,r11,$sp 617 addi r11,r11,32 618 lvx v24,r10,$sp 619 addi r10,r10,32 620 lvx v25,r11,$sp 621 addi r11,r11,32 622 lvx v26,r10,$sp 623 addi r10,r10,32 624 lvx v27,r11,$sp 625 addi r11,r11,32 626 lvx v28,r10,$sp 627 addi r10,r10,32 628 lvx v29,r11,$sp 629 addi r11,r11,32 630 lvx v30,r10,$sp 631 lvx v31,r11,$sp 632 addi $sp,$sp,$FRAME 633 blr 634 .long 0 635 .byte 0,12,0x04,1,0x80,0,3,0 636 .long 0 637.size .vpaes_decrypt,.-.vpaes_decrypt 638 639.globl .vpaes_cbc_encrypt 640.align 5 641.vpaes_cbc_encrypt: 642 ${UCMP}i r5,16 643 bltlr- 644 645 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp) 646 mflr r0 647 li r10,`15+6*$SIZE_T` 648 li r11,`31+6*$SIZE_T` 649 mfspr r12, 256 650 stvx v20,r10,$sp 651 addi r10,r10,32 652 stvx v21,r11,$sp 653 addi r11,r11,32 654 stvx v22,r10,$sp 655 addi r10,r10,32 656 stvx v23,r11,$sp 657 addi r11,r11,32 658 stvx v24,r10,$sp 659 addi r10,r10,32 660 stvx v25,r11,$sp 661 addi r11,r11,32 662 stvx v26,r10,$sp 663 addi r10,r10,32 664 stvx v27,r11,$sp 665 addi r11,r11,32 666 stvx v28,r10,$sp 667 addi r10,r10,32 668 stvx v29,r11,$sp 669 addi r11,r11,32 670 stvx v30,r10,$sp 671 stvx v31,r11,$sp 672 stw r12,`$FRAME-4`($sp) # save vrsave 673 $PUSH r30,`$FRAME+$SIZE_T*0`($sp) 674 $PUSH r31,`$FRAME+$SIZE_T*1`($sp) 675 li r9, -16 676 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) 677 678 and r30, r5, r9 # copy length&-16 679 andi. r9, $out, 15 # is $out aligned? 680 mr r5, r6 # copy pointer to key 681 mr r31, r7 # copy pointer to iv 682 li r6, -1 683 mcrf cr1, cr0 # put aside $out alignment flag 684 mr r7, r12 # copy vrsave 685 mtspr 256, r6 # preserve all AltiVec registers 686 687 lvx v24, 0, r31 # load [potentially unaligned] iv 688 li r9, 15 689 ?lvsl $inpperm, 0, r31 690 lvx v25, r9, r31 691 ?vperm v24, v24, v25, $inpperm 692 693 cmpwi r8, 0 # test direction 694 neg r8, $inp # prepare for unaligned access 695 vxor v7, v7, v7 696 ?lvsl $keyperm, 0, $key 697 ?lvsr $outperm, 0, $out 698 ?lvsr $inpperm, 0, r8 # -$inp 699 vnor $outmask, v7, v7 # 0xff..ff 700 lvx $inptail, 0, $inp 701 ?vperm $outmask, v7, $outmask, $outperm 702 addi $inp, $inp, 15 # 15 is not a typo 703 704 beq Lcbc_decrypt 705 706 bl _vpaes_encrypt_preheat 707 li r0, 16 708 709 beq cr1, Lcbc_enc_loop # $out is aligned 710 711 vmr v0, $inptail 712 lvx $inptail, 0, $inp 713 addi $inp, $inp, 16 714 ?vperm v0, v0, $inptail, $inpperm 715 vxor v0, v0, v24 # ^= iv 716 717 bl _vpaes_encrypt_core 718 719 andi. 
r8, $out, 15 720 vmr v24, v0 # put aside iv 721 sub r9, $out, r8 722 vperm $outhead, v0, v0, $outperm # rotate right/left 723 724Lcbc_enc_head: 725 stvebx $outhead, r8, r9 726 cmpwi r8, 15 727 addi r8, r8, 1 728 bne Lcbc_enc_head 729 730 sub. r30, r30, r0 # len -= 16 731 addi $out, $out, 16 732 beq Lcbc_unaligned_done 733 734Lcbc_enc_loop: 735 vmr v0, $inptail 736 lvx $inptail, 0, $inp 737 addi $inp, $inp, 16 738 ?vperm v0, v0, $inptail, $inpperm 739 vxor v0, v0, v24 # ^= iv 740 741 bl _vpaes_encrypt_core 742 743 vmr v24, v0 # put aside iv 744 sub. r30, r30, r0 # len -= 16 745 vperm v0, v0, v0, $outperm # rotate right/left 746 vsel v1, $outhead, v0, $outmask 747 vmr $outhead, v0 748 stvx v1, 0, $out 749 addi $out, $out, 16 750 bne Lcbc_enc_loop 751 752 b Lcbc_done 753 754.align 5 755Lcbc_decrypt: 756 bl _vpaes_decrypt_preheat 757 li r0, 16 758 759 beq cr1, Lcbc_dec_loop # $out is aligned 760 761 vmr v0, $inptail 762 lvx $inptail, 0, $inp 763 addi $inp, $inp, 16 764 ?vperm v0, v0, $inptail, $inpperm 765 vmr v25, v0 # put aside input 766 767 bl _vpaes_decrypt_core 768 769 andi. r8, $out, 15 770 vxor v0, v0, v24 # ^= iv 771 vmr v24, v25 772 sub r9, $out, r8 773 vperm $outhead, v0, v0, $outperm # rotate right/left 774 775Lcbc_dec_head: 776 stvebx $outhead, r8, r9 777 cmpwi r8, 15 778 addi r8, r8, 1 779 bne Lcbc_dec_head 780 781 sub. r30, r30, r0 # len -= 16 782 addi $out, $out, 16 783 beq Lcbc_unaligned_done 784 785Lcbc_dec_loop: 786 vmr v0, $inptail 787 lvx $inptail, 0, $inp 788 addi $inp, $inp, 16 789 ?vperm v0, v0, $inptail, $inpperm 790 vmr v25, v0 # put aside input 791 792 bl _vpaes_decrypt_core 793 794 vxor v0, v0, v24 # ^= iv 795 vmr v24, v25 796 sub. r30, r30, r0 # len -= 16 797 vperm v0, v0, v0, $outperm # rotate right/left 798 vsel v1, $outhead, v0, $outmask 799 vmr $outhead, v0 800 stvx v1, 0, $out 801 addi $out, $out, 16 802 bne Lcbc_dec_loop 803 804Lcbc_done: 805 beq cr1, Lcbc_write_iv # $out is aligned 806 807Lcbc_unaligned_done: 808 andi. 
r8, $out, 15 809 sub $out, $out, r8 810 li r9, 0 811Lcbc_tail: 812 stvebx $outhead, r9, $out 813 addi r9, r9, 1 814 cmpw r9, r8 815 bne Lcbc_tail 816 817Lcbc_write_iv: 818 neg r8, r31 # write [potentially unaligned] iv 819 li r10, 4 820 ?lvsl $outperm, 0, r8 821 li r11, 8 822 li r12, 12 823 vperm v24, v24, v24, $outperm # rotate right/left 824 stvewx v24, 0, r31 # ivp is at least 32-bit aligned 825 stvewx v24, r10, r31 826 stvewx v24, r11, r31 827 stvewx v24, r12, r31 828 829 mtspr 256, r7 # restore vrsave 830 li r10,`15+6*$SIZE_T` 831 li r11,`31+6*$SIZE_T` 832 lvx v20,r10,$sp 833 addi r10,r10,32 834 lvx v21,r11,$sp 835 addi r11,r11,32 836 lvx v22,r10,$sp 837 addi r10,r10,32 838 lvx v23,r11,$sp 839 addi r11,r11,32 840 lvx v24,r10,$sp 841 addi r10,r10,32 842 lvx v25,r11,$sp 843 addi r11,r11,32 844 lvx v26,r10,$sp 845 addi r10,r10,32 846 lvx v27,r11,$sp 847 addi r11,r11,32 848 lvx v28,r10,$sp 849 addi r10,r10,32 850 lvx v29,r11,$sp 851 addi r11,r11,32 852 lvx v30,r10,$sp 853 lvx v31,r11,$sp 854Lcbc_abort: 855 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp) 856 $POP r30,`$FRAME+$SIZE_T*0`($sp) 857 $POP r31,`$FRAME+$SIZE_T*1`($sp) 858 mtlr r0 859 addi $sp,$sp,`$FRAME+$SIZE_T*2` 860 blr 861 .long 0 862 .byte 0,12,0x04,1,0x80,2,6,0 863 .long 0 864.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt 865___ 866} 867{ 868my ($inp,$bits,$out)=map("r$_",(3..5)); 869my $dir="cr1"; 870my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24)); 871 872$code.=<<___; 873######################################################## 874## ## 875## AES key schedule ## 876## ## 877######################################################## 878.align 4 879_vpaes_key_preheat: 880 mflr r8 881 bl Lconsts 882 mtlr r8 883 li r11, 0xc0 # Lk_inv 884 li r10, 0xd0 885 li r9, 0xe0 # L_ipt 886 li r8, 0xf0 887 888 vspltisb v8,4 # 0x04..04 889 vxor v9,v9,v9 # 0x00..00 890 lvx $invlo, r12, r11 # Lk_inv 891 li r11, 0x120 892 lvx $invhi, r12, r10 893 li r10, 0x130 894 lvx $iptlo, r12, r9 # Lk_ipt 895 li r9, 0x220 896 lvx $ipthi, r12, r8 897 li r8, 0x230 898 899 lvx v14, r12, r11 # Lk_sb1 900 li r11, 0x240 901 lvx v15, r12, r10 902 li r10, 0x250 903 904 lvx v16, r12, r9 # Lk_dksd 905 li r9, 0x260 906 lvx v17, r12, r8 907 li r8, 0x270 908 lvx v18, r12, r11 # Lk_dksb 909 li r11, 0x280 910 lvx v19, r12, r10 911 li r10, 0x290 912 lvx v20, r12, r9 # Lk_dkse 913 li r9, 0x2a0 914 lvx v21, r12, r8 915 li r8, 0x2b0 916 lvx v22, r12, r11 # Lk_dks9 917 lvx v23, r12, r10 918 919 lvx v24, r12, r9 # Lk_rcon 920 lvx v25, 0, r12 # Lk_mc_forward[0] 921 lvx v26, r12, r8 # Lks63 922 blr 923 .long 0 924 .byte 0,12,0x14,0,0,0,0,0 925 926.align 4 927_vpaes_schedule_core: 928 mflr r7 929 930 bl _vpaes_key_preheat # load the tables 931 932 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned) 933 neg r8, $inp # prepare for unaligned access 934 lvx v0, 0, $inp 935 addi $inp, $inp, 15 # 15 is not typo 936 ?lvsr $inpperm, 0, r8 # -$inp 937 lvx v6, 0, $inp # v6 serves as inptail 938 addi $inp, $inp, 8 939 ?vperm v0, v0, v6, $inpperm 940 941 # input transform 942 vmr v3, v0 # vmovdqa %xmm0, %xmm3 943 bl _vpaes_schedule_transform 944 vmr v7, v0 # vmovdqa %xmm0, %xmm7 945 946 bne $dir, Lschedule_am_decrypting 947 948 # encrypting, output zeroth round key after transform 949 li r8, 0x30 # mov \$0x30,%r8d 950 li r9, 4 951 li r10, 8 952 li r11, 12 953 954 ?lvsr $outperm, 0, $out # prepare for unaligned access 955 vnor $outmask, v9, v9 # 0xff..ff 956 ?vperm $outmask, v9, $outmask, $outperm 957 958 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx) 959 vperm $outhead, v0, v0, 
$outperm # rotate right/left 960 stvewx $outhead, 0, $out # some are superfluous 961 stvewx $outhead, r9, $out 962 stvewx $outhead, r10, $out 963 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 964 stvewx $outhead, r11, $out 965 b Lschedule_go 966 967Lschedule_am_decrypting: 968 srwi r8, $bits, 1 # shr \$1,%r8d 969 andi. r8, r8, 32 # and \$32,%r8d 970 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 971 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 972 # decrypting, output zeroth round key after shiftrows 973 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 974 li r9, 4 975 li r10, 8 976 li r11, 12 977 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 978 979 neg r0, $out # prepare for unaligned access 980 ?lvsl $outperm, 0, r0 981 vnor $outmask, v9, v9 # 0xff..ff 982 ?vperm $outmask, $outmask, v9, $outperm 983 984 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx) 985 vperm $outhead, v4, v4, $outperm # rotate right/left 986 stvewx $outhead, 0, $out # some are superfluous 987 stvewx $outhead, r9, $out 988 stvewx $outhead, r10, $out 989 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10 990 stvewx $outhead, r11, $out 991 addi $out, $out, 15 # 15 is not a typo 992 xori r8, r8, 0x30 # xor \$0x30, %r8 993 994Lschedule_go: 995 cmplwi $bits, 192 # cmp \$192, %esi 996 bgt Lschedule_256 997 beq Lschedule_192 998 # 128: fall through 999 1000## 1001## .schedule_128 1002## 1003## 128-bit specific part of key schedule. 1004## 1005## This schedule is really simple, because all its parts 1006## are accomplished by the subroutines. 1007## 1008Lschedule_128: 1009 li r0, 10 # mov \$10, %esi 1010 mtctr r0 1011 1012Loop_schedule_128: 1013 bl _vpaes_schedule_round 1014 bdz Lschedule_mangle_last # dec %esi 1015 bl _vpaes_schedule_mangle # write output 1016 b Loop_schedule_128 1017 1018## 1019## .aes_schedule_192 1020## 1021## 192-bit specific part of key schedule. 1022## 1023## The main body of this schedule is the same as the 128-bit 1024## schedule, but with more smearing. The long, high side is 1025## stored in %xmm7 as before, and the short, low side is in 1026## the high bits of %xmm6. 1027## 1028## This schedule is somewhat nastier, however, because each 1029## round produces 192 bits of key material, or 1.5 round keys. 1030## Therefore, on each cycle we do 2 rounds and produce 3 round 1031## keys. 1032## 1033.align 4 1034Lschedule_192: 1035 li r0, 4 # mov \$4, %esi 1036 lvx v0, 0, $inp 1037 ?vperm v0, v6, v0, $inpperm 1038 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 1039 bl _vpaes_schedule_transform # input transform 1040 ?vsldoi v6, v0, v9, 8 1041 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros 1042 mtctr r0 1043 1044Loop_schedule_192: 1045 bl _vpaes_schedule_round 1046 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0 1047 bl _vpaes_schedule_mangle # save key n 1048 bl _vpaes_schedule_192_smear 1049 bl _vpaes_schedule_mangle # save key n+1 1050 bl _vpaes_schedule_round 1051 bdz Lschedule_mangle_last # dec %esi 1052 bl _vpaes_schedule_mangle # save key n+2 1053 bl _vpaes_schedule_192_smear 1054 b Loop_schedule_192 1055 1056## 1057## .aes_schedule_256 1058## 1059## 256-bit specific part of key schedule. 1060## 1061## The structure here is very similar to the 128-bit 1062## schedule, but with an additional "low side" in 1063## %xmm6. The low side's rounds are the same as the 1064## high side's, except no rcon and no rotation.
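## (The loop counter below is set to 7: the first six passes through
## Loop_schedule_256 each write two round keys, and the seventh writes one
## more before branching to Lschedule_mangle_last for the last one; together
## with the round-0 key stored above that gives the 15 round keys of AES-256.)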
1065## 1066.align 4 1067Lschedule_256: 1068 li r0, 7 # mov \$7, %esi 1069 addi $inp, $inp, 8 1070 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 1071 ?vperm v0, v6, v0, $inpperm 1072 bl _vpaes_schedule_transform # input transform 1073 mtctr r0 1074 1075Loop_schedule_256: 1076 bl _vpaes_schedule_mangle # output low result 1077 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 1078 1079 # high round 1080 bl _vpaes_schedule_round 1081 bdz Lschedule_mangle_last # dec %esi 1082 bl _vpaes_schedule_mangle 1083 1084 # low round. swap xmm7 and xmm6 1085 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 1086 vmr v5, v7 # vmovdqa %xmm7, %xmm5 1087 vmr v7, v6 # vmovdqa %xmm6, %xmm7 1088 bl _vpaes_schedule_low_round 1089 vmr v7, v5 # vmovdqa %xmm5, %xmm7 1090 1091 b Loop_schedule_256 1092## 1093## .aes_schedule_mangle_last 1094## 1095## Mangler for last round of key schedule 1096## Mangles %xmm0 1097## when encrypting, outputs out(%xmm0) ^ 63 1098## when decrypting, outputs unskew(%xmm0) 1099## 1100## Always called right before return... jumps to cleanup and exits 1101## 1102.align 4 1103Lschedule_mangle_last: 1104 # schedule last round key from xmm0 1105 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11 1106 li r9, 0x2f0 1107 bne $dir, Lschedule_mangle_last_dec 1108 1109 # encrypting 1110 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1 1111 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform 1112 li r9, 0x2d0 # prepare to output transform 1113 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute 1114 1115 lvx $iptlo, r11, r12 # reload $ipt 1116 lvx $ipthi, r9, r12 1117 addi $out, $out, 16 # add \$16, %rdx 1118 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 1119 bl _vpaes_schedule_transform # output transform 1120 1121 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key 1122 vperm v0, v0, v0, $outperm # rotate right/left 1123 li r10, 4 1124 vsel v2, $outhead, v0, $outmask 1125 li r11, 8 1126 stvx v2, 0, $out 1127 li r12, 12 1128 stvewx v0, 0, $out # some (or all) are redundant 1129 stvewx v0, r10, $out 1130 stvewx v0, r11, $out 1131 stvewx v0, r12, $out 1132 b Lschedule_mangle_done 1133 1134.align 4 1135Lschedule_mangle_last_dec: 1136 lvx $iptlo, r11, r12 # reload $ipt 1137 lvx $ipthi, r9, r12 1138 addi $out, $out, -16 # add \$-16, %rdx 1139 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0 1140 bl _vpaes_schedule_transform # output transform 1141 1142 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key 1143 addi r9, $out, -15 # -15 is not typo 1144 vperm v0, v0, v0, $outperm # rotate right/left 1145 li r10, 4 1146 vsel v2, $outhead, v0, $outmask 1147 li r11, 8 1148 stvx v2, 0, $out 1149 li r12, 12 1150 stvewx v0, 0, r9 # some (or all) are redundant 1151 stvewx v0, r10, r9 1152 stvewx v0, r11, r9 1153 stvewx v0, r12, r9 1154 1155 1156Lschedule_mangle_done: 1157 mtlr r7 1158 # cleanup 1159 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0 1160 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1 1161 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2 1162 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3 1163 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 1164 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5 1165 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6 1166 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7 1167 1168 blr 1169 .long 0 1170 .byte 0,12,0x14,0,0,0,0,0 1171 1172## 1173## .aes_schedule_192_smear 1174## 1175## Smear the short, low side in the 192-bit key schedule. 
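## (This helper is called only from Loop_schedule_192 above; as noted in the
## .aes_schedule_192 comment, each pass through that loop runs two rounds and
## produces three round keys.)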
1176## 1177## Inputs: 1178## %xmm7: high side, b a x y 1179## %xmm6: low side, d c 0 0 1180## %xmm13: 0 1181## 1182## Outputs: 1183## %xmm6: b+c+d b+c 0 0 1184## %xmm0: b+c+d b+c b a 1185## 1186.align 4 1187_vpaes_schedule_192_smear: 1188 ?vspltw v0, v7, 3 1189 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 1190 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 1191 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 1192 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 1193 vmr v0, v6 1194 ?vsldoi v6, v6, v9, 8 1195 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros 1196 blr 1197 .long 0 1198 .byte 0,12,0x14,0,0,0,0,0 1199 1200## 1201## .aes_schedule_round 1202## 1203## Runs one main round of the key schedule on %xmm0, %xmm7 1204## 1205## Specifically, runs subbytes on the high dword of %xmm0 1206## then rotates it by one byte and xors into the low dword of 1207## %xmm7. 1208## 1209## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 1210## next rcon. 1211## 1212## Smears the dwords of %xmm7 by xoring the low into the 1213## second low, result into third, result into highest. 1214## 1215## Returns results in %xmm7 = %xmm0. 1216## Clobbers %xmm1-%xmm4, %r11. 1217## 1218.align 4 1219_vpaes_schedule_round: 1220 # extract rcon from xmm8 1221 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4 1222 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1 1223 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8 1224 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 1225 1226 # rotate 1227 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0 1228 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0 1229 1230 # fall through... 1231 1232 # low round: same as high round, but no rotation and no rcon. 1233_vpaes_schedule_low_round: 1234 # smear xmm7 1235 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1 1236 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7 1237 vspltisb v1, 0x0f # 0x0f..0f 1238 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4 1239 1240 # subbytes 1241 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k 1242 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i 1243 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7 1244 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 1245 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j 1246 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 1247 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 1248 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 1249 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7 1250 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 1251 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 1252 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 1253 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io 1254 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 1255 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 1256 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 1257 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 1258 1259 # add in smeared stuff 1260 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0 1261 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7 1262 blr 1263 .long 0 1264 .byte 0,12,0x14,0,0,0,0,0 1265 1266## 1267## .aes_schedule_transform 1268## 1269## Linear-transform %xmm0 according to tables at (%r11) 1270## 1271## Requires that %xmm9 = 0x0F0F... 
as in preheat 1272## Output in %xmm0 1273## Clobbers %xmm2 1274## 1275.align 4 1276_vpaes_schedule_transform: 1277 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1 1278 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 1279 # vmovdqa (%r11), %xmm2 # lo 1280 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2 1281 # vmovdqa 16(%r11), %xmm1 # hi 1282 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0 1283 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0 1284 blr 1285 .long 0 1286 .byte 0,12,0x14,0,0,0,0,0 1287 1288## 1289## .aes_schedule_mangle 1290## 1291## Mangle xmm0 from (basis-transformed) standard version 1292## to our version. 1293## 1294## On encrypt, 1295## xor with 0x63 1296## multiply by circulant 0,1,1,1 1297## apply shiftrows transform 1298## 1299## On decrypt, 1300## xor with 0x63 1301## multiply by "inverse mixcolumns" circulant E,B,D,9 1302## deskew 1303## apply shiftrows transform 1304## 1305## 1306## Writes out to (%rdx), and increments or decrements it 1307## Keeps track of round number mod 4 in %r8 1308## Preserves xmm0 1309## Clobbers xmm1-xmm5 1310## 1311.align 4 1312_vpaes_schedule_mangle: 1313 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later 1314 # vmovdqa .Lk_mc_forward(%rip),%xmm5 1315 bne $dir, Lschedule_mangle_dec 1316 1317 # encrypting 1318 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4 1319 addi $out, $out, 16 # add \$16, %rdx 1320 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4 1321 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1 1322 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3 1323 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4 1324 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 1325 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3 1326 1327 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 1328 addi r8, r8, -16 # add \$-16, %r8 1329 andi. 
r8, r8, 0x30 # and \$0x30, %r8 1330 1331 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) 1332 vperm v1, v3, v3, $outperm # rotate right/left 1333 vsel v2, $outhead, v1, $outmask 1334 vmr $outhead, v1 1335 stvx v2, 0, $out 1336 blr 1337 1338.align 4 1339Lschedule_mangle_dec: 1340 # inverse mix columns 1341 # lea .Lk_dksd(%rip),%r11 1342 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi 1343 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo 1344 1345 # vmovdqa 0x00(%r11), %xmm2 1346 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2 1347 # vmovdqa 0x10(%r11), %xmm3 1348 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3 1349 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 1350 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 1351 1352 # vmovdqa 0x20(%r11), %xmm2 1353 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2 1354 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 1355 # vmovdqa 0x30(%r11), %xmm3 1356 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3 1357 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 1358 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 1359 1360 # vmovdqa 0x40(%r11), %xmm2 1361 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2 1362 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 1363 # vmovdqa 0x50(%r11), %xmm3 1364 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3 1365 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 1366 1367 # vmovdqa 0x60(%r11), %xmm2 1368 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2 1369 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3 1370 # vmovdqa 0x70(%r11), %xmm4 1371 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4 1372 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1 1373 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2 1374 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3 1375 1376 addi $out, $out, -16 # add \$-16, %rdx 1377 1378 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3 1379 addi r8, r8, -16 # add \$-16, %r8 1380 andi. 
r8, r8, 0x30 # and \$0x30, %r8 1381 1382 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx) 1383 vperm v1, v3, v3, $outperm # rotate right/left 1384 vsel v2, $outhead, v1, $outmask 1385 vmr $outhead, v1 1386 stvx v2, 0, $out 1387 blr 1388 .long 0 1389 .byte 0,12,0x14,0,0,0,0,0 1390 1391.globl .vpaes_set_encrypt_key 1392.align 5 1393.vpaes_set_encrypt_key: 1394 $STU $sp,-$FRAME($sp) 1395 li r10,`15+6*$SIZE_T` 1396 li r11,`31+6*$SIZE_T` 1397 mflr r0 1398 mfspr r6, 256 # save vrsave 1399 stvx v20,r10,$sp 1400 addi r10,r10,32 1401 stvx v21,r11,$sp 1402 addi r11,r11,32 1403 stvx v22,r10,$sp 1404 addi r10,r10,32 1405 stvx v23,r11,$sp 1406 addi r11,r11,32 1407 stvx v24,r10,$sp 1408 addi r10,r10,32 1409 stvx v25,r11,$sp 1410 addi r11,r11,32 1411 stvx v26,r10,$sp 1412 addi r10,r10,32 1413 stvx v27,r11,$sp 1414 addi r11,r11,32 1415 stvx v28,r10,$sp 1416 addi r10,r10,32 1417 stvx v29,r11,$sp 1418 addi r11,r11,32 1419 stvx v30,r10,$sp 1420 stvx v31,r11,$sp 1421 stw r6,`$FRAME-4`($sp) # save vrsave 1422 li r7, -1 1423 $PUSH r0, `$FRAME+$LRSAVE`($sp) 1424 mtspr 256, r7 # preserve all AltiVec registers 1425 1426 srwi r9, $bits, 5 # shr \$5,%eax 1427 addi r9, r9, 6 # add \$5,%eax 1428 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1429 1430 cmplw $dir, $bits, $bits # set encrypt direction 1431 li r8, 0x30 # mov \$0x30,%r8d 1432 bl _vpaes_schedule_core 1433 1434 $POP r0, `$FRAME+$LRSAVE`($sp) 1435 li r10,`15+6*$SIZE_T` 1436 li r11,`31+6*$SIZE_T` 1437 mtspr 256, r6 # restore vrsave 1438 mtlr r0 1439 xor r3, r3, r3 1440 lvx v20,r10,$sp 1441 addi r10,r10,32 1442 lvx v21,r11,$sp 1443 addi r11,r11,32 1444 lvx v22,r10,$sp 1445 addi r10,r10,32 1446 lvx v23,r11,$sp 1447 addi r11,r11,32 1448 lvx v24,r10,$sp 1449 addi r10,r10,32 1450 lvx v25,r11,$sp 1451 addi r11,r11,32 1452 lvx v26,r10,$sp 1453 addi r10,r10,32 1454 lvx v27,r11,$sp 1455 addi r11,r11,32 1456 lvx v28,r10,$sp 1457 addi r10,r10,32 1458 lvx v29,r11,$sp 1459 addi r11,r11,32 1460 lvx v30,r10,$sp 1461 lvx v31,r11,$sp 1462 addi $sp,$sp,$FRAME 1463 blr 1464 .long 0 1465 .byte 0,12,0x04,1,0x80,0,3,0 1466 .long 0 1467.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key 1468 1469.globl .vpaes_set_decrypt_key 1470.align 4 1471.vpaes_set_decrypt_key: 1472 $STU $sp,-$FRAME($sp) 1473 li r10,`15+6*$SIZE_T` 1474 li r11,`31+6*$SIZE_T` 1475 mflr r0 1476 mfspr r6, 256 # save vrsave 1477 stvx v20,r10,$sp 1478 addi r10,r10,32 1479 stvx v21,r11,$sp 1480 addi r11,r11,32 1481 stvx v22,r10,$sp 1482 addi r10,r10,32 1483 stvx v23,r11,$sp 1484 addi r11,r11,32 1485 stvx v24,r10,$sp 1486 addi r10,r10,32 1487 stvx v25,r11,$sp 1488 addi r11,r11,32 1489 stvx v26,r10,$sp 1490 addi r10,r10,32 1491 stvx v27,r11,$sp 1492 addi r11,r11,32 1493 stvx v28,r10,$sp 1494 addi r10,r10,32 1495 stvx v29,r11,$sp 1496 addi r11,r11,32 1497 stvx v30,r10,$sp 1498 stvx v31,r11,$sp 1499 stw r6,`$FRAME-4`($sp) # save vrsave 1500 li r7, -1 1501 $PUSH r0, `$FRAME+$LRSAVE`($sp) 1502 mtspr 256, r7 # preserve all AltiVec registers 1503 1504 srwi r9, $bits, 5 # shr \$5,%eax 1505 addi r9, r9, 6 # add \$5,%eax 1506 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1507 1508 slwi r9, r9, 4 # shl \$4,%eax 1509 add $out, $out, r9 # lea (%rdx,%rax),%rdx 1510 1511 cmplwi $dir, $bits, 0 # set decrypt direction 1512 srwi r8, $bits, 1 # shr \$1,%r8d 1513 andi. 
r8, r8, 32 # and \$32,%r8d 1514 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32 1515 bl _vpaes_schedule_core 1516 1517 $POP r0, `$FRAME+$LRSAVE`($sp) 1518 li r10,`15+6*$SIZE_T` 1519 li r11,`31+6*$SIZE_T` 1520 mtspr 256, r6 # restore vrsave 1521 mtlr r0 1522 xor r3, r3, r3 1523 lvx v20,r10,$sp 1524 addi r10,r10,32 1525 lvx v21,r11,$sp 1526 addi r11,r11,32 1527 lvx v22,r10,$sp 1528 addi r10,r10,32 1529 lvx v23,r11,$sp 1530 addi r11,r11,32 1531 lvx v24,r10,$sp 1532 addi r10,r10,32 1533 lvx v25,r11,$sp 1534 addi r11,r11,32 1535 lvx v26,r10,$sp 1536 addi r10,r10,32 1537 lvx v27,r11,$sp 1538 addi r11,r11,32 1539 lvx v28,r10,$sp 1540 addi r10,r10,32 1541 lvx v29,r11,$sp 1542 addi r11,r11,32 1543 lvx v30,r10,$sp 1544 lvx v31,r11,$sp 1545 addi $sp,$sp,$FRAME 1546 blr 1547 .long 0 1548 .byte 0,12,0x04,1,0x80,0,3,0 1549 .long 0 1550.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key 1551___ 1552} 1553 1554my $consts=1; 1555foreach (split("\n",$code)) { 1556 s/\`([^\`]*)\`/eval $1/geo; 1557 1558 # constants table endian-specific conversion 1559 if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) { 1560 my $conv=$2; 1561 my @bytes=(); 1562 1563 # convert to endian-agnostic format 1564 foreach (split(/,\s+/,$1)) { 1565 my $l = /^0/?oct:int; 1566 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; 1567 } 1568 1569 # little-endian conversion 1570 if ($flavour =~ /le$/o) { 1571 SWITCH: for($conv) { 1572 /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; 1573 /\?rev/ && do { @bytes=reverse(@bytes); last; }; 1574 } 1575 } 1576 1577 #emit 1578 print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; 1579 next; 1580 } 1581 $consts=0 if (m/Lconsts:/o); # end of table 1582 1583 # instructions prefixed with '?' are endian-specific and need 1584 # to be adjusted accordingly... 1585 if ($flavour =~ /le$/o) { # little-endian 1586 s/\?lvsr/lvsl/o or 1587 s/\?lvsl/lvsr/o or 1588 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or 1589 s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or 1590 s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; 1591 } else { # big-endian 1592 s/\?([a-z]+)/$1/o; 1593 } 1594 1595 print $_,"\n"; 1596} 1597 1598close STDOUT or die "error closing STDOUT: $!"; 1599
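
# For reference, a sketch of how the '?'-prefixed fixups above behave when a
# little-endian flavour (e.g. linux64le) is selected; on big-endian flavours
# the '?' marker is simply stripped.  The examples are lines from this file
# after register-variable interpolation (so $keyperm has become v31):
#
#   ?lvsl   v31, 0, r5         becomes   lvsr   v31, 0, r5
#   ?vperm  v5, v5, v6, v31    becomes   vperm  v5, v6, v5, v31    # source operands swapped
#   ?vsldoi v6, v9, v6, 8      becomes   vsldoi v6, v6, v9, 16-8   # sources swapped, complementary shift
#   ?vspltw v0, v0, 3          becomes   vspltw v0, v0, 3-3        # complementary word index
#
# The constant table is handled separately by the same loop: '?rev' rows are
# emitted with their bytes reversed, '?inv' rows have every byte XOR-ed with
# 0x0f, and '?asis' rows are emitted unchanged.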