#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ChaCha20 for C64x+.
#
# October 2015
#
# Performance is 3.54 cycles per processed byte, which is ~4.3 times
# faster than code generated by TI compiler. Compiler also disables
# interrupts for some reason, thus making interrupt response time
# dependent on input length. This module on the other hand is free
# from such limitation.

# The last command-line argument, when present and true, names the file
# the generated assembly is written to; otherwise the inherited STDOUT is
# used.  The three-argument open keeps the name from being parsed for
# mode characters, and a failed open is reported here instead of
# surfacing only at the final close.
$output=pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Register names substituted into the generated code.  These stay
# package globals because the code-generation part of the script reads
# them by name.
# NOTE(review): A4/B4/A6/B6/A8 presumably are the argument registers of
# the TI calling convention (out, inp, len, key, counter) - confirm
# against the ABI document.
($OUT,$INP,$LEN,$KEYB,$COUNTERA)=("A4","B4","A6","B6","A8");
# The generated prologue copies $KEYB to $KEYA and $COUNTERA to
# $COUNTERB so loads can be issued with either A- or B-side registers;
# $STEP holds the number of bytes consumed per outer iteration.
($KEYA,$COUNTERB,$STEP)=("A7","B7","A3");

# @X names the sixteen registers holding the working block.  @Y names
# sixteen more, which the generated code uses as the key-material copy
# in the 1x path and as the second block in the 2x path.  @DAT names the
# registers input data is loaded into.
@X=  ("A16","B16","A17","B17","A18","B18","A19","B19",
      "A20","B20","A21","B21","A22","B22","A23","B23");
@Y=  ("A24","B24","A25","B25","A26","B26","A27","B27",
      "A28","B28","A29","B29","A30","B30","A31","B31");
@DAT=("A6", "A7", "B6", "B7", "A8", "A9", "B8", "B9",
      "A10","A11","B10","B11","A12","A13","B12","B13");

# yes, overlaps with @DAT, used only in 2x interleave code path...
# Register names for the sixteen key-material words used by the 2x
# code path; several of them are also listed in @DAT.
@K2x=("A6", "B6", "A7", "B7", "A8", "B8", "A9", "B9",
      "A10","B10","A11","B11","A2", "B2", "A13","B13");

# Function prologue.  The generated code returns at once when the length
# is zero, otherwise it allocates the frame, saves A10-A13/B10-B13, loads
# the key and counter||nonce into @Y, synthesizes the sigma constants and
# copies the block into @X.  When fewer than 128 bytes are requested it
# branches straight to the 1x loop; otherwise key material is also
# offloaded to the stack and the counter kept in @K2x[12..13].
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	ChaCha20_ctr32,_ChaCha20_ctr32
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.global	_ChaCha20_ctr32
	.align	32
_ChaCha20_ctr32:
	.asmfunc	stack_usage(40+64)
	MV	$LEN,A0	; reassign
  [!A0]	BNOP	RA	; no data
|| [A0]	STW	FP,*SP--(40+64)	; save frame pointer and alloca(40+64)
|| [A0]	MV	SP,FP
   [A0]	STDW	B13:B12,*SP[4+8]	; ABI says so
|| [A0]	MV	$KEYB,$KEYA
|| [A0]	MV	$COUNTERA,$COUNTERB
   [A0]	STDW	B11:B10,*SP[3+8]
|| [A0]	STDW	A13:A12,*FP[-3]
   [A0]	STDW	A11:A10,*FP[-4]
|| [A0]	MVK	128,$STEP	; 2 * input block size

   [A0]	LDW	*${KEYA}[0],@Y[4]	; load key
|| [A0]	LDW	*${KEYB}[1],@Y[5]
|| [A0]	MVK	0x00007865,@Y[0]	; synthesize sigma
|| [A0]	MVK	0x0000646e,@Y[1]
   [A0]	LDW	*${KEYA}[2],@Y[6]
|| [A0]	LDW	*${KEYB}[3],@Y[7]
|| [A0]	MVKH	0x61700000,@Y[0]
|| [A0]	MVKH	0x33200000,@Y[1]
	LDW	*${KEYA}[4],@Y[8]
||	LDW	*${KEYB}[5],@Y[9]
||	MVK	0x00002d32,@Y[2]
||	MVK	0x00006574,@Y[3]
	LDW	*${KEYA}[6],@Y[10]
||	LDW	*${KEYB}[7],@Y[11]
||	MVKH	0x79620000,@Y[2]
||	MVKH	0x6b200000,@Y[3]
	LDW	*${COUNTERA}[0],@Y[12]	; load counter||nonce
||	LDW	*${COUNTERB}[1],@Y[13]
||	CMPLTU	A0,$STEP,A1	; is length < 2*blocks?
	LDW	*${COUNTERA}[2],@Y[14]
||	LDW	*${COUNTERB}[3],@Y[15]
|| [A1]	BNOP	top1x?
   [A1]	MVK	64,$STEP	; input block size
||	MVK	10,B0	; inner loop counter

	DMV	@Y[2],@Y[0],@X[2]:@X[0]	; copy block
||	DMV	@Y[3],@Y[1],@X[3]:@X[1]
||[!A1]	STDW	@Y[2]:@Y[0],*FP[-12]	; offload key material to stack
||[!A1]	STDW	@Y[3]:@Y[1],*SP[2]
	DMV	@Y[6],@Y[4],@X[6]:@X[4]
||	DMV	@Y[7],@Y[5],@X[7]:@X[5]
||[!A1]	STDW	@Y[6]:@Y[4],*FP[-10]
||[!A1]	STDW	@Y[7]:@Y[5],*SP[4]
	DMV	@Y[10],@Y[8],@X[10]:@X[8]
||	DMV	@Y[11],@Y[9],@X[11]:@X[9]
||[!A1]	STDW	@Y[10]:@Y[8],*FP[-8]
||[!A1]	STDW	@Y[11]:@Y[9],*SP[6]
	DMV	@Y[14],@Y[12],@X[14]:@X[12]
||	DMV	@Y[15],@Y[13],@X[15]:@X[13]
||[!A1]	MV	@Y[12],@K2x[12]	; counter
||[!A1]	MV	@Y[13],@K2x[13]
||[!A1]	STW	@Y[14],*FP[-6*2]
||[!A1]	STW	@Y[15],*SP[8*2]
___
{	################################################################
	# 2x interleave gives 50% performance improvement
	#
# Symbolic indices into @X/@Y for the sixteen words of a block, in four
# groups of four (a: 0..3, b: 4..7, c: 8..11, d: 12..15).
my ($a0,$a1,$a2,$a3) = (0..3);
my ($b0,$b1,$b2,$b3) = (4..7);
my ($c0,$c1,$c2,$c3) = (8..11);
my ($d0,$d1,$d2,$d3) = (12..15);

# Two-block loop: @X holds the first block and @Y the second, with their
# instructions interleaved.  "outer2x?" re-stages key material into @K2x
# and bumps the counter for the second block; "top2x?" is the inner loop
# head, repeated while B0 is non-zero.  The instruction order is
# hand-scheduled (see the in-line "cross-path stall" and "modulo-scheduled"
# remarks) and must not be rearranged.
$code.=<<___;
outer2x?:
	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	DMV	@Y[2],@Y[0],@K2x[2]:@K2x[0]
||	DMV	@Y[3],@Y[1],@K2x[3]:@K2x[1]
	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	DMV	@Y[6],@Y[4],@K2x[6]:@K2x[4]
||	DMV	@Y[7],@Y[5],@K2x[7]:@K2x[5]
	SWAP2	@X[$d1],@X[$d1]	; rotate by 16
||	SWAP2	@X[$d2],@X[$d2]
||	SWAP2	@X[$d0],@X[$d0]
||	SWAP2	@X[$d3],@X[$d3]

	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
||	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	DMV	@Y[10],@Y[8],@K2x[10]:@K2x[8]
||	DMV	@Y[11],@Y[9],@K2x[11]:@K2x[9]
	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
||	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	ADD	1,@Y[12],@Y[12]	; adjust counter for 2nd block
	ROTL	@X[$b1],12,@X[$b1]
||	ROTL	@X[$b2],12,@X[$b2]
||	MV	@Y[14],@K2x[14]
||	MV	@Y[15],@K2x[15]
top2x?:
	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b3],12,@X[$b3]
||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
||	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]

||	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
||	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d1],8,@X[$d1]
||	ROTL	@X[$d2],8,@X[$d2]
||	SWAP2	@Y[$d1],@Y[$d1]	; rotate by 16
||	SWAP2	@Y[$d2],@Y[$d2]
||	SWAP2	@Y[$d0],@Y[$d0]
||	SWAP2	@Y[$d3],@Y[$d3]
	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d3],8,@X[$d3]
||	ADD	@Y[$d1],@Y[$c1],@Y[$c1]
||	ADD	@Y[$d2],@Y[$c2],@Y[$c2]
||	ADD	@Y[$d0],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c3],@Y[$c3]
||	BNOP	middle2x1?	; protect from interrupt

	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
||	XOR	@Y[$c1],@Y[$b1],@Y[$b1]
||	XOR	@Y[$c2],@Y[$b2],@Y[$b2]
||	XOR	@Y[$c0],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b3],@Y[$b3]
	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
||	ROTL	@X[$d1],0,@X[$d2]	; moved to avoid cross-path stall
||	ROTL	@X[$d2],0,@X[$d3]
	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	MV	@X[$d0],@X[$d1]
||	MV	@X[$d3],@X[$d0]
||	ROTL	@Y[$b1],12,@Y[$b1]
||	ROTL	@Y[$b2],12,@Y[$b2]
	ROTL	@X[$b1],7,@X[$b0]	; avoided cross-path stall
||	ROTL	@X[$b2],7,@X[$b1]
	ROTL	@X[$b0],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b2]
middle2x1?:

	ROTL	@Y[$b0],12,@Y[$b0]
||	ROTL	@Y[$b3],12,@Y[$b3]
||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]

||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
||	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
||	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ROTL	@Y[$d1],8,@Y[$d1]
||	ROTL	@Y[$d2],8,@Y[$d2]
||	SWAP2	@X[$d0],@X[$d0]	; rotate by 16
||	SWAP2	@X[$d1],@X[$d1]
||	SWAP2	@X[$d2],@X[$d2]
||	SWAP2	@X[$d3],@X[$d3]
	ROTL	@Y[$d0],8,@Y[$d0]
||	ROTL	@Y[$d3],8,@Y[$d3]
||	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
||	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	BNOP	middle2x2?	; protect from interrupt

	ADD	@Y[$d1],@Y[$c1],@Y[$c1]
||	ADD	@Y[$d2],@Y[$c2],@Y[$c2]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
||	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
	ADD	@Y[$d0],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c3],@Y[$c3]
||	XOR	@Y[$c1],@Y[$b1],@Y[$b1]
||	XOR	@Y[$c2],@Y[$b2],@Y[$b2]
||	ROTL	@Y[$d1],0,@Y[$d2]	; moved to avoid cross-path stall
||	ROTL	@Y[$d2],0,@Y[$d3]
	XOR	@Y[$c0],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b3],@Y[$b3]
||	MV	@Y[$d0],@Y[$d1]
||	MV	@Y[$d3],@Y[$d0]
||	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b1],12,@X[$b1]
	ROTL	@Y[$b1],7,@Y[$b0]	; avoided cross-path stall
||	ROTL	@Y[$b2],7,@Y[$b1]
	ROTL	@Y[$b0],7,@Y[$b3]
||	ROTL	@Y[$b3],7,@Y[$b2]
middle2x2?:

	ROTL	@X[$b2],12,@X[$b2]
||	ROTL	@X[$b3],12,@X[$b3]
||	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]

||	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
||	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d1],8,@X[$d1]
||	SWAP2	@Y[$d0],@Y[$d0]	; rotate by 16
||	SWAP2	@Y[$d1],@Y[$d1]
||	SWAP2	@Y[$d2],@Y[$d2]
||	SWAP2	@Y[$d3],@Y[$d3]
	ROTL	@X[$d2],8,@X[$d2]
||	ROTL	@X[$d3],8,@X[$d3]
||	ADD	@Y[$d0],@Y[$c2],@Y[$c2]
||	ADD	@Y[$d1],@Y[$c3],@Y[$c3]
||	ADD	@Y[$d2],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c1],@Y[$c1]
||	BNOP	bottom2x1?	; protect from interrupt

	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
||	XOR	@Y[$c2],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b1],@Y[$b1]
||	XOR	@Y[$c0],@Y[$b2],@Y[$b2]
||	XOR	@Y[$c1],@Y[$b3],@Y[$b3]
	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
||	ROTL	@X[$d0],0,@X[$d3]	; moved to avoid cross-path stall
||	ROTL	@X[$d1],0,@X[$d0]
	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
||	MV	@X[$d2],@X[$d1]
||	MV	@X[$d3],@X[$d2]
||	ROTL	@Y[$b0],12,@Y[$b0]
||	ROTL	@Y[$b1],12,@Y[$b1]
	ROTL	@X[$b0],7,@X[$b1]	; avoided cross-path stall
||	ROTL	@X[$b1],7,@X[$b2]
	ROTL	@X[$b2],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b0]
|| [B0]	SUB	B0,1,B0	; decrement inner loop counter
bottom2x1?:

	ROTL	@Y[$b2],12,@Y[$b2]
||	ROTL	@Y[$b3],12,@Y[$b3]
|| [B0]	ADD	@X[$b1],@X[$a1],@X[$a1]	; modulo-scheduled
|| [B0]	ADD	@X[$b2],@X[$a2],@X[$a2]
   [B0]	ADD	@X[$b0],@X[$a0],@X[$a0]
|| [B0]	ADD	@X[$b3],@X[$a3],@X[$a3]

||	ADD	@Y[$b0],@Y[$a0],@Y[$a0]
||	ADD	@Y[$b1],@Y[$a1],@Y[$a1]
|| [B0]	XOR	@X[$a1],@X[$d1],@X[$d1]
|| [B0]	XOR	@X[$a2],@X[$d2],@X[$d2]
   [B0]	XOR	@X[$a0],@X[$d0],@X[$d0]
|| [B0]	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ADD	@Y[$b2],@Y[$a2],@Y[$a2]
||	ADD	@Y[$b3],@Y[$a3],@Y[$a3]
||	XOR	@Y[$a0],@Y[$d0],@Y[$d0]
||	XOR	@Y[$a1],@Y[$d1],@Y[$d1]
	XOR	@Y[$a2],@Y[$d2],@Y[$d2]
||	XOR	@Y[$a3],@Y[$d3],@Y[$d3]
||	ROTL	@Y[$d0],8,@Y[$d0]
||	ROTL	@Y[$d1],8,@Y[$d1]
|| [B0]	SWAP2	@X[$d1],@X[$d1]	; rotate by 16
|| [B0]	SWAP2	@X[$d2],@X[$d2]
|| [B0]	SWAP2	@X[$d0],@X[$d0]
|| [B0]	SWAP2	@X[$d3],@X[$d3]
	ROTL	@Y[$d2],8,@Y[$d2]
||	ROTL	@Y[$d3],8,@Y[$d3]
|| [B0]	ADD	@X[$d1],@X[$c1],@X[$c1]
|| [B0]	ADD	@X[$d2],@X[$c2],@X[$c2]
|| [B0]	ADD	@X[$d0],@X[$c0],@X[$c0]
|| [B0]	ADD	@X[$d3],@X[$c3],@X[$c3]
|| [B0]	BNOP	top2x?	; even protects from interrupt

	ADD	@Y[$d0],@Y[$c2],@Y[$c2]
||	ADD	@Y[$d1],@Y[$c3],@Y[$c3]
|| [B0]	XOR	@X[$c1],@X[$b1],@X[$b1]
|| [B0]	XOR	@X[$c2],@X[$b2],@X[$b2]
|| [B0]	XOR	@X[$c0],@X[$b0],@X[$b0]
|| [B0]	XOR	@X[$c3],@X[$b3],@X[$b3]
	ADD	@Y[$d2],@Y[$c0],@Y[$c0]
||	ADD	@Y[$d3],@Y[$c1],@Y[$c1]
||	XOR	@Y[$c2],@Y[$b0],@Y[$b0]
||	XOR	@Y[$c3],@Y[$b1],@Y[$b1]
||	ROTL	@Y[$d0],0,@Y[$d3]	; moved to avoid cross-path stall
||	ROTL	@Y[$d1],0,@Y[$d0]
	XOR	@Y[$c0],@Y[$b2],@Y[$b2]
||	XOR	@Y[$c1],@Y[$b3],@Y[$b3]
||	MV	@Y[$d2],@Y[$d1]
||	MV	@Y[$d3],@Y[$d2]
|| [B0]	ROTL	@X[$b1],12,@X[$b1]
|| [B0]	ROTL	@X[$b2],12,@X[$b2]
	ROTL	@Y[$b0],7,@Y[$b1]	; avoided cross-path stall
||	ROTL	@Y[$b1],7,@Y[$b2]
	ROTL	@Y[$b2],7,@Y[$b3]
||	ROTL	@Y[$b3],7,@Y[$b0]
bottom2x2?:
___
}

# Finish a 2x iteration: add the key material in @K2x to both blocks,
# load 128 bytes of input into @DAT (64 at a time), byte-swap on
# big-endian targets, xor and store both blocks, advance the counter by
# two and re-load key material from the stack.  Control then goes to
# "epilogue?" when no input is left, back to "outer2x?" when at least
# 128 bytes remain, or falls through to the 1x loop with $STEP set to 64.
$code.=<<___;
	ADD	@K2x[0],@X[0],@X[0]	; accumulate key material
||	ADD	@K2x[1],@X[1],@X[1]
||	ADD	@K2x[2],@X[2],@X[2]
||	ADD	@K2x[3],@X[3],@X[3]
	ADD	@K2x[0],@Y[0],@Y[0]
||	ADD	@K2x[1],@Y[1],@Y[1]
||	ADD	@K2x[2],@Y[2],@Y[2]
||	ADD	@K2x[3],@Y[3],@Y[3]
||	LDNDW	*${INP}++[8],@DAT[1]:@DAT[0]
	ADD	@K2x[4],@X[4],@X[4]
||	ADD	@K2x[5],@X[5],@X[5]
||	ADD	@K2x[6],@X[6],@X[6]
||	ADD	@K2x[7],@X[7],@X[7]
||	LDNDW	*${INP}[-7],@DAT[3]:@DAT[2]
	ADD	@K2x[4],@Y[4],@Y[4]
||	ADD	@K2x[5],@Y[5],@Y[5]
||	ADD	@K2x[6],@Y[6],@Y[6]
||	ADD	@K2x[7],@Y[7],@Y[7]
||	LDNDW	*${INP}[-6],@DAT[5]:@DAT[4]
	ADD	@K2x[8],@X[8],@X[8]
||	ADD	@K2x[9],@X[9],@X[9]
||	ADD	@K2x[10],@X[10],@X[10]
||	ADD	@K2x[11],@X[11],@X[11]
||	LDNDW	*${INP}[-5],@DAT[7]:@DAT[6]
	ADD	@K2x[8],@Y[8],@Y[8]
||	ADD	@K2x[9],@Y[9],@Y[9]
||	ADD	@K2x[10],@Y[10],@Y[10]
||	ADD	@K2x[11],@Y[11],@Y[11]
||	LDNDW	*${INP}[-4],@DAT[9]:@DAT[8]
	ADD	@K2x[12],@X[12],@X[12]
||	ADD	@K2x[13],@X[13],@X[13]
||	ADD	@K2x[14],@X[14],@X[14]
||	ADD	@K2x[15],@X[15],@X[15]
||	LDNDW	*${INP}[-3],@DAT[11]:@DAT[10]
	ADD	@K2x[12],@Y[12],@Y[12]
||	ADD	@K2x[13],@Y[13],@Y[13]
||	ADD	@K2x[14],@Y[14],@Y[14]
||	ADD	@K2x[15],@Y[15],@Y[15]
||	LDNDW	*${INP}[-2],@DAT[13]:@DAT[12]
	ADD	1,@Y[12],@Y[12]	; adjust counter for 2nd block
||	ADD	2,@K2x[12],@K2x[12]	; increment counter
||	LDNDW	*${INP}[-1],@DAT[15]:@DAT[14]

	.if	.BIG_ENDIAN
	SWAP2	@X[0],@X[0]
||	SWAP2	@X[1],@X[1]
||	SWAP2	@X[2],@X[2]
||	SWAP2	@X[3],@X[3]
	SWAP2	@X[4],@X[4]
||	SWAP2	@X[5],@X[5]
||	SWAP2	@X[6],@X[6]
||	SWAP2	@X[7],@X[7]
	SWAP2	@X[8],@X[8]
||	SWAP2	@X[9],@X[9]
||	SWAP4	@X[0],@X[1]
||	SWAP4	@X[1],@X[0]
	SWAP2	@X[10],@X[10]
||	SWAP2	@X[11],@X[11]
||	SWAP4	@X[2],@X[3]
||	SWAP4	@X[3],@X[2]
	SWAP2	@X[12],@X[12]
||	SWAP2	@X[13],@X[13]
||	SWAP4	@X[4],@X[5]
||	SWAP4	@X[5],@X[4]
	SWAP2	@X[14],@X[14]
||	SWAP2	@X[15],@X[15]
||	SWAP4	@X[6],@X[7]
||	SWAP4	@X[7],@X[6]
	SWAP4	@X[8],@X[9]
||	SWAP4	@X[9],@X[8]
||	SWAP2	@Y[0],@Y[0]
||	SWAP2	@Y[1],@Y[1]
	SWAP4	@X[10],@X[11]
||	SWAP4	@X[11],@X[10]
||	SWAP2	@Y[2],@Y[2]
||	SWAP2	@Y[3],@Y[3]
	SWAP4	@X[12],@X[13]
||	SWAP4	@X[13],@X[12]
||	SWAP2	@Y[4],@Y[4]
||	SWAP2	@Y[5],@Y[5]
	SWAP4	@X[14],@X[15]
||	SWAP4	@X[15],@X[14]
||	SWAP2	@Y[6],@Y[6]
||	SWAP2	@Y[7],@Y[7]
	SWAP2	@Y[8],@Y[8]
||	SWAP2	@Y[9],@Y[9]
||	SWAP4	@Y[0],@Y[1]
||	SWAP4	@Y[1],@Y[0]
	SWAP2	@Y[10],@Y[10]
||	SWAP2	@Y[11],@Y[11]
||	SWAP4	@Y[2],@Y[3]
||	SWAP4	@Y[3],@Y[2]
	SWAP2	@Y[12],@Y[12]
||	SWAP2	@Y[13],@Y[13]
||	SWAP4	@Y[4],@Y[5]
||	SWAP4	@Y[5],@Y[4]
	SWAP2	@Y[14],@Y[14]
||	SWAP2	@Y[15],@Y[15]
||	SWAP4	@Y[6],@Y[7]
||	SWAP4	@Y[7],@Y[6]
	SWAP4	@Y[8],@Y[9]
||	SWAP4	@Y[9],@Y[8]
	SWAP4	@Y[10],@Y[11]
||	SWAP4	@Y[11],@Y[10]
	SWAP4	@Y[12],@Y[13]
||	SWAP4	@Y[13],@Y[12]
	SWAP4	@Y[14],@Y[15]
||	SWAP4	@Y[15],@Y[14]
	.endif

	XOR	@DAT[0],@X[0],@X[0]	; xor 1st block
||	XOR	@DAT[3],@X[3],@X[3]
||	XOR	@DAT[2],@X[2],@X[1]
||	XOR	@DAT[1],@X[1],@X[2]
||	LDNDW	*${INP}++[8],@DAT[1]:@DAT[0]
	XOR	@DAT[4],@X[4],@X[4]
||	XOR	@DAT[7],@X[7],@X[7]
||	LDNDW	*${INP}[-7],@DAT[3]:@DAT[2]
	XOR	@DAT[6],@X[6],@X[5]
||	XOR	@DAT[5],@X[5],@X[6]
||	LDNDW	*${INP}[-6],@DAT[5]:@DAT[4]
	XOR	@DAT[8],@X[8],@X[8]
||	XOR	@DAT[11],@X[11],@X[11]
||	LDNDW	*${INP}[-5],@DAT[7]:@DAT[6]
	XOR	@DAT[10],@X[10],@X[9]
||	XOR	@DAT[9],@X[9],@X[10]
||	LDNDW	*${INP}[-4],@DAT[9]:@DAT[8]
	XOR	@DAT[12],@X[12],@X[12]
||	XOR	@DAT[15],@X[15],@X[15]
||	LDNDW	*${INP}[-3],@DAT[11]:@DAT[10]
	XOR	@DAT[14],@X[14],@X[13]
||	XOR	@DAT[13],@X[13],@X[14]
||	LDNDW	*${INP}[-2],@DAT[13]:@DAT[12]
   [A0]	SUB	A0,$STEP,A0	; SUB A0,128,A0
||	LDNDW	*${INP}[-1],@DAT[15]:@DAT[14]

	XOR	@Y[0],@DAT[0],@DAT[0]	; xor 2nd block
||	XOR	@Y[1],@DAT[1],@DAT[1]
||	STNDW	@X[2]:@X[0],*${OUT}++[8]
	XOR	@Y[2],@DAT[2],@DAT[2]
||	XOR	@Y[3],@DAT[3],@DAT[3]
||	STNDW	@X[3]:@X[1],*${OUT}[-7]
	XOR	@Y[4],@DAT[4],@DAT[4]
|| [A0]	LDDW	*FP[-12],@X[2]:@X[0]	; re-load key material from stack
|| [A0]	LDDW	*SP[2],  @X[3]:@X[1]
	XOR	@Y[5],@DAT[5],@DAT[5]
||	STNDW	@X[6]:@X[4],*${OUT}[-6]
	XOR	@Y[6],@DAT[6],@DAT[6]
||	XOR	@Y[7],@DAT[7],@DAT[7]
||	STNDW	@X[7]:@X[5],*${OUT}[-5]
	XOR	@Y[8],@DAT[8],@DAT[8]
|| [A0]	LDDW	*FP[-10],@X[6]:@X[4]
|| [A0]	LDDW	*SP[4],  @X[7]:@X[5]
	XOR	@Y[9],@DAT[9],@DAT[9]
||	STNDW	@X[10]:@X[8],*${OUT}[-4]
	XOR	@Y[10],@DAT[10],@DAT[10]
||	XOR	@Y[11],@DAT[11],@DAT[11]
||	STNDW	@X[11]:@X[9],*${OUT}[-3]
	XOR	@Y[12],@DAT[12],@DAT[12]
|| [A0]	LDDW	*FP[-8], @X[10]:@X[8]
|| [A0]	LDDW	*SP[6],  @X[11]:@X[9]
	XOR	@Y[13],@DAT[13],@DAT[13]
||	STNDW	@X[14]:@X[12],*${OUT}[-2]
	XOR	@Y[14],@DAT[14],@DAT[14]
||	XOR	@Y[15],@DAT[15],@DAT[15]
||	STNDW	@X[15]:@X[13],*${OUT}[-1]

   [A0]	MV	@K2x[12],@X[12]
|| [A0]	MV	@K2x[13],@X[13]
|| [A0]	LDW	*FP[-6*2], @X[14]
|| [A0]	LDW	*SP[8*2],  @X[15]

   [A0]	DMV	@X[2],@X[0],@Y[2]:@Y[0]	; duplicate key material
||	STNDW	@DAT[1]:@DAT[0],*${OUT}++[8]
   [A0]	DMV	@X[3],@X[1],@Y[3]:@Y[1]
||	STNDW	@DAT[3]:@DAT[2],*${OUT}[-7]
   [A0]	DMV	@X[6],@X[4],@Y[6]:@Y[4]
||	STNDW	@DAT[5]:@DAT[4],*${OUT}[-6]
||	CMPLTU	A0,$STEP,A1	; is remaining length < 2*blocks?
||[!A0]	BNOP	epilogue?
   [A0]	DMV	@X[7],@X[5],@Y[7]:@Y[5]
||	STNDW	@DAT[7]:@DAT[6],*${OUT}[-5]
||[!A1]	BNOP	outer2x?
   [A0]	DMV	@X[10],@X[8],@Y[10]:@Y[8]
||	STNDW	@DAT[9]:@DAT[8],*${OUT}[-4]
   [A0]	DMV	@X[11],@X[9],@Y[11]:@Y[9]
||	STNDW	@DAT[11]:@DAT[10],*${OUT}[-3]
   [A0]	DMV	@X[14],@X[12],@Y[14]:@Y[12]
||	STNDW	@DAT[13]:@DAT[12],*${OUT}[-2]
   [A0]	DMV	@X[15],@X[13],@Y[15]:@Y[13]
||	STNDW	@DAT[15]:@DAT[14],*${OUT}[-1]
;;===== branch to epilogue? is taken here
   [A1]	MVK	64,$STEP
|| [A0]	MVK	10,B0	; inner loop counter
;;===== branch to outer2x? is taken here
___
{
# Same symbolic word indices as in the 2x block; here they are applied
# to @X only.
my ($a0,$a1,$a2,$a3) = (0..3);
my ($b0,$b1,$b2,$b3) = (4..7);
my ($c0,$c1,$c2,$c3) = (8..11);
my ($d0,$d1,$d2,$d3) = (12..15);

# Single-block loop operating on @X while @Y keeps the key material.
# "top1x?" is repeated while B0 is non-zero; on the last pass the final
# packet compares the remaining length in A0 against $STEP so the code
# that follows can tell a full block from a partial one.
$code.=<<___;
top1x?:
	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	SWAP2	@X[$d1],@X[$d1]	; rotate by 16
||	SWAP2	@X[$d2],@X[$d2]
	SWAP2	@X[$d0],@X[$d0]
||	SWAP2	@X[$d3],@X[$d3]

||	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	ROTL	@X[$b1],12,@X[$b1]
||	ROTL	@X[$b2],12,@X[$b2]
	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b3],12,@X[$b3]

	ADD	@X[$b1],@X[$a1],@X[$a1]
||	ADD	@X[$b2],@X[$a2],@X[$a2]
	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
||	XOR	@X[$a2],@X[$d2],@X[$d2]
	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d1],8,@X[$d1]
||	ROTL	@X[$d2],8,@X[$d2]
	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d3],8,@X[$d3]
||	BNOP	middle1x?	; protect from interrupt

	ADD	@X[$d1],@X[$c1],@X[$c1]
||	ADD	@X[$d2],@X[$c2],@X[$c2]
	ADD	@X[$d0],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c3],@X[$c3]
||	XOR	@X[$c1],@X[$b1],@X[$b1]
||	XOR	@X[$c2],@X[$b2],@X[$b2]
||	ROTL	@X[$d1],0,@X[$d2]	; moved to avoid cross-path stall
||	ROTL	@X[$d2],0,@X[$d3]
	XOR	@X[$c0],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b3],@X[$b3]
||	ROTL	@X[$d0],0,@X[$d1]
||	ROTL	@X[$d3],0,@X[$d0]
	ROTL	@X[$b1],7,@X[$b0]	; avoided cross-path stall
||	ROTL	@X[$b2],7,@X[$b1]
	ROTL	@X[$b0],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b2]
middle1x?:

	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	SWAP2	@X[$d0],@X[$d0]	; rotate by 16
||	SWAP2	@X[$d1],@X[$d1]
	SWAP2	@X[$d2],@X[$d2]
||	SWAP2	@X[$d3],@X[$d3]

||	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
||	ROTL	@X[$b0],12,@X[$b0]
||	ROTL	@X[$b1],12,@X[$b1]
	ROTL	@X[$b2],12,@X[$b2]
||	ROTL	@X[$b3],12,@X[$b3]

	ADD	@X[$b0],@X[$a0],@X[$a0]
||	ADD	@X[$b1],@X[$a1],@X[$a1]
|| [B0]	SUB	B0,1,B0	; decrement inner loop counter
	ADD	@X[$b2],@X[$a2],@X[$a2]
||	ADD	@X[$b3],@X[$a3],@X[$a3]
||	XOR	@X[$a0],@X[$d0],@X[$d0]
||	XOR	@X[$a1],@X[$d1],@X[$d1]
	XOR	@X[$a2],@X[$d2],@X[$d2]
||	XOR	@X[$a3],@X[$d3],@X[$d3]
||	ROTL	@X[$d0],8,@X[$d0]
||	ROTL	@X[$d1],8,@X[$d1]
	ROTL	@X[$d2],8,@X[$d2]
||	ROTL	@X[$d3],8,@X[$d3]
|| [B0]	BNOP	top1x?	; even protects from interrupt

	ADD	@X[$d0],@X[$c2],@X[$c2]
||	ADD	@X[$d1],@X[$c3],@X[$c3]
	ADD	@X[$d2],@X[$c0],@X[$c0]
||	ADD	@X[$d3],@X[$c1],@X[$c1]
||	XOR	@X[$c2],@X[$b0],@X[$b0]
||	XOR	@X[$c3],@X[$b1],@X[$b1]
||	ROTL	@X[$d0],0,@X[$d3]	; moved to avoid cross-path stall
||	ROTL	@X[$d1],0,@X[$d0]
	XOR	@X[$c0],@X[$b2],@X[$b2]
||	XOR	@X[$c1],@X[$b3],@X[$b3]
||	ROTL	@X[$d2],0,@X[$d1]
||	ROTL	@X[$d3],0,@X[$d2]
	ROTL	@X[$b0],7,@X[$b1]	; avoided cross-path stall
||	ROTL	@X[$b1],7,@X[$b2]
	ROTL	@X[$b2],7,@X[$b3]
||	ROTL	@X[$b3],7,@X[$b0]
||[!B0]	CMPLTU	A0,$STEP,A1	; less than 64 bytes left?
bottom1x?:
___
}

# Finish a 1x iteration: add the key material in @Y to @X, then either
# branch to "tail?" (A1 set, i.e. fewer than 64 bytes left) or load a
# full 64-byte block, xor, store, increment the counter and loop back to
# "top1x?" while input remains.  "epilogue?" restores the saved
# registers and returns.  The start of "tail?" handles the first word,
# @X[0], byte by byte; the remaining words are emitted by TAIL_STEP.
$code.=<<___;
	ADD	@Y[0],@X[0],@X[0]	; accumulate key material
||	ADD	@Y[1],@X[1],@X[1]
||	ADD	@Y[2],@X[2],@X[2]
||	ADD	@Y[3],@X[3],@X[3]
||[!A1]	LDNDW	*${INP}++[8],@DAT[1]:@DAT[0]
|| [A1]	BNOP	tail?
	ADD	@Y[4],@X[4],@X[4]
||	ADD	@Y[5],@X[5],@X[5]
||	ADD	@Y[6],@X[6],@X[6]
||	ADD	@Y[7],@X[7],@X[7]
||[!A1]	LDNDW	*${INP}[-7],@DAT[3]:@DAT[2]
	ADD	@Y[8],@X[8],@X[8]
||	ADD	@Y[9],@X[9],@X[9]
||	ADD	@Y[10],@X[10],@X[10]
||	ADD	@Y[11],@X[11],@X[11]
||[!A1]	LDNDW	*${INP}[-6],@DAT[5]:@DAT[4]
	ADD	@Y[12],@X[12],@X[12]
||	ADD	@Y[13],@X[13],@X[13]
||	ADD	@Y[14],@X[14],@X[14]
||	ADD	@Y[15],@X[15],@X[15]
||[!A1]	LDNDW	*${INP}[-5],@DAT[7]:@DAT[6]
  [!A1]	LDNDW	*${INP}[-4],@DAT[9]:@DAT[8]
  [!A1]	LDNDW	*${INP}[-3],@DAT[11]:@DAT[10]
	LDNDW	*${INP}[-2],@DAT[13]:@DAT[12]
	LDNDW	*${INP}[-1],@DAT[15]:@DAT[14]

	.if	.BIG_ENDIAN
	SWAP2	@X[0],@X[0]
||	SWAP2	@X[1],@X[1]
||	SWAP2	@X[2],@X[2]
||	SWAP2	@X[3],@X[3]
	SWAP2	@X[4],@X[4]
||	SWAP2	@X[5],@X[5]
||	SWAP2	@X[6],@X[6]
||	SWAP2	@X[7],@X[7]
	SWAP2	@X[8],@X[8]
||	SWAP2	@X[9],@X[9]
||	SWAP4	@X[0],@X[1]
||	SWAP4	@X[1],@X[0]
	SWAP2	@X[10],@X[10]
||	SWAP2	@X[11],@X[11]
||	SWAP4	@X[2],@X[3]
||	SWAP4	@X[3],@X[2]
	SWAP2	@X[12],@X[12]
||	SWAP2	@X[13],@X[13]
||	SWAP4	@X[4],@X[5]
||	SWAP4	@X[5],@X[4]
	SWAP2	@X[14],@X[14]
||	SWAP2	@X[15],@X[15]
||	SWAP4	@X[6],@X[7]
||	SWAP4	@X[7],@X[6]
	SWAP4	@X[8],@X[9]
||	SWAP4	@X[9],@X[8]
	SWAP4	@X[10],@X[11]
||	SWAP4	@X[11],@X[10]
	SWAP4	@X[12],@X[13]
||	SWAP4	@X[13],@X[12]
	SWAP4	@X[14],@X[15]
||	SWAP4	@X[15],@X[14]
	.else
	NOP	1
	.endif

	XOR	@X[0],@DAT[0],@DAT[0]	; xor with input
||	XOR	@X[1],@DAT[1],@DAT[1]
||	XOR	@X[2],@DAT[2],@DAT[2]
||	XOR	@X[3],@DAT[3],@DAT[3]
|| [A0]	SUB	A0,$STEP,A0	; SUB A0,64,A0
	XOR	@X[4],@DAT[4],@DAT[4]
||	XOR	@X[5],@DAT[5],@DAT[5]
||	XOR	@X[6],@DAT[6],@DAT[6]
||	XOR	@X[7],@DAT[7],@DAT[7]
||	STNDW	@DAT[1]:@DAT[0],*${OUT}++[8]
	XOR	@X[8],@DAT[8],@DAT[8]
||	XOR	@X[9],@DAT[9],@DAT[9]
||	XOR	@X[10],@DAT[10],@DAT[10]
||	XOR	@X[11],@DAT[11],@DAT[11]
||	STNDW	@DAT[3]:@DAT[2],*${OUT}[-7]
	XOR	@X[12],@DAT[12],@DAT[12]
||	XOR	@X[13],@DAT[13],@DAT[13]
||	XOR	@X[14],@DAT[14],@DAT[14]
||	XOR	@X[15],@DAT[15],@DAT[15]
||	STNDW	@DAT[5]:@DAT[4],*${OUT}[-6]
|| [A0]	BNOP	top1x?
   [A0]	DMV	@Y[2],@Y[0],@X[2]:@X[0]	; duplicate key material
|| [A0]	DMV	@Y[3],@Y[1],@X[3]:@X[1]
||	STNDW	@DAT[7]:@DAT[6],*${OUT}[-5]
   [A0]	DMV	@Y[6],@Y[4],@X[6]:@X[4]
|| [A0]	DMV	@Y[7],@Y[5],@X[7]:@X[5]
||	STNDW	@DAT[9]:@DAT[8],*${OUT}[-4]
   [A0]	DMV	@Y[10],@Y[8],@X[10]:@X[8]
|| [A0]	DMV	@Y[11],@Y[9],@X[11]:@X[9]
|| [A0]	ADD	1,@Y[12],@Y[12]	; increment counter
||	STNDW	@DAT[11]:@DAT[10],*${OUT}[-3]
   [A0]	DMV	@Y[14],@Y[12],@X[14]:@X[12]
|| [A0]	DMV	@Y[15],@Y[13],@X[15]:@X[13]
||	STNDW	@DAT[13]:@DAT[12],*${OUT}[-2]
   [A0]	MVK	10,B0	; inner loop counter
||	STNDW	@DAT[15]:@DAT[14],*${OUT}[-1]
;;===== branch to top1x? is taken here

epilogue?:
	LDDW	*FP[-4],A11:A10	; ABI says so
	LDDW	*FP[-3],A13:A12
||	LDDW	*SP[3+8],B11:B10
	LDDW	*SP[4+8],B13:B12
||	BNOP	RA
	LDW	*++SP(40+64),FP	; restore frame pointer
	NOP	4

tail?:
	LDBU	*${INP}++[1],B24	; load byte by byte
||	SUB	A0,1,A0
||	SUB	A0,1,B1
  [!B1]	BNOP	epilogue?	; interrupts are disabled for whole time
|| [A0]	LDBU	*${INP}++[1],B24
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
  [!B1]	BNOP	epilogue?
|| [A0]	LDBU	*${INP}++[1],B24
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
  [!B1]	BNOP	epilogue?
||	ROTL	@X[0],0,A24
|| [A0]	LDBU	*${INP}++[1],B24
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
  [!B1]	BNOP	epilogue?
||	ROTL	@X[0],24,A24
|| [A0]	LDBU	*${INP}++[1],A24
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
  [!B1]	BNOP	epilogue?
||	ROTL	@X[0],16,A24
|| [A0]	LDBU	*${INP}++[1],A24
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,B25
	STB	B25,*${OUT}++[1]	; store byte by byte
||[!B1]	BNOP	epilogue?
||	ROTL	@X[0],8,A24
|| [A0]	LDBU	*${INP}++[1],A24
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,B25
	STB	B25,*${OUT}++[1]
___
# TAIL_STEP($Xi) appends to $code the four byte-steps of the tail loop
# that consume the word held in register $Xi (rotated by 0, 24, 16 and
# 8).  The emitted text begins with "||" lines, so it continues the
# execute packet opened by the STB that precedes each call, and it ends
# with an STB of its own so calls can be chained.
#   $Xi - register name taken from @X
# Nothing is returned; the only effect is on $code.
sub TAIL_STEP {
my $Xi= shift;
my $T = ($Xi=~/^B/?"B24":"A24");	# match @X[i] to avoid cross path
my $D = $T;	$D=~tr/AB/BA/;		# the register 24 on the opposite side
my $O = $D;	$O=~s/24/25/;		# register 25 on that side, source of STB

$code.=<<___;
||[!B1]	BNOP	epilogue?
||	ROTL	$Xi,0,$T
|| [A0]	LDBU	*${INP}++[1],$D
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$O
	STB	$O,*${OUT}++[1]
||[!B1]	BNOP	epilogue?
||	ROTL	$Xi,24,$T
|| [A0]	LDBU	*${INP}++[1],$T
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$O
	STB	$O,*${OUT}++[1]
||[!B1]	BNOP	epilogue?
||	ROTL	$Xi,16,$T
|| [A0]	LDBU	*${INP}++[1],$T
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$O
	STB	$O,*${OUT}++[1]
||[!B1]	BNOP	epilogue?
||	ROTL	$Xi,8,$T
|| [A0]	LDBU	*${INP}++[1],$T
|| [A0]	SUB	A0,1,A0
||	SUB	B1,1,B1
||	XOR	A24,B24,$O
	STB	$O,*${OUT}++[1]
___
}
	# Words 1..14 share the generic step; word 0 is handled in "tail?"
	# above and word 15 below, where no further input bytes are loaded.
	foreach (1..14) { TAIL_STEP(@X[$_]); }
# Last word of the tail: only rotate/xor/store remain, followed by the
# end of the function and the identification string in .const.
$code.=<<___;
||[!B1]	BNOP	epilogue?
||	ROTL	@X[15],0,B24
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	ROTL	@X[15],24,B24
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	ROTL	@X[15],16,B24
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	XOR	A24,B24,A25
	STB	A25,*${OUT}++[1]
||	XOR	A24,B24,B25
	STB	B25,*${OUT}++[1]
	.endasmfunc

	.sect	.const
	.cstring "ChaCha20 for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Emit the accumulated assembly; the checked close is what reports
# buffered write errors.
print $code;
close STDOUT or die "error closing STDOUT: $!";