#! /usr/bin/env perl
# Author: Min Zhou <zhoumin@loongson.cn>
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $code;

# Here is the scalar register layout for LoongArch.
my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8)=map("\$r$_",(23..31));

# The saved floating-point registers in the LP64D ABI. In LoongArch
# with vector extension, the low 64 bits of a vector register alias with
# the corresponding FPR. So we must save and restore the corresponding
# FPR if we'll write into a vector register. The ABI only requires
# saving and restoring the FPR (i.e. 64 bits of the corresponding vector
# register), not the entire vector register.
my ($fs0,$fs1,$fs2,$fs3,$fs4,$fs5,$fs6,$fs7)=map("\$f$_",(24..31));

# Here is the 128-bit vector register layout for LSX extension.
my ($vr0,$vr1,$vr2,$vr3,$vr4,$vr5,$vr6,$vr7,$vr8,$vr9,$vr10,
    $vr11,$vr12,$vr13,$vr14,$vr15,$vr16,$vr17,$vr18,$vr19,
    $vr20,$vr21,$vr22,$vr23,$vr24,$vr25,$vr26,$vr27,$vr28,
    $vr29,$vr30,$vr31)=map("\$vr$_",(0..31));

# Here is the 256-bit vector register layout for LASX extension.
my ($xr0,$xr1,$xr2,$xr3,$xr4,$xr5,$xr6,$xr7,$xr8,$xr9,$xr10,
    $xr11,$xr12,$xr13,$xr14,$xr15,$xr16,$xr17,$xr18,$xr19,
    $xr20,$xr21,$xr22,$xr23,$xr24,$xr25,$xr26,$xr27,$xr28,
    $xr29,$xr30,$xr31)=map("\$xr$_",(0..31));

my $output;
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";

# Input parameter block
my ($out, $inp, $len, $key, $counter) = ($a0, $a1, $a2, $a3, $a4);

$code .= <<EOF;
#include "loongarch_arch.h"

.text

.extern OPENSSL_loongarch_hwcap_P

.align 6
.Lsigma:
.ascii	"expand 32-byte k"
.Linc8x:
.long	0,1,2,3,4,5,6,7
.Linc4x:
.long	0,1,2,3

.globl ChaCha20_ctr32
.type ChaCha20_ctr32 function

.align 6
ChaCha20_ctr32:
	# $a0 = arg #1 (out pointer)
	# $a1 = arg #2 (inp pointer)
	# $a2 = arg #3 (len)
	# $a3 = arg #4 (key array)
	# $a4 = arg #5 (counter array)

	beqz	$len,.Lno_data
	ori	$t3,$zero,64
	la.pcrel	$t0,OPENSSL_loongarch_hwcap_P
	ld.w	$t0,$t0,0

	bleu	$len,$t3,.LChaCha20_1x	# goto 1x when len <= 64

	andi	$t0,$t0,LOONGARCH_HWCAP_LASX | LOONGARCH_HWCAP_LSX
	beqz	$t0,.LChaCha20_1x

	addi.d	$sp,$sp,-64
	fst.d	$fs0,$sp,0
	fst.d	$fs1,$sp,8
	fst.d	$fs2,$sp,16
	fst.d	$fs3,$sp,24
	fst.d	$fs4,$sp,32
	fst.d	$fs5,$sp,40
	fst.d	$fs6,$sp,48
	fst.d	$fs7,$sp,56

	andi	$t1,$t0,LOONGARCH_HWCAP_LASX
	bnez	$t1,.LChaCha20_8x

	b	.LChaCha20_4x

EOF

########################################################################
# Scalar code path that handles all lengths.
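# A note on the ROUND helpers defined below: each is called with the
# indices of one quarter-round and derives the other three by rotating
# the index within each group of four state words. The mapping can be
# checked with plain Perl, e.g.:
#
#   my @q = (0, 4, 8, 12);                 # first column round
#   @q = map(($_&~3)+(($_+1)&3), @q);      # yields (1, 5, 9, 13)
#
# so (0,4,8,12) expands to all four column rounds and (0,5,10,15) to all
# four diagonal rounds. Note also that rotri rotates right, so rotating
# right by 32-n implements the left-rotation by n that ChaCha20 specifies.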
102{ 103# Load the initial states in array @x[*] and update directly 104my @x = ($t0, $t1, $t2, $t3, $t4, $t5, $t6, $t7, 105 $s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7); 106 107sub ROUND { 108 my ($a0,$b0,$c0,$d0) = @_; 109 my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 110 my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 111 my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 112 113$code .= <<EOF; 114 add.w @x[$a0],@x[$a0],@x[$b0] 115 xor @x[$d0],@x[$d0],@x[$a0] 116 rotri.w @x[$d0],@x[$d0],16 # rotate left 16 bits 117 add.w @x[$a1],@x[$a1],@x[$b1] 118 xor @x[$d1],@x[$d1],@x[$a1] 119 rotri.w @x[$d1],@x[$d1],16 120 121 add.w @x[$c0],@x[$c0],@x[$d0] 122 xor @x[$b0],@x[$b0],@x[$c0] 123 rotri.w @x[$b0],@x[$b0],20 # rotate left 12 bits 124 add.w @x[$c1],@x[$c1],@x[$d1] 125 xor @x[$b1],@x[$b1],@x[$c1] 126 rotri.w @x[$b1],@x[$b1],20 127 128 add.w @x[$a0],@x[$a0],@x[$b0] 129 xor @x[$d0],@x[$d0],@x[$a0] 130 rotri.w @x[$d0],@x[$d0],24 # rotate left 8 bits 131 add.w @x[$a1],@x[$a1],@x[$b1] 132 xor @x[$d1],@x[$d1],@x[$a1] 133 rotri.w @x[$d1],@x[$d1],24 134 135 add.w @x[$c0],@x[$c0],@x[$d0] 136 xor @x[$b0],@x[$b0],@x[$c0] 137 rotri.w @x[$b0],@x[$b0],25 # rotate left 7 bits 138 add.w @x[$c1],@x[$c1],@x[$d1] 139 xor @x[$b1],@x[$b1],@x[$c1] 140 rotri.w @x[$b1],@x[$b1],25 141 142 add.w @x[$a2],@x[$a2],@x[$b2] 143 xor @x[$d2],@x[$d2],@x[$a2] 144 rotri.w @x[$d2],@x[$d2],16 145 add.w @x[$a3],@x[$a3],@x[$b3] 146 xor @x[$d3],@x[$d3],@x[$a3] 147 rotri.w @x[$d3],@x[$d3],16 148 149 add.w @x[$c2],@x[$c2],@x[$d2] 150 xor @x[$b2],@x[$b2],@x[$c2] 151 rotri.w @x[$b2],@x[$b2],20 152 add.w @x[$c3],@x[$c3],@x[$d3] 153 xor @x[$b3],@x[$b3],@x[$c3] 154 rotri.w @x[$b3],@x[$b3],20 155 156 add.w @x[$a2],@x[$a2],@x[$b2] 157 xor @x[$d2],@x[$d2],@x[$a2] 158 rotri.w @x[$d2],@x[$d2],24 159 add.w @x[$a3],@x[$a3],@x[$b3] 160 xor @x[$d3],@x[$d3],@x[$a3] 161 rotri.w @x[$d3],@x[$d3],24 162 163 add.w @x[$c2],@x[$c2],@x[$d2] 164 xor @x[$b2],@x[$b2],@x[$c2] 165 rotri.w @x[$b2],@x[$b2],25 166 add.w @x[$c3],@x[$c3],@x[$d3] 167 xor @x[$b3],@x[$b3],@x[$c3] 168 rotri.w @x[$b3],@x[$b3],25 169 170EOF 171} 172 173$code .= <<EOF; 174.align 6 175.LChaCha20_1x: 176 addi.d $sp,$sp,-256 177 st.d $s0,$sp,0 178 st.d $s1,$sp,8 179 st.d $s2,$sp,16 180 st.d $s3,$sp,24 181 st.d $s4,$sp,32 182 st.d $s5,$sp,40 183 st.d $s6,$sp,48 184 st.d $s7,$sp,56 185 st.d $s8,$sp,64 186 187 # Save the initial block counter in $s8 188 ld.w $s8,$counter,0 189 b .Loop_outer_1x 190 191.align 5 192.Loop_outer_1x: 193 # Load constants 194 la.local $t8,.Lsigma 195 ld.w @x[0],$t8,0 # 'expa' 196 ld.w @x[1],$t8,4 # 'nd 3' 197 ld.w @x[2],$t8,8 # '2-by' 198 ld.w @x[3],$t8,12 # 'te k' 199 200 # Load key 201 ld.w @x[4],$key,4*0 202 ld.w @x[5],$key,4*1 203 ld.w @x[6],$key,4*2 204 ld.w @x[7],$key,4*3 205 ld.w @x[8],$key,4*4 206 ld.w @x[9],$key,4*5 207 ld.w @x[10],$key,4*6 208 ld.w @x[11],$key,4*7 209 210 # Load block counter 211 move @x[12],$s8 212 213 # Load nonce 214 ld.w @x[13],$counter,4*1 215 ld.w @x[14],$counter,4*2 216 ld.w @x[15],$counter,4*3 217 218 # Update states in \@x[*] for 20 rounds 219 ori $t8,$zero,10 220 b .Loop_1x 221 222.align 5 223.Loop_1x: 224EOF 225 226&ROUND (0, 4, 8, 12); 227&ROUND (0, 5, 10, 15); 228 229$code .= <<EOF; 230 addi.w $t8,$t8,-1 231 bnez $t8,.Loop_1x 232 233 # Get the final states by adding the initial states 234 la.local $t8,.Lsigma 235 ld.w $a7,$t8,4*0 236 ld.w $a6,$t8,4*1 237 ld.w $a5,$t8,4*2 238 add.w @x[0],@x[0],$a7 239 add.w @x[1],@x[1],$a6 240 add.w @x[2],@x[2],$a5 241 ld.w $a7,$t8,4*3 242 add.w 

	ld.w	$t8,$key,4*0
	ld.w	$a7,$key,4*1
	ld.w	$a6,$key,4*2
	ld.w	$a5,$key,4*3
	add.w	@x[4],@x[4],$t8
	add.w	@x[5],@x[5],$a7
	add.w	@x[6],@x[6],$a6
	add.w	@x[7],@x[7],$a5

	ld.w	$t8,$key,4*4
	ld.w	$a7,$key,4*5
	ld.w	$a6,$key,4*6
	ld.w	$a5,$key,4*7
	add.w	@x[8],@x[8],$t8
	add.w	@x[9],@x[9],$a7
	add.w	@x[10],@x[10],$a6
	add.w	@x[11],@x[11],$a5

	add.w	@x[12],@x[12],$s8

	ld.w	$t8,$counter,4*1
	ld.w	$a7,$counter,4*2
	ld.w	$a6,$counter,4*3
	add.w	@x[13],@x[13],$t8
	add.w	@x[14],@x[14],$a7
	add.w	@x[15],@x[15],$a6

	ori	$t8,$zero,64
	bltu	$len,$t8,.Ltail_1x

	# Get the encrypted message by xoring the states with the plaintext
	ld.w	$t8,$inp,4*0
	ld.w	$a7,$inp,4*1
	ld.w	$a6,$inp,4*2
	ld.w	$a5,$inp,4*3
	xor	$t8,$t8,@x[0]
	xor	$a7,$a7,@x[1]
	xor	$a6,$a6,@x[2]
	xor	$a5,$a5,@x[3]
	st.w	$t8,$out,4*0
	st.w	$a7,$out,4*1
	st.w	$a6,$out,4*2
	st.w	$a5,$out,4*3

	ld.w	$t8,$inp,4*4
	ld.w	$a7,$inp,4*5
	ld.w	$a6,$inp,4*6
	ld.w	$a5,$inp,4*7
	xor	$t8,$t8,@x[4]
	xor	$a7,$a7,@x[5]
	xor	$a6,$a6,@x[6]
	xor	$a5,$a5,@x[7]
	st.w	$t8,$out,4*4
	st.w	$a7,$out,4*5
	st.w	$a6,$out,4*6
	st.w	$a5,$out,4*7

	ld.w	$t8,$inp,4*8
	ld.w	$a7,$inp,4*9
	ld.w	$a6,$inp,4*10
	ld.w	$a5,$inp,4*11
	xor	$t8,$t8,@x[8]
	xor	$a7,$a7,@x[9]
	xor	$a6,$a6,@x[10]
	xor	$a5,$a5,@x[11]
	st.w	$t8,$out,4*8
	st.w	$a7,$out,4*9
	st.w	$a6,$out,4*10
	st.w	$a5,$out,4*11

	ld.w	$t8,$inp,4*12
	ld.w	$a7,$inp,4*13
	ld.w	$a6,$inp,4*14
	ld.w	$a5,$inp,4*15
	xor	$t8,$t8,@x[12]
	xor	$a7,$a7,@x[13]
	xor	$a6,$a6,@x[14]
	xor	$a5,$a5,@x[15]
	st.w	$t8,$out,4*12
	st.w	$a7,$out,4*13
	st.w	$a6,$out,4*14
	st.w	$a5,$out,4*15

	addi.d	$len,$len,-64
	beqz	$len,.Ldone_1x
	addi.d	$inp,$inp,64
	addi.d	$out,$out,64
	addi.w	$s8,$s8,1
	b	.Loop_outer_1x

.align 4
.Ltail_1x:
	# Handle the tail for 1x (1 <= tail_len <= 63)
	addi.d	$a7,$sp,72
	st.w	@x[0],$a7,4*0
	st.w	@x[1],$a7,4*1
	st.w	@x[2],$a7,4*2
	st.w	@x[3],$a7,4*3
	st.w	@x[4],$a7,4*4
	st.w	@x[5],$a7,4*5
	st.w	@x[6],$a7,4*6
	st.w	@x[7],$a7,4*7
	st.w	@x[8],$a7,4*8
	st.w	@x[9],$a7,4*9
	st.w	@x[10],$a7,4*10
	st.w	@x[11],$a7,4*11
	st.w	@x[12],$a7,4*12
	st.w	@x[13],$a7,4*13
	st.w	@x[14],$a7,4*14
	st.w	@x[15],$a7,4*15

	move	$t8,$zero

.Loop_tail_1x:
	# Xor input with states byte by byte
	ldx.bu	$a6,$inp,$t8
	ldx.bu	$a5,$a7,$t8
	xor	$a6,$a6,$a5
	stx.b	$a6,$out,$t8
	addi.w	$t8,$t8,1
	addi.d	$len,$len,-1
	bnez	$len,.Loop_tail_1x
	b	.Ldone_1x

.Ldone_1x:
	ld.d	$s0,$sp,0
	ld.d	$s1,$sp,8
	ld.d	$s2,$sp,16
	ld.d	$s3,$sp,24
	ld.d	$s4,$sp,32
	ld.d	$s5,$sp,40
	ld.d	$s6,$sp,48
	ld.d	$s7,$sp,56
	ld.d	$s8,$sp,64
	addi.d	$sp,$sp,256

	b	.Lend

EOF
}

########################################################################
# 128-bit LSX code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly.
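# Unlike the scalar path, this path processes four blocks at once: each
# LSX register holds one 32-bit state word for all four blocks, with
# lane i belonging to block counter+i (set up via .Linc4x below). The
# twenty rounds then run on the four blocks in parallel, and a transpose
# afterwards regroups the lanes into contiguous 64-byte blocks.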
my @x = ($vr0, $vr1, $vr2, $vr3, $vr4, $vr5, $vr6, $vr7,
         $vr8, $vr9, $vr10, $vr11, $vr12, $vr13, $vr14, $vr15);

# Save the initial states in array @y[*]
my @y = ($vr16, $vr17, $vr18, $vr19, $vr20, $vr21, $vr22, $vr23,
         $vr24, $vr25, $vr26, $vr27, $vr28, $vr29, $vr30, $vr31);

sub ROUND_4x {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

$code .= <<EOF;
	vadd.w		@x[$a0],@x[$a0],@x[$b0]
	vxor.v		@x[$d0],@x[$d0],@x[$a0]
	vrotri.w	@x[$d0],@x[$d0],16	# rotate left 16 bits
	vadd.w		@x[$a1],@x[$a1],@x[$b1]
	vxor.v		@x[$d1],@x[$d1],@x[$a1]
	vrotri.w	@x[$d1],@x[$d1],16

	vadd.w		@x[$c0],@x[$c0],@x[$d0]
	vxor.v		@x[$b0],@x[$b0],@x[$c0]
	vrotri.w	@x[$b0],@x[$b0],20	# rotate left 12 bits
	vadd.w		@x[$c1],@x[$c1],@x[$d1]
	vxor.v		@x[$b1],@x[$b1],@x[$c1]
	vrotri.w	@x[$b1],@x[$b1],20

	vadd.w		@x[$a0],@x[$a0],@x[$b0]
	vxor.v		@x[$d0],@x[$d0],@x[$a0]
	vrotri.w	@x[$d0],@x[$d0],24	# rotate left 8 bits
	vadd.w		@x[$a1],@x[$a1],@x[$b1]
	vxor.v		@x[$d1],@x[$d1],@x[$a1]
	vrotri.w	@x[$d1],@x[$d1],24

	vadd.w		@x[$c0],@x[$c0],@x[$d0]
	vxor.v		@x[$b0],@x[$b0],@x[$c0]
	vrotri.w	@x[$b0],@x[$b0],25	# rotate left 7 bits
	vadd.w		@x[$c1],@x[$c1],@x[$d1]
	vxor.v		@x[$b1],@x[$b1],@x[$c1]
	vrotri.w	@x[$b1],@x[$b1],25

	vadd.w		@x[$a2],@x[$a2],@x[$b2]
	vxor.v		@x[$d2],@x[$d2],@x[$a2]
	vrotri.w	@x[$d2],@x[$d2],16
	vadd.w		@x[$a3],@x[$a3],@x[$b3]
	vxor.v		@x[$d3],@x[$d3],@x[$a3]
	vrotri.w	@x[$d3],@x[$d3],16

	vadd.w		@x[$c2],@x[$c2],@x[$d2]
	vxor.v		@x[$b2],@x[$b2],@x[$c2]
	vrotri.w	@x[$b2],@x[$b2],20
	vadd.w		@x[$c3],@x[$c3],@x[$d3]
	vxor.v		@x[$b3],@x[$b3],@x[$c3]
	vrotri.w	@x[$b3],@x[$b3],20

	vadd.w		@x[$a2],@x[$a2],@x[$b2]
	vxor.v		@x[$d2],@x[$d2],@x[$a2]
	vrotri.w	@x[$d2],@x[$d2],24
	vadd.w		@x[$a3],@x[$a3],@x[$b3]
	vxor.v		@x[$d3],@x[$d3],@x[$a3]
	vrotri.w	@x[$d3],@x[$d3],24

	vadd.w		@x[$c2],@x[$c2],@x[$d2]
	vxor.v		@x[$b2],@x[$b2],@x[$c2]
	vrotri.w	@x[$b2],@x[$b2],25
	vadd.w		@x[$c3],@x[$c3],@x[$d3]
	vxor.v		@x[$b3],@x[$b3],@x[$c3]
	vrotri.w	@x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_4x:
	addi.d	$sp,$sp,-128

	# Save the initial block counter in $t4
	ld.w	$t4,$counter,0
	b	.Loop_outer_4x

.align 5
.Loop_outer_4x:
	# Load constants
	la.local	$t8,.Lsigma
	vldrepl.w	@x[0],$t8,4*0		# 'expa'
	vldrepl.w	@x[1],$t8,4*1		# 'nd 3'
	vldrepl.w	@x[2],$t8,4*2		# '2-by'
	vldrepl.w	@x[3],$t8,4*3		# 'te k'

	# Load key
	vldrepl.w	@x[4],$key,4*0
	vldrepl.w	@x[5],$key,4*1
	vldrepl.w	@x[6],$key,4*2
	vldrepl.w	@x[7],$key,4*3
	vldrepl.w	@x[8],$key,4*4
	vldrepl.w	@x[9],$key,4*5
	vldrepl.w	@x[10],$key,4*6
	vldrepl.w	@x[11],$key,4*7

	# Load block counter
	vreplgr2vr.w	@x[12],$t4

	# Load nonce
	vldrepl.w	@x[13],$counter,4*1
	vldrepl.w	@x[14],$counter,4*2
	vldrepl.w	@x[15],$counter,4*3

	# Get the correct block counter for each block
	la.local	$t8,.Linc4x
	vld	@y[0],$t8,0
	vadd.w	@x[12],@x[12],@y[0]

	# Copy the initial states from \@x[*] to \@y[*]
	vori.b	@y[0],@x[0],0
	vori.b	@y[1],@x[1],0
	vori.b	@y[2],@x[2],0
	vori.b	@y[3],@x[3],0
	vori.b	@y[4],@x[4],0
	vori.b	@y[5],@x[5],0
	vori.b	@y[6],@x[6],0
	vori.b	@y[7],@x[7],0
	vori.b	@y[8],@x[8],0
	vori.b	@y[9],@x[9],0
	vori.b	@y[10],@x[10],0
	vori.b	@y[11],@x[11],0
	vori.b	@y[12],@x[12],0
	vori.b	@y[13],@x[13],0
	vori.b	@y[14],@x[14],0
	vori.b	@y[15],@x[15],0

	# Update states in \@x[*] for 20 rounds
	ori	$t8,$zero,10
	b	.Loop_4x

.align 5
.Loop_4x:
EOF

&ROUND_4x (0, 4, 8, 12);
&ROUND_4x (0, 5, 10, 15);

$code .= <<EOF;
	addi.w	$t8,$t8,-1
	bnez	$t8,.Loop_4x

	# Get the final states by adding the initial states
	vadd.w	@x[0],@x[0],@y[0]
	vadd.w	@x[1],@x[1],@y[1]
	vadd.w	@x[2],@x[2],@y[2]
	vadd.w	@x[3],@x[3],@y[3]
	vadd.w	@x[4],@x[4],@y[4]
	vadd.w	@x[5],@x[5],@y[5]
	vadd.w	@x[6],@x[6],@y[6]
	vadd.w	@x[7],@x[7],@y[7]
	vadd.w	@x[8],@x[8],@y[8]
	vadd.w	@x[9],@x[9],@y[9]
	vadd.w	@x[10],@x[10],@y[10]
	vadd.w	@x[11],@x[11],@y[11]
	vadd.w	@x[12],@x[12],@y[12]
	vadd.w	@x[13],@x[13],@y[13]
	vadd.w	@x[14],@x[14],@y[14]
	vadd.w	@x[15],@x[15],@y[15]

	# Get the transpose of \@x[*] and save it in \@x[*]
	vilvl.w	@y[0],@x[1],@x[0]
	vilvh.w	@y[1],@x[1],@x[0]
	vilvl.w	@y[2],@x[3],@x[2]
	vilvh.w	@y[3],@x[3],@x[2]
	vilvl.w	@y[4],@x[5],@x[4]
	vilvh.w	@y[5],@x[5],@x[4]
	vilvl.w	@y[6],@x[7],@x[6]
	vilvh.w	@y[7],@x[7],@x[6]
	vilvl.w	@y[8],@x[9],@x[8]
	vilvh.w	@y[9],@x[9],@x[8]
	vilvl.w	@y[10],@x[11],@x[10]
	vilvh.w	@y[11],@x[11],@x[10]
	vilvl.w	@y[12],@x[13],@x[12]
	vilvh.w	@y[13],@x[13],@x[12]
	vilvl.w	@y[14],@x[15],@x[14]
	vilvh.w	@y[15],@x[15],@x[14]

	vilvl.d	@x[0],@y[2],@y[0]
	vilvh.d	@x[1],@y[2],@y[0]
	vilvl.d	@x[2],@y[3],@y[1]
	vilvh.d	@x[3],@y[3],@y[1]
	vilvl.d	@x[4],@y[6],@y[4]
	vilvh.d	@x[5],@y[6],@y[4]
	vilvl.d	@x[6],@y[7],@y[5]
	vilvh.d	@x[7],@y[7],@y[5]
	vilvl.d	@x[8],@y[10],@y[8]
	vilvh.d	@x[9],@y[10],@y[8]
	vilvl.d	@x[10],@y[11],@y[9]
	vilvh.d	@x[11],@y[11],@y[9]
	vilvl.d	@x[12],@y[14],@y[12]
	vilvh.d	@x[13],@y[14],@y[12]
	vilvl.d	@x[14],@y[15],@y[13]
	vilvh.d	@x[15],@y[15],@y[13]
EOF

# Adjust the order of elements in @x[*] for ease of use.
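# After the interleaves above, @x[0],@x[4],@x[8],@x[12] hold the four
# 16-byte quarters of block 0, @x[1],@x[5],@x[9],@x[13] those of block 1,
# and so on. Reindexing the Perl array so that @x[4*i..4*i+3] covers
# block i lets the code below walk registers and memory in the same order.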
@x = (@x[0],@x[4],@x[8],@x[12],@x[1],@x[5],@x[9],@x[13],
      @x[2],@x[6],@x[10],@x[14],@x[3],@x[7],@x[11],@x[15]);

$code .= <<EOF;
	ori	$t8,$zero,64*4
	bltu	$len,$t8,.Ltail_4x

	# Get the encrypted message by xoring the states with the plaintext
	vld	@y[0],$inp,16*0
	vld	@y[1],$inp,16*1
	vld	@y[2],$inp,16*2
	vld	@y[3],$inp,16*3
	vxor.v	@y[0],@y[0],@x[0]
	vxor.v	@y[1],@y[1],@x[1]
	vxor.v	@y[2],@y[2],@x[2]
	vxor.v	@y[3],@y[3],@x[3]
	vst	@y[0],$out,16*0
	vst	@y[1],$out,16*1
	vst	@y[2],$out,16*2
	vst	@y[3],$out,16*3

	vld	@y[0],$inp,16*4
	vld	@y[1],$inp,16*5
	vld	@y[2],$inp,16*6
	vld	@y[3],$inp,16*7
	vxor.v	@y[0],@y[0],@x[4]
	vxor.v	@y[1],@y[1],@x[5]
	vxor.v	@y[2],@y[2],@x[6]
	vxor.v	@y[3],@y[3],@x[7]
	vst	@y[0],$out,16*4
	vst	@y[1],$out,16*5
	vst	@y[2],$out,16*6
	vst	@y[3],$out,16*7

	vld	@y[0],$inp,16*8
	vld	@y[1],$inp,16*9
	vld	@y[2],$inp,16*10
	vld	@y[3],$inp,16*11
	vxor.v	@y[0],@y[0],@x[8]
	vxor.v	@y[1],@y[1],@x[9]
	vxor.v	@y[2],@y[2],@x[10]
	vxor.v	@y[3],@y[3],@x[11]
	vst	@y[0],$out,16*8
	vst	@y[1],$out,16*9
	vst	@y[2],$out,16*10
	vst	@y[3],$out,16*11

	vld	@y[0],$inp,16*12
	vld	@y[1],$inp,16*13
	vld	@y[2],$inp,16*14
	vld	@y[3],$inp,16*15
	vxor.v	@y[0],@y[0],@x[12]
	vxor.v	@y[1],@y[1],@x[13]
	vxor.v	@y[2],@y[2],@x[14]
	vxor.v	@y[3],@y[3],@x[15]
	vst	@y[0],$out,16*12
	vst	@y[1],$out,16*13
	vst	@y[2],$out,16*14
	vst	@y[3],$out,16*15

	addi.d	$len,$len,-64*4
	beqz	$len,.Ldone_4x
	addi.d	$inp,$inp,64*4
	addi.d	$out,$out,64*4
	addi.w	$t4,$t4,4
	b	.Loop_outer_4x

.Ltail_4x:
	# Handle the tail for 4x (1 <= tail_len <= 255)
	ori	$t8,$zero,192
	bgeu	$len,$t8,.L192_or_more4x
	ori	$t8,$zero,128
	bgeu	$len,$t8,.L128_or_more4x
	ori	$t8,$zero,64
	bgeu	$len,$t8,.L64_or_more4x

	vst	@x[0],$sp,16*0
	vst	@x[1],$sp,16*1
	vst	@x[2],$sp,16*2
	vst	@x[3],$sp,16*3
	move	$t8,$zero
	b	.Loop_tail_4x

.align 5
.L64_or_more4x:
	vld	@y[0],$inp,16*0
	vld	@y[1],$inp,16*1
	vld	@y[2],$inp,16*2
	vld	@y[3],$inp,16*3
	vxor.v	@y[0],@y[0],@x[0]
	vxor.v	@y[1],@y[1],@x[1]
	vxor.v	@y[2],@y[2],@x[2]
	vxor.v	@y[3],@y[3],@x[3]
	vst	@y[0],$out,16*0
	vst	@y[1],$out,16*1
	vst	@y[2],$out,16*2
	vst	@y[3],$out,16*3

	addi.d	$len,$len,-64
	beqz	$len,.Ldone_4x
	addi.d	$inp,$inp,64
	addi.d	$out,$out,64
	vst	@x[4],$sp,16*0
	vst	@x[5],$sp,16*1
	vst	@x[6],$sp,16*2
	vst	@x[7],$sp,16*3
	move	$t8,$zero
	b	.Loop_tail_4x

.align 5
.L128_or_more4x:
	vld	@y[0],$inp,16*0
	vld	@y[1],$inp,16*1
	vld	@y[2],$inp,16*2
	vld	@y[3],$inp,16*3
	vxor.v	@y[0],@y[0],@x[0]
	vxor.v	@y[1],@y[1],@x[1]
	vxor.v	@y[2],@y[2],@x[2]
	vxor.v	@y[3],@y[3],@x[3]
	vst	@y[0],$out,16*0
	vst	@y[1],$out,16*1
	vst	@y[2],$out,16*2
	vst	@y[3],$out,16*3

	vld	@y[0],$inp,16*4
	vld	@y[1],$inp,16*5
	vld	@y[2],$inp,16*6
	vld	@y[3],$inp,16*7
	vxor.v	@y[0],@y[0],@x[4]
	vxor.v	@y[1],@y[1],@x[5]
	vxor.v	@y[2],@y[2],@x[6]
	vxor.v	@y[3],@y[3],@x[7]
	vst	@y[0],$out,16*4
	vst	@y[1],$out,16*5
	vst	@y[2],$out,16*6
	vst	@y[3],$out,16*7

	addi.d	$len,$len,-128
	beqz	$len,.Ldone_4x
	addi.d	$inp,$inp,128
	addi.d	$out,$out,128
	vst	@x[8],$sp,16*0
	vst	@x[9],$sp,16*1
	vst	@x[10],$sp,16*2
	vst	@x[11],$sp,16*3
	move	$t8,$zero
	b	.Loop_tail_4x

.align 5
.L192_or_more4x:
	vld	@y[0],$inp,16*0
	vld	@y[1],$inp,16*1
	vld	@y[2],$inp,16*2
	vld	@y[3],$inp,16*3
	vxor.v	@y[0],@y[0],@x[0]
	vxor.v	@y[1],@y[1],@x[1]
	vxor.v	@y[2],@y[2],@x[2]
	vxor.v	@y[3],@y[3],@x[3]
	vst	@y[0],$out,16*0
	vst	@y[1],$out,16*1
	vst	@y[2],$out,16*2
	vst	@y[3],$out,16*3

	vld	@y[0],$inp,16*4
	vld	@y[1],$inp,16*5
	vld	@y[2],$inp,16*6
	vld	@y[3],$inp,16*7
	vxor.v	@y[0],@y[0],@x[4]
	vxor.v	@y[1],@y[1],@x[5]
	vxor.v	@y[2],@y[2],@x[6]
	vxor.v	@y[3],@y[3],@x[7]
	vst	@y[0],$out,16*4
	vst	@y[1],$out,16*5
	vst	@y[2],$out,16*6
	vst	@y[3],$out,16*7

	vld	@y[0],$inp,16*8
	vld	@y[1],$inp,16*9
	vld	@y[2],$inp,16*10
	vld	@y[3],$inp,16*11
	vxor.v	@y[0],@y[0],@x[8]
	vxor.v	@y[1],@y[1],@x[9]
	vxor.v	@y[2],@y[2],@x[10]
	vxor.v	@y[3],@y[3],@x[11]
	vst	@y[0],$out,16*8
	vst	@y[1],$out,16*9
	vst	@y[2],$out,16*10
	vst	@y[3],$out,16*11

	addi.d	$len,$len,-192
	beqz	$len,.Ldone_4x
	addi.d	$inp,$inp,192
	addi.d	$out,$out,192
	vst	@x[12],$sp,16*0
	vst	@x[13],$sp,16*1
	vst	@x[14],$sp,16*2
	vst	@x[15],$sp,16*3
	move	$t8,$zero
	b	.Loop_tail_4x

.Loop_tail_4x:
	# Xor input with states byte by byte
	ldx.bu	$t5,$inp,$t8
	ldx.bu	$t6,$sp,$t8
	xor	$t5,$t5,$t6
	stx.b	$t5,$out,$t8
	addi.w	$t8,$t8,1
	addi.d	$len,$len,-1
	bnez	$len,.Loop_tail_4x
	b	.Ldone_4x

.Ldone_4x:
	addi.d	$sp,$sp,128
	b	.Lrestore_saved_fpr

EOF
}

########################################################################
# 256-bit LASX code path that handles all lengths.
{
# Load the initial states in array @x[*] and update directly.
my @x = ($xr0, $xr1, $xr2, $xr3, $xr4, $xr5, $xr6, $xr7,
         $xr8, $xr9, $xr10, $xr11, $xr12, $xr13, $xr14, $xr15);

# Save the initial states in array @y[*]
my @y = ($xr16, $xr17, $xr18, $xr19, $xr20, $xr21, $xr22, $xr23,
         $xr24, $xr25, $xr26, $xr27, $xr28, $xr29, $xr30, $xr31);

sub ROUND_8x {
	my ($a0,$b0,$c0,$d0) = @_;
	my ($a1,$b1,$c1,$d1) = map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
	my ($a2,$b2,$c2,$d2) = map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
	my ($a3,$b3,$c3,$d3) = map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

$code .= <<EOF;
	xvadd.w		@x[$a0],@x[$a0],@x[$b0]
	xvxor.v		@x[$d0],@x[$d0],@x[$a0]
	xvrotri.w	@x[$d0],@x[$d0],16	# rotate left 16 bits
	xvadd.w		@x[$a1],@x[$a1],@x[$b1]
	xvxor.v		@x[$d1],@x[$d1],@x[$a1]
	xvrotri.w	@x[$d1],@x[$d1],16

	xvadd.w		@x[$c0],@x[$c0],@x[$d0]
	xvxor.v		@x[$b0],@x[$b0],@x[$c0]
	xvrotri.w	@x[$b0],@x[$b0],20	# rotate left 12 bits
	xvadd.w		@x[$c1],@x[$c1],@x[$d1]
	xvxor.v		@x[$b1],@x[$b1],@x[$c1]
	xvrotri.w	@x[$b1],@x[$b1],20

	xvadd.w		@x[$a0],@x[$a0],@x[$b0]
	xvxor.v		@x[$d0],@x[$d0],@x[$a0]
	xvrotri.w	@x[$d0],@x[$d0],24	# rotate left 8 bits
	xvadd.w		@x[$a1],@x[$a1],@x[$b1]
	xvxor.v		@x[$d1],@x[$d1],@x[$a1]
	xvrotri.w	@x[$d1],@x[$d1],24

	xvadd.w		@x[$c0],@x[$c0],@x[$d0]
	xvxor.v		@x[$b0],@x[$b0],@x[$c0]
	xvrotri.w	@x[$b0],@x[$b0],25	# rotate left 7 bits
	xvadd.w		@x[$c1],@x[$c1],@x[$d1]
	xvxor.v		@x[$b1],@x[$b1],@x[$c1]
	xvrotri.w	@x[$b1],@x[$b1],25

	xvadd.w		@x[$a2],@x[$a2],@x[$b2]
	xvxor.v		@x[$d2],@x[$d2],@x[$a2]
	xvrotri.w	@x[$d2],@x[$d2],16
	xvadd.w		@x[$a3],@x[$a3],@x[$b3]
	xvxor.v		@x[$d3],@x[$d3],@x[$a3]
	xvrotri.w	@x[$d3],@x[$d3],16

	xvadd.w		@x[$c2],@x[$c2],@x[$d2]
	xvxor.v		@x[$b2],@x[$b2],@x[$c2]
	xvrotri.w	@x[$b2],@x[$b2],20
	xvadd.w		@x[$c3],@x[$c3],@x[$d3]
	xvxor.v		@x[$b3],@x[$b3],@x[$c3]
	xvrotri.w	@x[$b3],@x[$b3],20

	xvadd.w		@x[$a2],@x[$a2],@x[$b2]
	xvxor.v		@x[$d2],@x[$d2],@x[$a2]
	xvrotri.w	@x[$d2],@x[$d2],24
	xvadd.w		@x[$a3],@x[$a3],@x[$b3]
	xvxor.v		@x[$d3],@x[$d3],@x[$a3]
	xvrotri.w	@x[$d3],@x[$d3],24

	xvadd.w		@x[$c2],@x[$c2],@x[$d2]
	xvxor.v		@x[$b2],@x[$b2],@x[$c2]
	xvrotri.w	@x[$b2],@x[$b2],25
	xvadd.w		@x[$c3],@x[$c3],@x[$d3]
	xvxor.v		@x[$b3],@x[$b3],@x[$c3]
	xvrotri.w	@x[$b3],@x[$b3],25

EOF
}

$code .= <<EOF;
.align 6
.LChaCha20_8x:
	addi.d	$sp,$sp,-128

	# Save the initial block counter in $t4
	ld.w	$t4,$counter,0
	b	.Loop_outer_8x

.align 5
.Loop_outer_8x:
	# Load constants
	la.local	$t8,.Lsigma
	xvldrepl.w	@x[0],$t8,4*0		# 'expa'
	xvldrepl.w	@x[1],$t8,4*1		# 'nd 3'
	xvldrepl.w	@x[2],$t8,4*2		# '2-by'
	xvldrepl.w	@x[3],$t8,4*3		# 'te k'

	# Load key
	xvldrepl.w	@x[4],$key,4*0
	xvldrepl.w	@x[5],$key,4*1
	xvldrepl.w	@x[6],$key,4*2
	xvldrepl.w	@x[7],$key,4*3
	xvldrepl.w	@x[8],$key,4*4
	xvldrepl.w	@x[9],$key,4*5
	xvldrepl.w	@x[10],$key,4*6
	xvldrepl.w	@x[11],$key,4*7

	# Load block counter
	xvreplgr2vr.w	@x[12],$t4

	# Load nonce
	xvldrepl.w	@x[13],$counter,4*1
	xvldrepl.w	@x[14],$counter,4*2
	xvldrepl.w	@x[15],$counter,4*3

	# Get the correct block counter for each block
	la.local	$t8,.Linc8x
	xvld	@y[0],$t8,0
	xvadd.w	@x[12],@x[12],@y[0]

	# Copy the initial states from \@x[*] to \@y[*]
	xvori.b	@y[0],@x[0],0
	xvori.b	@y[1],@x[1],0
	xvori.b	@y[2],@x[2],0
	xvori.b	@y[3],@x[3],0
	xvori.b	@y[4],@x[4],0
	xvori.b	@y[5],@x[5],0
	xvori.b	@y[6],@x[6],0
	xvori.b	@y[7],@x[7],0
	xvori.b	@y[8],@x[8],0
	xvori.b	@y[9],@x[9],0
	xvori.b	@y[10],@x[10],0
	xvori.b	@y[11],@x[11],0
	xvori.b	@y[12],@x[12],0
	xvori.b	@y[13],@x[13],0
	xvori.b	@y[14],@x[14],0
	xvori.b	@y[15],@x[15],0

	# Update states in \@x[*] for 20 rounds
	ori	$t8,$zero,10
	b	.Loop_8x

.align 5
.Loop_8x:
EOF

&ROUND_8x (0, 4, 8, 12);
&ROUND_8x (0, 5, 10, 15);

$code .= <<EOF;
	addi.w	$t8,$t8,-1
	bnez	$t8,.Loop_8x

	# Get the final states by adding the initial states
	xvadd.w	@x[0],@x[0],@y[0]
	xvadd.w	@x[1],@x[1],@y[1]
	xvadd.w	@x[2],@x[2],@y[2]
	xvadd.w	@x[3],@x[3],@y[3]
	xvadd.w	@x[4],@x[4],@y[4]
	xvadd.w	@x[5],@x[5],@y[5]
	xvadd.w	@x[6],@x[6],@y[6]
	xvadd.w	@x[7],@x[7],@y[7]
	xvadd.w	@x[8],@x[8],@y[8]
	xvadd.w	@x[9],@x[9],@y[9]
	xvadd.w	@x[10],@x[10],@y[10]
	xvadd.w	@x[11],@x[11],@y[11]
	xvadd.w	@x[12],@x[12],@y[12]
	xvadd.w	@x[13],@x[13],@y[13]
	xvadd.w	@x[14],@x[14],@y[14]
	xvadd.w	@x[15],@x[15],@y[15]

	# Get the transpose of \@x[*] and save it in \@y[*]
	xvilvl.w	@y[0],@x[1],@x[0]
	xvilvh.w	@y[1],@x[1],@x[0]
	xvilvl.w	@y[2],@x[3],@x[2]
	xvilvh.w	@y[3],@x[3],@x[2]
	xvilvl.w	@y[4],@x[5],@x[4]
	xvilvh.w	@y[5],@x[5],@x[4]
	xvilvl.w	@y[6],@x[7],@x[6]
	xvilvh.w	@y[7],@x[7],@x[6]
	xvilvl.w	@y[8],@x[9],@x[8]
	xvilvh.w	@y[9],@x[9],@x[8]
	xvilvl.w	@y[10],@x[11],@x[10]
	xvilvh.w	@y[11],@x[11],@x[10]
	xvilvl.w	@y[12],@x[13],@x[12]
	xvilvh.w	@y[13],@x[13],@x[12]
	xvilvl.w	@y[14],@x[15],@x[14]
	xvilvh.w	@y[15],@x[15],@x[14]

	xvilvl.d	@x[0],@y[2],@y[0]
	xvilvh.d	@x[1],@y[2],@y[0]
	xvilvl.d	@x[2],@y[3],@y[1]
	xvilvh.d	@x[3],@y[3],@y[1]
	xvilvl.d	@x[4],@y[6],@y[4]
	xvilvh.d	@x[5],@y[6],@y[4]
	xvilvl.d	@x[6],@y[7],@y[5]
	xvilvh.d	@x[7],@y[7],@y[5]
	xvilvl.d	@x[8],@y[10],@y[8]
	xvilvh.d	@x[9],@y[10],@y[8]
	xvilvl.d	@x[10],@y[11],@y[9]
	xvilvh.d	@x[11],@y[11],@y[9]
	xvilvl.d	@x[12],@y[14],@y[12]
	xvilvh.d	@x[13],@y[14],@y[12]
	xvilvl.d	@x[14],@y[15],@y[13]
	xvilvh.d	@x[15],@y[15],@y[13]

	xvori.b	@y[0],@x[4],0
	xvpermi.q	@y[0],@x[0],0x20
	xvori.b	@y[1],@x[5],0
	xvpermi.q	@y[1],@x[1],0x20
	xvori.b	@y[2],@x[6],0
	xvpermi.q	@y[2],@x[2],0x20
	xvori.b	@y[3],@x[7],0
	xvpermi.q	@y[3],@x[3],0x20
	xvori.b	@y[4],@x[4],0
	xvpermi.q	@y[4],@x[0],0x31
	xvori.b	@y[5],@x[5],0
	xvpermi.q	@y[5],@x[1],0x31
	xvori.b	@y[6],@x[6],0
	xvpermi.q	@y[6],@x[2],0x31
	xvori.b	@y[7],@x[7],0
	xvpermi.q	@y[7],@x[3],0x31
	xvori.b	@y[8],@x[12],0
	xvpermi.q	@y[8],@x[8],0x20
	xvori.b	@y[9],@x[13],0
	xvpermi.q	@y[9],@x[9],0x20
	xvori.b	@y[10],@x[14],0
	xvpermi.q	@y[10],@x[10],0x20
	xvori.b	@y[11],@x[15],0
	xvpermi.q	@y[11],@x[11],0x20
	xvori.b	@y[12],@x[12],0
	xvpermi.q	@y[12],@x[8],0x31
	xvori.b	@y[13],@x[13],0
	xvpermi.q	@y[13],@x[9],0x31
	xvori.b	@y[14],@x[14],0
	xvpermi.q	@y[14],@x[10],0x31
	xvori.b	@y[15],@x[15],0
	xvpermi.q	@y[15],@x[11],0x31

EOF

# Adjust the order of elements in @y[*] for ease of use.
@y = (@y[0],@y[8],@y[1],@y[9],@y[2],@y[10],@y[3],@y[11],
      @y[4],@y[12],@y[5],@y[13],@y[6],@y[14],@y[7],@y[15]);

$code .= <<EOF;
	ori	$t8,$zero,64*8
	bltu	$len,$t8,.Ltail_8x

	# Get the encrypted message by xoring the states with the plaintext
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	xvld	@x[0],$inp,32*4
	xvld	@x[1],$inp,32*5
	xvld	@x[2],$inp,32*6
	xvld	@x[3],$inp,32*7
	xvxor.v	@x[0],@x[0],@y[4]
	xvxor.v	@x[1],@x[1],@y[5]
	xvxor.v	@x[2],@x[2],@y[6]
	xvxor.v	@x[3],@x[3],@y[7]
	xvst	@x[0],$out,32*4
	xvst	@x[1],$out,32*5
	xvst	@x[2],$out,32*6
	xvst	@x[3],$out,32*7

	xvld	@x[0],$inp,32*8
	xvld	@x[1],$inp,32*9
	xvld	@x[2],$inp,32*10
	xvld	@x[3],$inp,32*11
	xvxor.v	@x[0],@x[0],@y[8]
	xvxor.v	@x[1],@x[1],@y[9]
	xvxor.v	@x[2],@x[2],@y[10]
	xvxor.v	@x[3],@x[3],@y[11]
	xvst	@x[0],$out,32*8
	xvst	@x[1],$out,32*9
	xvst	@x[2],$out,32*10
	xvst	@x[3],$out,32*11

	xvld	@x[0],$inp,32*12
	xvld	@x[1],$inp,32*13
	xvld	@x[2],$inp,32*14
	xvld	@x[3],$inp,32*15
	xvxor.v	@x[0],@x[0],@y[12]
	xvxor.v	@x[1],@x[1],@y[13]
	xvxor.v	@x[2],@x[2],@y[14]
	xvxor.v	@x[3],@x[3],@y[15]
	xvst	@x[0],$out,32*12
	xvst	@x[1],$out,32*13
	xvst	@x[2],$out,32*14
	xvst	@x[3],$out,32*15

	addi.d	$len,$len,-64*8
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,64*8
	addi.d	$out,$out,64*8
	addi.w	$t4,$t4,8
	b	.Loop_outer_8x

.Ltail_8x:
	# Handle the tail for 8x (1 <= tail_len <= 511)
	ori	$t8,$zero,448
	bgeu	$len,$t8,.L448_or_more8x
	ori	$t8,$zero,384
	bgeu	$len,$t8,.L384_or_more8x
	ori	$t8,$zero,320
	bgeu	$len,$t8,.L320_or_more8x
	ori	$t8,$zero,256
	bgeu	$len,$t8,.L256_or_more8x
	ori	$t8,$zero,192
	bgeu	$len,$t8,.L192_or_more8x
	ori	$t8,$zero,128
	bgeu	$len,$t8,.L128_or_more8x
	ori	$t8,$zero,64
	bgeu	$len,$t8,.L64_or_more8x

	xvst	@y[0],$sp,32*0
	xvst	@y[1],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L64_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1

	addi.d	$len,$len,-64
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,64
	addi.d	$out,$out,64
	xvst	@y[2],$sp,32*0
	xvst	@y[3],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L128_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	addi.d	$len,$len,-128
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,128
	addi.d	$out,$out,128
	xvst	@y[4],$sp,32*0
	xvst	@y[5],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L192_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	xvld	@x[0],$inp,32*4
	xvld	@x[1],$inp,32*5
	xvxor.v	@x[0],@x[0],@y[4]
	xvxor.v	@x[1],@x[1],@y[5]
	xvst	@x[0],$out,32*4
	xvst	@x[1],$out,32*5

	addi.d	$len,$len,-192
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,192
	addi.d	$out,$out,192
	xvst	@y[6],$sp,32*0
	xvst	@y[7],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L256_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	xvld	@x[0],$inp,32*4
	xvld	@x[1],$inp,32*5
	xvld	@x[2],$inp,32*6
	xvld	@x[3],$inp,32*7
	xvxor.v	@x[0],@x[0],@y[4]
	xvxor.v	@x[1],@x[1],@y[5]
	xvxor.v	@x[2],@x[2],@y[6]
	xvxor.v	@x[3],@x[3],@y[7]
	xvst	@x[0],$out,32*4
	xvst	@x[1],$out,32*5
	xvst	@x[2],$out,32*6
	xvst	@x[3],$out,32*7

	addi.d	$len,$len,-256
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,256
	addi.d	$out,$out,256
	xvst	@y[8],$sp,32*0
	xvst	@y[9],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L320_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	xvld	@x[0],$inp,32*4
	xvld	@x[1],$inp,32*5
	xvld	@x[2],$inp,32*6
	xvld	@x[3],$inp,32*7
	xvxor.v	@x[0],@x[0],@y[4]
	xvxor.v	@x[1],@x[1],@y[5]
	xvxor.v	@x[2],@x[2],@y[6]
	xvxor.v	@x[3],@x[3],@y[7]
	xvst	@x[0],$out,32*4
	xvst	@x[1],$out,32*5
	xvst	@x[2],$out,32*6
	xvst	@x[3],$out,32*7

	xvld	@x[0],$inp,32*8
	xvld	@x[1],$inp,32*9
	xvxor.v	@x[0],@x[0],@y[8]
	xvxor.v	@x[1],@x[1],@y[9]
	xvst	@x[0],$out,32*8
	xvst	@x[1],$out,32*9

	addi.d	$len,$len,-320
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,320
	addi.d	$out,$out,320
	xvst	@y[10],$sp,32*0
	xvst	@y[11],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L384_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	xvld	@x[0],$inp,32*4
	xvld	@x[1],$inp,32*5
	xvld	@x[2],$inp,32*6
	xvld	@x[3],$inp,32*7
	xvxor.v	@x[0],@x[0],@y[4]
	xvxor.v	@x[1],@x[1],@y[5]
	xvxor.v	@x[2],@x[2],@y[6]
	xvxor.v	@x[3],@x[3],@y[7]
	xvst	@x[0],$out,32*4
	xvst	@x[1],$out,32*5
	xvst	@x[2],$out,32*6
	xvst	@x[3],$out,32*7

	xvld	@x[0],$inp,32*8
	xvld	@x[1],$inp,32*9
	xvld	@x[2],$inp,32*10
	xvld	@x[3],$inp,32*11
	xvxor.v	@x[0],@x[0],@y[8]
	xvxor.v	@x[1],@x[1],@y[9]
	xvxor.v	@x[2],@x[2],@y[10]
	xvxor.v	@x[3],@x[3],@y[11]
	xvst	@x[0],$out,32*8
	xvst	@x[1],$out,32*9
	xvst	@x[2],$out,32*10
	xvst	@x[3],$out,32*11

	addi.d	$len,$len,-384
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,384
	addi.d	$out,$out,384
	xvst	@y[12],$sp,32*0
	xvst	@y[13],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.align 5
.L448_or_more8x:
	xvld	@x[0],$inp,32*0
	xvld	@x[1],$inp,32*1
	xvld	@x[2],$inp,32*2
	xvld	@x[3],$inp,32*3
	xvxor.v	@x[0],@x[0],@y[0]
	xvxor.v	@x[1],@x[1],@y[1]
	xvxor.v	@x[2],@x[2],@y[2]
	xvxor.v	@x[3],@x[3],@y[3]
	xvst	@x[0],$out,32*0
	xvst	@x[1],$out,32*1
	xvst	@x[2],$out,32*2
	xvst	@x[3],$out,32*3

	xvld	@x[0],$inp,32*4
	xvld	@x[1],$inp,32*5
	xvld	@x[2],$inp,32*6
	xvld	@x[3],$inp,32*7
	xvxor.v	@x[0],@x[0],@y[4]
	xvxor.v	@x[1],@x[1],@y[5]
	xvxor.v	@x[2],@x[2],@y[6]
	xvxor.v	@x[3],@x[3],@y[7]
	xvst	@x[0],$out,32*4
	xvst	@x[1],$out,32*5
	xvst	@x[2],$out,32*6
	xvst	@x[3],$out,32*7

	xvld	@x[0],$inp,32*8
	xvld	@x[1],$inp,32*9
	xvld	@x[2],$inp,32*10
	xvld	@x[3],$inp,32*11
	xvxor.v	@x[0],@x[0],@y[8]
	xvxor.v	@x[1],@x[1],@y[9]
	xvxor.v	@x[2],@x[2],@y[10]
	xvxor.v	@x[3],@x[3],@y[11]
	xvst	@x[0],$out,32*8
	xvst	@x[1],$out,32*9
	xvst	@x[2],$out,32*10
	xvst	@x[3],$out,32*11

	xvld	@x[0],$inp,32*12
	xvld	@x[1],$inp,32*13
	xvxor.v	@x[0],@x[0],@y[12]
	xvxor.v	@x[1],@x[1],@y[13]
	xvst	@x[0],$out,32*12
	xvst	@x[1],$out,32*13

	addi.d	$len,$len,-448
	beqz	$len,.Ldone_8x
	addi.d	$inp,$inp,448
	addi.d	$out,$out,448
	xvst	@y[14],$sp,32*0
	xvst	@y[15],$sp,32*1
	move	$t8,$zero
	b	.Loop_tail_8x

.Loop_tail_8x:
	# Xor input with states byte by byte
	ldx.bu	$t5,$inp,$t8
	ldx.bu	$t6,$sp,$t8
	xor	$t5,$t5,$t6
	stx.b	$t5,$out,$t8
	addi.w	$t8,$t8,1
	addi.d	$len,$len,-1
	bnez	$len,.Loop_tail_8x
	b	.Ldone_8x

.Ldone_8x:
	addi.d	$sp,$sp,128
	b	.Lrestore_saved_fpr

EOF
}

$code .= <<EOF;
.Lrestore_saved_fpr:
	fld.d	$fs0,$sp,0
	fld.d	$fs1,$sp,8
	fld.d	$fs2,$sp,16
	fld.d	$fs3,$sp,24
	fld.d	$fs4,$sp,32
	fld.d	$fs5,$sp,40
	fld.d	$fs6,$sp,48
	fld.d	$fs7,$sp,56
	addi.d	$sp,$sp,64
.Lno_data:
.Lend:
	jr	$ra
.size ChaCha20_ctr32,.-ChaCha20_ctr32
EOF

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;