#! /usr/bin/env perl
# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2015
#
# ChaCha20 for PowerPC/AltiVec.
#
# June 2018
#
# Add VSX 2.07 code path. The original 3xAltiVec+1xIALU is well-suited
# for processors that can't issue more than one vector instruction per
# cycle. But POWER8 (and POWER9) can issue a pair, and a vector-only 4x
# interleave performs better there. Incidentally, PowerISA 2.07 (first
# implemented by POWER8) defined new usable instructions, hence the
# 4xVSX code path...
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.x	3xAltiVec+1xIALU	4xVSX
#
# Freescale e300	13.6/+115%	-			-
# PPC74x0/G4e		6.81/+310%	3.81			-
# PPC970/G5		9.29/+160%	?			-
# POWER7		8.62/+61%	3.35			-
# POWER8		8.70/+51%	2.91			2.09
# POWER9		8.80/+29%	4.44(*)			2.45(**)
#
# (*)	this is a trade-off result, it's possible to improve it, but
#	then it would negatively affect all others;
# (**)	POWER9 seems to be "allergic" to mixing vector and integer
#	instructions, which is why the switch to vector-only code pays
#	off that much;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
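# A minimal scalar sketch of the ChaCha20 quarter-round, for documentation
# only (an assumption of this comment block: it is never called by this
# generator, it just maps the vector code below back to the reference
# algorithm). The 16/12/8/7 rotate amounts are the same ones synthesized with
# vspltisw for the vrlw instructions in the VSX_lane_ROUND_* subroutines.
sub __rotl32_ref { my ($v,$n)=@_; return (($v<<$n)|($v>>(32-$n))) & 0xffffffff; }

sub __chacha_quarter_round_ref {
my ($qa,$qb,$qc,$qd)=@_;		# four 32-bit state words
	$qa=($qa+$qb)&0xffffffff; $qd=__rotl32_ref($qd^$qa,16);
	$qc=($qc+$qd)&0xffffffff; $qb=__rotl32_ref($qb^$qc,12);
	$qa=($qa+$qb)&0xffffffff; $qd=__rotl32_ref($qd^$qa,8);
	$qc=($qc+$qd)&0xffffffff; $qb=__rotl32_ref($qb^$qc,7);
	return ($qa,$qb,$qc,$qd);
}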

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+64+18*$SIZE_T;	# 64 is for local variables

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
    $code .= "\t$opcode\t".join(',',@_)."\n";
}

my $sp = "r1";

my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));


{{{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));
my @K = map("v$_",(16..19));
my $CTR = "v26";
my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);
my $beperm = "v31";

my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));

my $FRAME=$LOCALS+64+7*16;	# 7*16 is for v26-v31 offload


sub VSX_lane_ROUND_4x {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"v$_\"",(0..15));

	(
	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",
	"&vxor		(@x[$d1],@x[$d1],@x[$a1])",
	"&vxor		(@x[$d2],@x[$d2],@x[$a2])",
	"&vxor		(@x[$d3],@x[$d3],@x[$a3])",
	"&vrlw		(@x[$d0],@x[$d0],'$sixteen')",
	"&vrlw		(@x[$d1],@x[$d1],'$sixteen')",
	"&vrlw		(@x[$d2],@x[$d2],'$sixteen')",
	"&vrlw		(@x[$d3],@x[$d3],'$sixteen')",

	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",
	"&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
	"&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
	"&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",
	"&vxor		(@x[$b1],@x[$b1],@x[$c1])",
	"&vxor		(@x[$b2],@x[$b2],@x[$c2])",
	"&vxor		(@x[$b3],@x[$b3],@x[$c3])",
	"&vrlw		(@x[$b0],@x[$b0],'$twelve')",
	"&vrlw		(@x[$b1],@x[$b1],'$twelve')",
	"&vrlw		(@x[$b2],@x[$b2],'$twelve')",
	"&vrlw		(@x[$b3],@x[$b3],'$twelve')",

	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",
	"&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",
	"&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",
	"&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",
	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",
	"&vxor		(@x[$d1],@x[$d1],@x[$a1])",
	"&vxor		(@x[$d2],@x[$d2],@x[$a2])",
	"&vxor		(@x[$d3],@x[$d3],@x[$a3])",
	"&vrlw		(@x[$d0],@x[$d0],'$eight')",
	"&vrlw		(@x[$d1],@x[$d1],'$eight')",
	"&vrlw		(@x[$d2],@x[$d2],'$eight')",
	"&vrlw		(@x[$d3],@x[$d3],'$eight')",

	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",
	"&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
	"&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
	"&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",
	"&vxor		(@x[$b1],@x[$b1],@x[$c1])",
	"&vxor		(@x[$b2],@x[$b2],@x[$c2])",
	"&vxor		(@x[$b3],@x[$b3],@x[$c3])",
	"&vrlw		(@x[$b0],@x[$b0],'$seven')",
	"&vrlw		(@x[$b1],@x[$b1],'$seven')",
	"&vrlw		(@x[$b2],@x[$b2],'$seven')",
	"&vrlw		(@x[$b3],@x[$b3],'$seven')"
	);
}

$code.=<<___;

.globl	.ChaCha20_ctr32_vsx_p10
.align	5
.ChaCha20_ctr32_vsx_p10:
	${UCMP}i $len,255
	ble	.Not_greater_than_8x
	b	ChaCha20_ctr32_vsx_8x
.Not_greater_than_8x:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+64`
	li	r11,`31+$LOCALS+64`
	mfspr	r12,256
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$FRAME-4`($sp)		# save vrsave
	li	r12,-4096+63
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256,r12				# preserve 29 AltiVec registers

	bl	Lconsts				# returns pointer Lsigma in r12
	lvx_4w	@K[0],0,r12			# load sigma
	addi	r12,r12,0x70
	li	$x10,16
	li	$x20,32
	li	$x30,48
	li	r11,64

	lvx_4w	@K[1],0,$key			# load key
	lvx_4w	@K[2],$x10,$key
	lvx_4w	@K[3],0,$ctr			# load counter

	vxor	$xt0,$xt0,$xt0
	lvx_4w	$xt1,r11,r12
	vspltw	$CTR,@K[3],0
	vsldoi	@K[3],@K[3],$xt0,4
	vsldoi	@K[3],$xt0,@K[3],12		# clear @K[3].word[0]
	vadduwm	$CTR,$CTR,$xt1

	be?lvsl	$beperm,0,$x10			# 0x00..0f
	be?vspltisb $xt0,3			# 0x03..03
	be?vxor	$beperm,$beperm,$xt0		# swap bytes within words

	li	r0,10				# inner loop counter
	mtctr	r0
	b	Loop_outer_vsx

.align	5
Loop_outer_vsx:
	lvx	$xa0,$x00,r12			# load [smashed] sigma
	lvx	$xa1,$x10,r12
	lvx	$xa2,$x20,r12
	lvx	$xa3,$x30,r12

	vspltw	$xb0,@K[1],0			# smash the key
	vspltw	$xb1,@K[1],1
	vspltw	$xb2,@K[1],2
	vspltw	$xb3,@K[1],3

	vspltw	$xc0,@K[2],0
	vspltw	$xc1,@K[2],1
	vspltw	$xc2,@K[2],2
	vspltw	$xc3,@K[2],3

	vmr	$xd0,$CTR			# smash the counter
	vspltw	$xd1,@K[3],1
	vspltw	$xd2,@K[3],2
	vspltw	$xd3,@K[3],3

	vspltisw $sixteen,-16			# synthesize constants
	vspltisw $twelve,12
	vspltisw $eight,8
	vspltisw $seven,7

Loop_vsx_4x:
___
	foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; }
	foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; }
$code.=<<___;

	bdnz	Loop_vsx_4x

	vadduwm	$xd0,$xd0,$CTR

	vmrgew	$xt0,$xa0,$xa1			# transpose data
	vmrgew	$xt1,$xa2,$xa3
	vmrgow	$xa0,$xa0,$xa1
	vmrgow	$xa2,$xa2,$xa3
	vmrgew	$xt2,$xb0,$xb1
	vmrgew	$xt3,$xb2,$xb3
	vpermdi	$xa1,$xa0,$xa2,0b00
	vpermdi	$xa3,$xa0,$xa2,0b11
	vpermdi	$xa0,$xt0,$xt1,0b00
	vpermdi	$xa2,$xt0,$xt1,0b11

	vmrgow	$xb0,$xb0,$xb1
	vmrgow	$xb2,$xb2,$xb3
	vmrgew	$xt0,$xc0,$xc1
	vmrgew	$xt1,$xc2,$xc3
	vpermdi	$xb1,$xb0,$xb2,0b00
	vpermdi	$xb3,$xb0,$xb2,0b11
	vpermdi	$xb0,$xt2,$xt3,0b00
	vpermdi	$xb2,$xt2,$xt3,0b11

	vmrgow	$xc0,$xc0,$xc1
	vmrgow	$xc2,$xc2,$xc3
	vmrgew	$xt2,$xd0,$xd1
	vmrgew	$xt3,$xd2,$xd3
	vpermdi	$xc1,$xc0,$xc2,0b00
	vpermdi	$xc3,$xc0,$xc2,0b11
	vpermdi	$xc0,$xt0,$xt1,0b00
	vpermdi	$xc2,$xt0,$xt1,0b11

	vmrgow	$xd0,$xd0,$xd1
	vmrgow	$xd2,$xd2,$xd3
	vspltisw $xt0,4
	vadduwm	$CTR,$CTR,$xt0			# next counter value
	vpermdi	$xd1,$xd0,$xd2,0b00
	vpermdi	$xd3,$xd0,$xd2,0b11
	vpermdi	$xd0,$xt2,$xt3,0b00
	vpermdi	$xd2,$xt2,$xt3,0b11

	vadduwm	$xa0,$xa0,@K[0]
	vadduwm	$xb0,$xb0,@K[1]
	vadduwm	$xc0,$xc0,@K[2]
	vadduwm	$xd0,$xd0,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx

	vadduwm	$xa0,$xa1,@K[0]
	vadduwm	$xb0,$xb1,@K[1]
	vadduwm	$xc0,$xc1,@K[2]
	vadduwm	$xd0,$xd1,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx

	vadduwm	$xa0,$xa2,@K[0]
	vadduwm	$xb0,$xb2,@K[1]
	vadduwm	$xc0,$xc2,@K[2]
	vadduwm	$xd0,$xd2,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx

	vadduwm	$xa0,$xa3,@K[0]
	vadduwm	$xb0,$xb3,@K[1]
	vadduwm	$xc0,$xc3,@K[2]
	vadduwm	$xd0,$xd3,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	mtctr	r0
	bne	Loop_outer_vsx

Ldone_vsx:
	lwz	r12,`$FRAME-4`($sp)		# pull vrsave
	li	r10,`15+$LOCALS+64`
	li	r11,`31+$LOCALS+64`
	$POP	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256,r12				# restore vrsave
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr

.align	4
Ltail_vsx:
	addi	r11,$sp,$LOCALS
	mtctr	$len
	stvx_4w	$xa0,$x00,r11			# offload block to stack
	stvx_4w	$xb0,$x10,r11
	stvx_4w	$xc0,$x20,r11
	stvx_4w	$xd0,$x30,r11
	subi	r12,r11,1			# prepare for *++ptr
	subi	$inp,$inp,1
	subi	$out,$out,1

Loop_tail_vsx:
	lbzu	r6,1(r12)
	lbzu	r7,1($inp)
	xor	r6,r6,r7
	stbu	r6,1($out)
	bdnz	Loop_tail_vsx

	stvx_4w	$K[0],$x00,r11			# wipe copy of the block
	stvx_4w	$K[0],$x10,r11
	stvx_4w	$K[0],$x20,r11
	stvx_4w	$K[0],$x30,r11

	b	Ldone_vsx
	.long	0
	.byte	0,12,0x04,1,0x80,0,5,0
	.long	0
.size	.ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10
___
}}}

# This is the 8-block parallel implementation. The heart of the ChaCha round
# uses vector instructions that operate on VSR[32+X] (the VR half of the VSX
# register file), and holding eight blocks of state takes all 32 of those
# vector registers. A few register values therefore have to be parked aside
# so that VSR[32+X] stays available for the rotate constants and other
# intermediates used in the round; VSR[0]-VSR[31] serve as that holding area
# while the eight blocks are processed in parallel.
#
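# A small helper shown for illustration only (assumption: defined here for
# documentation, never called by the generator). It mirrors the
# ($_&~3)+(($_+1)&3) index step used by VSX_lane_ROUND_4x/_8x: keep the
# 4-aligned base of each state index and advance the position within that
# group mod 4, so starting from column (0,4,8,12) it walks the remaining
# columns, and starting from diagonal (0,5,10,15) it walks the remaining
# diagonals.
sub __next_lane_indices_ref {
	return map { ($_ & ~3) + (($_ + 1) & 3) } @_;
}
# e.g.	__next_lane_indices_ref(0,4,8,12)  -> (1,5,9,13)
#	__next_lane_indices_ref(0,5,10,15) -> (1,6,11,12)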
{{{
#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3,
    $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7,
    $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31));
my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15));
my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3));
my @K = map("v$_",27,(24..26));
my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31));
my $xr0 = "v4";
my $CTR0 = "v22";
my $CTR1 = "v5";
my $beperm = "v31";
my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7));
my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17));
my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21));
my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26));

my $FRAME=$LOCALS+64+9*16;	# 8*16 is for v24-v31 offload

sub VSX_lane_ROUND_8x {
my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4));
my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5));
my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6));
my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17));
my @x=map("\"v$_\"",(0..31));

	(
	"&vxxlor	($xv15 ,@x[$c7],@x[$c7])",	#copy v30 to v13
	"&vxxlorc	(@x[$c7], $xv9,$xv9)",

	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vadduwm	(@x[$a4],@x[$a4],@x[$b4])",	# Q1
	"&vadduwm	(@x[$a5],@x[$a5],@x[$b5])",	# Q2
	"&vadduwm	(@x[$a6],@x[$a6],@x[$b6])",	# Q3
	"&vadduwm	(@x[$a7],@x[$a7],@x[$b7])",	# Q4

	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",
	"&vxor		(@x[$d1],@x[$d1],@x[$a1])",
	"&vxor		(@x[$d2],@x[$d2],@x[$a2])",
	"&vxor		(@x[$d3],@x[$d3],@x[$a3])",
	"&vxor		(@x[$d4],@x[$d4],@x[$a4])",
	"&vxor		(@x[$d5],@x[$d5],@x[$a5])",
	"&vxor		(@x[$d6],@x[$d6],@x[$a6])",
	"&vxor		(@x[$d7],@x[$d7],@x[$a7])",

	"&vrlw		(@x[$d0],@x[$d0],@x[$c7])",
	"&vrlw		(@x[$d1],@x[$d1],@x[$c7])",
	"&vrlw		(@x[$d2],@x[$d2],@x[$c7])",
	"&vrlw		(@x[$d3],@x[$d3],@x[$c7])",
	"&vrlw		(@x[$d4],@x[$d4],@x[$c7])",
	"&vrlw		(@x[$d5],@x[$d5],@x[$c7])",
	"&vrlw		(@x[$d6],@x[$d6],@x[$c7])",
	"&vrlw		(@x[$d7],@x[$d7],@x[$c7])",

	"&vxxlor	($xv13 ,@x[$a7],@x[$a7])",
	"&vxxlorc	(@x[$c7], $xv15,$xv15)",
	"&vxxlorc	(@x[$a7], $xv10,$xv10)",

	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",
	"&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
	"&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
	"&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
	"&vadduwm	(@x[$c4],@x[$c4],@x[$d4])",
	"&vadduwm	(@x[$c5],@x[$c5],@x[$d5])",
	"&vadduwm	(@x[$c6],@x[$c6],@x[$d6])",
	"&vadduwm	(@x[$c7],@x[$c7],@x[$d7])",

	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",
	"&vxor		(@x[$b1],@x[$b1],@x[$c1])",
	"&vxor		(@x[$b2],@x[$b2],@x[$c2])",
	"&vxor		(@x[$b3],@x[$b3],@x[$c3])",
	"&vxor		(@x[$b4],@x[$b4],@x[$c4])",
	"&vxor		(@x[$b5],@x[$b5],@x[$c5])",
	"&vxor		(@x[$b6],@x[$b6],@x[$c6])",
	"&vxor		(@x[$b7],@x[$b7],@x[$c7])",

	"&vrlw		(@x[$b0],@x[$b0],@x[$a7])",
	"&vrlw		(@x[$b1],@x[$b1],@x[$a7])",
	"&vrlw		(@x[$b2],@x[$b2],@x[$a7])",
	"&vrlw		(@x[$b3],@x[$b3],@x[$a7])",
	"&vrlw		(@x[$b4],@x[$b4],@x[$a7])",
	"&vrlw		(@x[$b5],@x[$b5],@x[$a7])",
	"&vrlw		(@x[$b6],@x[$b6],@x[$a7])",
	"&vrlw		(@x[$b7],@x[$b7],@x[$a7])",

	"&vxxlorc	(@x[$a7], $xv13,$xv13)",
	"&vxxlor	($xv15 ,@x[$c7],@x[$c7])",
	"&vxxlorc	(@x[$c7], $xv11,$xv11)",


	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",
	"&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",
	"&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",
	"&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",
	"&vadduwm	(@x[$a4],@x[$a4],@x[$b4])",
	"&vadduwm	(@x[$a5],@x[$a5],@x[$b5])",
	"&vadduwm	(@x[$a6],@x[$a6],@x[$b6])",
	"&vadduwm	(@x[$a7],@x[$a7],@x[$b7])",

	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",
	"&vxor		(@x[$d1],@x[$d1],@x[$a1])",
	"&vxor		(@x[$d2],@x[$d2],@x[$a2])",
	"&vxor		(@x[$d3],@x[$d3],@x[$a3])",
	"&vxor		(@x[$d4],@x[$d4],@x[$a4])",
	"&vxor		(@x[$d5],@x[$d5],@x[$a5])",
	"&vxor		(@x[$d6],@x[$d6],@x[$a6])",
	"&vxor		(@x[$d7],@x[$d7],@x[$a7])",

	"&vrlw		(@x[$d0],@x[$d0],@x[$c7])",
	"&vrlw		(@x[$d1],@x[$d1],@x[$c7])",
	"&vrlw		(@x[$d2],@x[$d2],@x[$c7])",
	"&vrlw		(@x[$d3],@x[$d3],@x[$c7])",
	"&vrlw		(@x[$d4],@x[$d4],@x[$c7])",
	"&vrlw		(@x[$d5],@x[$d5],@x[$c7])",
	"&vrlw		(@x[$d6],@x[$d6],@x[$c7])",
	"&vrlw		(@x[$d7],@x[$d7],@x[$c7])",

	"&vxxlorc	(@x[$c7], $xv15,$xv15)",
	"&vxxlor	($xv13 ,@x[$a7],@x[$a7])",
	"&vxxlorc	(@x[$a7], $xv12,$xv12)",

	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",
	"&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
	"&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
	"&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
	"&vadduwm	(@x[$c4],@x[$c4],@x[$d4])",
	"&vadduwm	(@x[$c5],@x[$c5],@x[$d5])",
	"&vadduwm	(@x[$c6],@x[$c6],@x[$d6])",
	"&vadduwm	(@x[$c7],@x[$c7],@x[$d7])",
	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",
	"&vxor		(@x[$b1],@x[$b1],@x[$c1])",
	"&vxor		(@x[$b2],@x[$b2],@x[$c2])",
	"&vxor		(@x[$b3],@x[$b3],@x[$c3])",
	"&vxor		(@x[$b4],@x[$b4],@x[$c4])",
	"&vxor		(@x[$b5],@x[$b5],@x[$c5])",
	"&vxor		(@x[$b6],@x[$b6],@x[$c6])",
	"&vxor		(@x[$b7],@x[$b7],@x[$c7])",
	"&vrlw		(@x[$b0],@x[$b0],@x[$a7])",
	"&vrlw		(@x[$b1],@x[$b1],@x[$a7])",
	"&vrlw		(@x[$b2],@x[$b2],@x[$a7])",
	"&vrlw		(@x[$b3],@x[$b3],@x[$a7])",
	"&vrlw		(@x[$b4],@x[$b4],@x[$a7])",
	"&vrlw		(@x[$b5],@x[$b5],@x[$a7])",
	"&vrlw		(@x[$b6],@x[$b6],@x[$a7])",
	"&vrlw		(@x[$b7],@x[$b7],@x[$a7])",

	"&vxxlorc	(@x[$a7], $xv13,$xv13)",
	);
}

$code.=<<___;

.globl	.ChaCha20_ctr32_vsx_8x
.align	5
.ChaCha20_ctr32_vsx_8x:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+64`
	li	r11,`31+$LOCALS+64`
	mfspr	r12,256
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$FRAME-4`($sp)		# save vrsave
	li	r12,-4096+63
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256,r12				# preserve 29 AltiVec registers

	bl	Lconsts				# returns pointer Lsigma in r12

	lvx_4w	@K[0],0,r12			# load sigma
	addi	r12,r12,0x70
	li	$x10,16
	li	$x20,32
	li	$x30,48
	li	r11,64

	vspltisw $xa4,-16			# synthesize constants
	vspltisw $xb4,12			# synthesize constants
	vspltisw $xc4,8				# synthesize constants
	vspltisw $xd4,7				# synthesize constants

	lvx	$xa0,$x00,r12			# load [smashed] sigma
	lvx	$xa1,$x10,r12
	lvx	$xa2,$x20,r12
	lvx	$xa3,$x30,r12

	vxxlor	$xv9 ,$xa4,$xa4			# save shift val in vr9-12
	vxxlor	$xv10 ,$xb4,$xb4
	vxxlor	$xv11 ,$xc4,$xc4
	vxxlor	$xv12 ,$xd4,$xd4
	vxxlor	$xv22 ,$xa0,$xa0		# save sigma in vr22-25
	vxxlor	$xv23 ,$xa1,$xa1
	vxxlor	$xv24 ,$xa2,$xa2
	vxxlor	$xv25 ,$xa3,$xa3

	lvx_4w	@K[1],0,$key			# load key
	lvx_4w	@K[2],$x10,$key
	lvx_4w	@K[3],0,$ctr			# load counter
	vspltisw $xt3,4


	vxor	$xt2,$xt2,$xt2
	lvx_4w	$xt1,r11,r12
	vspltw	$xa2,@K[3],0			# save the original count after spltw
	vsldoi	@K[3],@K[3],$xt2,4
	vsldoi	@K[3],$xt2,@K[3],12		# clear @K[3].word[0]
	vadduwm	$xt1,$xa2,$xt1
	vadduwm	$xt3,$xt1,$xt3			# next counter value
	vspltw	$xa0,@K[2],2			# save the K[2] spltw 2 and save v8.

	be?lvsl	$beperm,0,$x10			# 0x00..0f
	be?vspltisb $xt0,3			# 0x03..03
	be?vxor	$beperm,$beperm,$xt0		# swap bytes within words
	be?vxxlor $xv26 ,$beperm,$beperm

	vxxlor	$xv0 ,@K[0],@K[0]		# K0,k1,k2 to vr0,1,2
	vxxlor	$xv1 ,@K[1],@K[1]
	vxxlor	$xv2 ,@K[2],@K[2]
	vxxlor	$xv3 ,@K[3],@K[3]
	vxxlor	$xv4 ,$xt1,$xt1			# CTR ->4, CTR+4-> 5
	vxxlor	$xv5 ,$xt3,$xt3
	vxxlor	$xv8 ,$xa0,$xa0

	li	r0,10				# inner loop counter
	mtctr	r0
	b	Loop_outer_vsx_8x

.align	5
Loop_outer_vsx_8x:
	vxxlorc	$xa0,$xv22,$xv22		# load [smashed] sigma
	vxxlorc	$xa1,$xv23,$xv23
	vxxlorc	$xa2,$xv24,$xv24
	vxxlorc	$xa3,$xv25,$xv25
	vxxlorc	$xa4,$xv22,$xv22
	vxxlorc	$xa5,$xv23,$xv23
	vxxlorc	$xa6,$xv24,$xv24
	vxxlorc	$xa7,$xv25,$xv25

	vspltw	$xb0,@K[1],0			# smash the key
	vspltw	$xb1,@K[1],1
	vspltw	$xb2,@K[1],2
	vspltw	$xb3,@K[1],3
	vspltw	$xb4,@K[1],0			# smash the key
	vspltw	$xb5,@K[1],1
	vspltw	$xb6,@K[1],2
	vspltw	$xb7,@K[1],3

	vspltw	$xc0,@K[2],0
	vspltw	$xc1,@K[2],1
	vspltw	$xc2,@K[2],2
	vspltw	$xc3,@K[2],3
	vspltw	$xc4,@K[2],0
	vspltw	$xc7,@K[2],3
	vspltw	$xc5,@K[2],1

	vxxlorc	$xd0,$xv4,$xv4			# smash the counter
	vspltw	$xd1,@K[3],1
	vspltw	$xd2,@K[3],2
	vspltw	$xd3,@K[3],3
	vxxlorc	$xd4,$xv5,$xv5			# smash the counter
	vspltw	$xd5,@K[3],1
	vspltw	$xd6,@K[3],2
	vspltw	$xd7,@K[3],3
	vxxlorc	$xc6,$xv8,$xv8			# copy of vspltw k[2],2 is in v8; v26 -> k[3], so wait until k[3] is done

Loop_vsx_8x:
___
	foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; }
	foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; }
$code.=<<___;

	bdnz	Loop_vsx_8x
	vxxlor	$xv13 ,$xd4,$xd4		# save the register vr24-31
	vxxlor	$xv14 ,$xd5,$xd5		#
	vxxlor	$xv15 ,$xd6,$xd6		#
	vxxlor	$xv16 ,$xd7,$xd7		#

	vxxlor	$xv18 ,$xc4,$xc4		#
	vxxlor	$xv19 ,$xc5,$xc5		#
	vxxlor	$xv20 ,$xc6,$xc6		#
	vxxlor	$xv21 ,$xc7,$xc7		#

	vxxlor	$xv6 ,$xb6,$xb6			# save vr23, so we get 8 regs
	vxxlor	$xv7 ,$xb7,$xb7			# save vr23, so we get 8 regs
	be?vxxlorc $beperm,$xv26,$xv26		# copy back the beperm.

	vxxlorc	@K[0],$xv0,$xv0			#27
	vxxlorc	@K[1],$xv1,$xv1			#24
	vxxlorc	@K[2],$xv2,$xv2			#25
	vxxlorc	@K[3],$xv3,$xv3			#26
	vxxlorc	$CTR0,$xv4,$xv4
###changing to vertical

	vmrgew	$xt0,$xa0,$xa1			# transpose data
	vmrgew	$xt1,$xa2,$xa3
	vmrgow	$xa0,$xa0,$xa1
	vmrgow	$xa2,$xa2,$xa3

	vmrgew	$xt2,$xb0,$xb1
	vmrgew	$xt3,$xb2,$xb3
	vmrgow	$xb0,$xb0,$xb1
	vmrgow	$xb2,$xb2,$xb3

	vadduwm	$xd0,$xd0,$CTR0

	vpermdi	$xa1,$xa0,$xa2,0b00
	vpermdi	$xa3,$xa0,$xa2,0b11
	vpermdi	$xa0,$xt0,$xt1,0b00
	vpermdi	$xa2,$xt0,$xt1,0b11
	vpermdi	$xb1,$xb0,$xb2,0b00
	vpermdi	$xb3,$xb0,$xb2,0b11
	vpermdi	$xb0,$xt2,$xt3,0b00
	vpermdi	$xb2,$xt2,$xt3,0b11

	vmrgew	$xt0,$xc0,$xc1
	vmrgew	$xt1,$xc2,$xc3
	vmrgow	$xc0,$xc0,$xc1
	vmrgow	$xc2,$xc2,$xc3
	vmrgew	$xt2,$xd0,$xd1
	vmrgew	$xt3,$xd2,$xd3
	vmrgow	$xd0,$xd0,$xd1
	vmrgow	$xd2,$xd2,$xd3

	vpermdi	$xc1,$xc0,$xc2,0b00
	vpermdi	$xc3,$xc0,$xc2,0b11
	vpermdi	$xc0,$xt0,$xt1,0b00
	vpermdi	$xc2,$xt0,$xt1,0b11
	vpermdi	$xd1,$xd0,$xd2,0b00
	vpermdi	$xd3,$xd0,$xd2,0b11
	vpermdi	$xd0,$xt2,$xt3,0b00
	vpermdi	$xd2,$xt2,$xt3,0b11

	vspltisw $xt0,8
	vadduwm	$CTR0,$CTR0,$xt0		# next counter value
	vxxlor	$xv4 ,$CTR0,$CTR0		# CTR+4-> 5

	vadduwm	$xa0,$xa0,@K[0]
	vadduwm	$xb0,$xb0,@K[1]
	vadduwm	$xc0,$xc0,@K[2]
	vadduwm	$xd0,$xd0,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	vadduwm	$xa0,$xa1,@K[0]
	vadduwm	$xb0,$xb1,@K[1]
	vadduwm	$xc0,$xc1,@K[2]
	vadduwm	$xd0,$xd1,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	vadduwm	$xa0,$xa2,@K[0]
	vadduwm	$xb0,$xb2,@K[1]
	vadduwm	$xc0,$xc2,@K[2]
	vadduwm	$xd0,$xd2,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	vadduwm	$xa0,$xa3,@K[0]
	vadduwm	$xb0,$xb3,@K[1]
	vadduwm	$xc0,$xc3,@K[2]
	vadduwm	$xd0,$xd3,@K[3]

	be?vperm $xa0,$xa0,$xa0,$beperm
	be?vperm $xb0,$xb0,$xb0,$beperm
	be?vperm $xc0,$xc0,$xc0,$beperm
	be?vperm $xd0,$xd0,$xd0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x

	lvx_4w	$xt0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xt0,$xt0,$xa0
	vxor	$xt1,$xt1,$xb0
	vxor	$xt2,$xt2,$xc0
	vxor	$xt3,$xt3,$xd0

	stvx_4w	$xt0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

# Blocks 4-7: v24-v31 remain the same, so the same logic as above applies.
# Registers a4-b7 stay where they are; c4-d7 are reloaded into positions
# 8-15, which lets vr24-vr31 be reused.
# v0-v3 are used to load temporary values, and vr4 serves as xr0 in place of xt0.

	vxxlorc	$CTR1 ,$xv5,$xv5

	vxxlorc	$xcn4 ,$xv18,$xv18
	vxxlorc	$xcn5 ,$xv19,$xv19
	vxxlorc	$xcn6 ,$xv20,$xv20
	vxxlorc	$xcn7 ,$xv21,$xv21

	vxxlorc	$xdn4 ,$xv13,$xv13
	vxxlorc	$xdn5 ,$xv14,$xv14
	vxxlorc	$xdn6 ,$xv15,$xv15
	vxxlorc	$xdn7 ,$xv16,$xv16
	vadduwm	$xdn4,$xdn4,$CTR1

	vxxlorc	$xb6 ,$xv6,$xv6
	vxxlorc	$xb7 ,$xv7,$xv7
# use xr0 as the xt0 temporary in blocks 4-7

	vmrgew	$xr0,$xa4,$xa5			# transpose data
	vmrgew	$xt1,$xa6,$xa7
	vmrgow	$xa4,$xa4,$xa5
	vmrgow	$xa6,$xa6,$xa7
	vmrgew	$xt2,$xb4,$xb5
	vmrgew	$xt3,$xb6,$xb7
	vmrgow	$xb4,$xb4,$xb5
	vmrgow	$xb6,$xb6,$xb7

	vpermdi	$xa5,$xa4,$xa6,0b00
	vpermdi	$xa7,$xa4,$xa6,0b11
	vpermdi	$xa4,$xr0,$xt1,0b00
	vpermdi	$xa6,$xr0,$xt1,0b11
	vpermdi	$xb5,$xb4,$xb6,0b00
	vpermdi	$xb7,$xb4,$xb6,0b11
	vpermdi	$xb4,$xt2,$xt3,0b00
	vpermdi	$xb6,$xt2,$xt3,0b11

	vmrgew	$xr0,$xcn4,$xcn5
	vmrgew	$xt1,$xcn6,$xcn7
	vmrgow	$xcn4,$xcn4,$xcn5
	vmrgow	$xcn6,$xcn6,$xcn7
	vmrgew	$xt2,$xdn4,$xdn5
	vmrgew	$xt3,$xdn6,$xdn7
	vmrgow	$xdn4,$xdn4,$xdn5
	vmrgow	$xdn6,$xdn6,$xdn7

	vpermdi	$xcn5,$xcn4,$xcn6,0b00
	vpermdi	$xcn7,$xcn4,$xcn6,0b11
	vpermdi	$xcn4,$xr0,$xt1,0b00
	vpermdi	$xcn6,$xr0,$xt1,0b11
	vpermdi	$xdn5,$xdn4,$xdn6,0b00
	vpermdi	$xdn7,$xdn4,$xdn6,0b11
	vpermdi	$xdn4,$xt2,$xt3,0b00
	vpermdi	$xdn6,$xt2,$xt3,0b11

	vspltisw $xr0,8
	vadduwm	$CTR1,$CTR1,$xr0		# next counter value
	vxxlor	$xv5 ,$CTR1,$CTR1		# CTR+4-> 5

	vadduwm	$xan0,$xa4,@K[0]
	vadduwm	$xbn0,$xb4,@K[1]
	vadduwm	$xcn0,$xcn4,@K[2]
	vadduwm	$xdn0,$xdn4,@K[3]

	be?vperm $xan0,$xan0,$xan0,$beperm
	be?vperm $xbn0,$xbn0,$xbn0,$beperm
	be?vperm $xcn0,$xcn0,$xcn0,$beperm
	be?vperm $xdn0,$xdn0,$xdn0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x_1

	lvx_4w	$xr0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xr0,$xr0,$xan0
	vxor	$xt1,$xt1,$xbn0
	vxor	$xt2,$xt2,$xcn0
	vxor	$xt3,$xt3,$xdn0

	stvx_4w	$xr0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	vadduwm	$xan0,$xa5,@K[0]
	vadduwm	$xbn0,$xb5,@K[1]
	vadduwm	$xcn0,$xcn5,@K[2]
	vadduwm	$xdn0,$xdn5,@K[3]

	be?vperm $xan0,$xan0,$xan0,$beperm
	be?vperm $xbn0,$xbn0,$xbn0,$beperm
	be?vperm $xcn0,$xcn0,$xcn0,$beperm
	be?vperm $xdn0,$xdn0,$xdn0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x_1

	lvx_4w	$xr0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xr0,$xr0,$xan0
	vxor	$xt1,$xt1,$xbn0
	vxor	$xt2,$xt2,$xcn0
	vxor	$xt3,$xt3,$xdn0

	stvx_4w	$xr0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	vadduwm	$xan0,$xa6,@K[0]
	vadduwm	$xbn0,$xb6,@K[1]
	vadduwm	$xcn0,$xcn6,@K[2]
	vadduwm	$xdn0,$xdn6,@K[3]

	be?vperm $xan0,$xan0,$xan0,$beperm
	be?vperm $xbn0,$xbn0,$xbn0,$beperm
	be?vperm $xcn0,$xcn0,$xcn0,$beperm
	be?vperm $xdn0,$xdn0,$xdn0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x_1

	lvx_4w	$xr0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xr0,$xr0,$xan0
	vxor	$xt1,$xt1,$xbn0
	vxor	$xt2,$xt2,$xcn0
	vxor	$xt3,$xt3,$xdn0

	stvx_4w	$xr0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	vadduwm	$xan0,$xa7,@K[0]
	vadduwm	$xbn0,$xb7,@K[1]
	vadduwm	$xcn0,$xcn7,@K[2]
	vadduwm	$xdn0,$xdn7,@K[3]

	be?vperm $xan0,$xan0,$xan0,$beperm
	be?vperm $xbn0,$xbn0,$xbn0,$beperm
	be?vperm $xcn0,$xcn0,$xcn0,$beperm
	be?vperm $xdn0,$xdn0,$xdn0,$beperm

	${UCMP}i $len,0x40
	blt	Ltail_vsx_8x_1

	lvx_4w	$xr0,$x00,$inp
	lvx_4w	$xt1,$x10,$inp
	lvx_4w	$xt2,$x20,$inp
	lvx_4w	$xt3,$x30,$inp

	vxor	$xr0,$xr0,$xan0
	vxor	$xt1,$xt1,$xbn0
	vxor	$xt2,$xt2,$xcn0
	vxor	$xt3,$xt3,$xdn0

	stvx_4w	$xr0,$x00,$out
	stvx_4w	$xt1,$x10,$out
	addi	$inp,$inp,0x40
	stvx_4w	$xt2,$x20,$out
	subi	$len,$len,0x40
	stvx_4w	$xt3,$x30,$out
	addi	$out,$out,0x40
	beq	Ldone_vsx_8x

	mtctr	r0
	bne	Loop_outer_vsx_8x

Ldone_vsx_8x:
	lwz	r12,`$FRAME-4`($sp)		# pull vrsave
	li	r10,`15+$LOCALS+64`
	li	r11,`31+$LOCALS+64`
	$POP	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256,r12				# restore vrsave
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr

.align	4
Ltail_vsx_8x:
	addi	r11,$sp,$LOCALS
	mtctr	$len
	stvx_4w	$xa0,$x00,r11			# offload block to stack
	stvx_4w	$xb0,$x10,r11
	stvx_4w	$xc0,$x20,r11
	stvx_4w	$xd0,$x30,r11
	subi	r12,r11,1			# prepare for *++ptr
	subi	$inp,$inp,1
	subi	$out,$out,1
	bl	Loop_tail_vsx_8x
Ltail_vsx_8x_1:
	addi	r11,$sp,$LOCALS
	mtctr	$len
	stvx_4w	$xan0,$x00,r11			# offload block to stack
	stvx_4w	$xbn0,$x10,r11
	stvx_4w	$xcn0,$x20,r11
	stvx_4w	$xdn0,$x30,r11
	subi	r12,r11,1			# prepare for *++ptr
	subi	$inp,$inp,1
	subi	$out,$out,1
	bl	Loop_tail_vsx_8x

Loop_tail_vsx_8x:
	lbzu	r6,1(r12)
	lbzu	r7,1($inp)
	xor	r6,r6,r7
	stbu	r6,1($out)
	bdnz	Loop_tail_vsx_8x

	stvx_4w	$K[0],$x00,r11			# wipe copy of the block
	stvx_4w	$K[0],$x10,r11
	stvx_4w	$K[0],$x20,r11
	stvx_4w	$K[0],$x30,r11

	b	Ldone_vsx_8x
	.long	0
	.byte	0,12,0x04,1,0x80,0,5,0
	.long	0
.size	.ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x
___
}}}


$code.=<<___;
.align	5
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12	#vvvvv "distance between . and Lsigma
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
Lsigma:
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574
	.long	1,0,0,0
	.long	2,0,0,0
	.long	3,0,0,0
	.long	4,0,0,0
___
$code.=<<___	if ($LITTLE_ENDIAN);
	.long	0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
	.long	0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
___
$code.=<<___	if (!$LITTLE_ENDIAN);	# flipped words
	.long	0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
	.long	0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
___
$code.=<<___;
	.long	0x61707865,0x61707865,0x61707865,0x61707865
	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
	.long	0,1,2,3
	.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c
.asciz	"ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour !~ /le$/) {	# big-endian
	    s/be\?//		or
	    s/le\?/#le#/	or
	    s/\?lvsr/lvsl/	or
	    s/\?lvsl/lvsr/	or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
	    s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;
	} else {			# little-endian
	    s/le\?//		or
	    s/be\?/#be#/	or
	    s/\?([a-z]+)/$1/	or
	    s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
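
# A hypothetical invocation (file and output names below are assumptions, not
# something this script mandates):
#
#   perl chachap10-ppc.pl linux64le chachap10-ppc64le.s
#
# The flavour ("linux64le" here) drives the 32-/64-bit and endianness choices
# at the top of the file; the output path is handed to ppc-xlate.pl, which
# writes the final assembly.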