#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.

#
# August 2018
#
# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3x faster than scalar code.
# To keep short-input overheads in check, inputs not longer than
# 256 bytes are handled by a transliteration of the VSX code path
# from the chacha-ppc module, which is also 4x"vertical".

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :VX :EI AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;

sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("$_",@t);

	# Consider the order in which the state words are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# The 'a', 'b' and 'd' words are permanently allocated in
	# registers, @x[0..7,12..15], while the 'c' words are maintained
	# in memory. Observing the 'c' column, you'll notice that one
	# pair of 'c's is invariant between rounds, so only the other
	# pair has to be reloaded, once per round, in the middle. This
	# is why you'll see 'c' stores and loads in the middle, but none
	# at the beginning or end.
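	#
	# For reference, each instruction group below is one step of the
	# standard ChaCha20 quarter-round, applied to two columns at a
	# time (Q1/Q2 first, Q3/Q4 after the 'c' swap):
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<=  8;
	#	c += d; b ^= c; b <<<=  7;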

	alr	(@x[$a0],@x[$b0]);	# Q1
	alr	(@x[$a1],@x[$b1]);	# Q2
	xr	(@x[$d0],@x[$a0]);
	xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],16);
	rll	(@x[$d1],@x[$d1],16);

	alr	($xc,@x[$d0]);
	alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],12);
	rll	(@x[$b1],@x[$b1],12);

	alr	(@x[$a0],@x[$b0]);
	alr	(@x[$a1],@x[$b1]);
	xr	(@x[$d0],@x[$a0]);
	xr	(@x[$d1],@x[$a1]);
	rll	(@x[$d0],@x[$d0],8);
	rll	(@x[$d1],@x[$d1],8);

	alr	($xc,@x[$d0]);
	alr	($xc_,@x[$d1]);
	xr	(@x[$b0],$xc);
	xr	(@x[$b1],$xc_);
	rll	(@x[$b0],@x[$b0],7);
	rll	(@x[$b1],@x[$b1],7);

	stm	($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");	# offload pair of 'c's
	lm	($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");	# reload the other pair

	alr	(@x[$a2],@x[$b2]);	# Q3
	alr	(@x[$a3],@x[$b3]);	# Q4
	xr	(@x[$d2],@x[$a2]);
	xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],16);
	rll	(@x[$d3],@x[$d3],16);

	alr	($xc,@x[$d2]);
	alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],12);
	rll	(@x[$b3],@x[$b3],12);

	alr	(@x[$a2],@x[$b2]);
	alr	(@x[$a3],@x[$b3]);
	xr	(@x[$d2],@x[$a2]);
	xr	(@x[$d3],@x[$a3]);
	rll	(@x[$d2],@x[$d2],8);
	rll	(@x[$d3],@x[$d3],8);

	alr	($xc,@x[$d2]);
	alr	($xc_,@x[$d3]);
	xr	(@x[$b2],$xc);
	xr	(@x[$b3],$xc_);
	rll	(@x[$b2],@x[$b2],7);
	rll	(@x[$b3],@x[$b3],7);
}

sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("%v$_",(0..15));

	vaf	(@x[$a0],@x[$a0],@x[$b0]);	# Q1
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],16);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);	# Q2
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],16);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);	# Q3
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],16);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);	# Q4
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],16);

	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],12);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],12);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],12);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],12);

	vaf	(@x[$a0],@x[$a0],@x[$b0]);
	vx	(@x[$d0],@x[$d0],@x[$a0]);
	verllf	(@x[$d0],@x[$d0],8);
	vaf	(@x[$a1],@x[$a1],@x[$b1]);
	vx	(@x[$d1],@x[$d1],@x[$a1]);
	verllf	(@x[$d1],@x[$d1],8);
	vaf	(@x[$a2],@x[$a2],@x[$b2]);
	vx	(@x[$d2],@x[$d2],@x[$a2]);
	verllf	(@x[$d2],@x[$d2],8);
	vaf	(@x[$a3],@x[$a3],@x[$b3]);
	vx	(@x[$d3],@x[$d3],@x[$a3]);
	verllf	(@x[$d3],@x[$d3],8);

	vaf	(@x[$c0],@x[$c0],@x[$d0]);
	vx	(@x[$b0],@x[$b0],@x[$c0]);
	verllf	(@x[$b0],@x[$b0],7);
	vaf	(@x[$c1],@x[$c1],@x[$d1]);
	vx	(@x[$b1],@x[$b1],@x[$c1]);
	verllf	(@x[$b1],@x[$b1],7);
	vaf	(@x[$c2],@x[$c2],@x[$d2]);
	vx	(@x[$b2],@x[$b2],@x[$c2]);
	verllf	(@x[$b2],@x[$b2],7);
	vaf	(@x[$c3],@x[$c3],@x[$d3]);
	vx	(@x[$b3],@x[$b3],@x[$c3]);
	verllf	(@x[$b3],@x[$b3],7);
}
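
# VX_ROUND computes one full ChaCha20 round on six interleaved states.
# Instead of re-indexing registers between column and diagonal rounds,
# the trailing vsldb instructions rotate the b/c/d rows within each
# vector: left by 1/2/3 words after an even round, lining the diagonals
# up as columns, and by 3/2/1 words after an odd round, restoring
# column order.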
sub VX_ROUND {
my @a=@_[0..5];
my @b=@_[6..11];
my @c=@_[12..17];
my @d=@_[18..23];
my $odd=@_[24];

	vaf	(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx	(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf	(@d[$_],@d[$_],16) for (0..5);

	vaf	(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx	(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf	(@b[$_],@b[$_],12) for (0..5);

	vaf	(@a[$_],@a[$_],@b[$_]) for (0..5);
	vx	(@d[$_],@d[$_],@a[$_]) for (0..5);
	verllf	(@d[$_],@d[$_],8) for (0..5);

	vaf	(@c[$_],@c[$_],@d[$_]) for (0..5);
	vx	(@b[$_],@b[$_],@c[$_]) for (0..5);
	verllf	(@b[$_],@b[$_],7) for (0..5);

	vsldb	(@c[$_],@c[$_],@c[$_],8) for (0..5);
	vsldb	(@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
	vsldb	(@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}

PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();

################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
#                     const unsigned int key[8], const unsigned int counter[4])
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
{
my $frame=$stdframe+4*20;
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

GLOBL	("ChaCha20_ctr32");
TYPE	("ChaCha20_ctr32","\@function");
ALIGN	(32);
LABEL	("ChaCha20_ctr32");
	larl	("%r1","OPENSSL_s390xcap_P");

	lghi	("%r0",64);
&{$z?	\&ltgr:\&ltr}	($len,$len);	# len==0?
	bzr	("%r14");
	lg	("%r1","S390X_STFLE+16(%r1)");
&{$z?	\&clgr:\&clr}	($len,"%r0");
	jle	(".Lshort");

	tmhh	("%r1",0x4000);	# check for vx bit
	jnz	(".LChaCha20_ctr32_vx");

LABEL	(".Lshort");
&{$z?	\&aghi:\&ahi}	($len,-64);
&{$z?	\&lghi:\&lhi}	("%r1",-$frame);
&{$z?	\&stmg:\&stm}	("%r6","%r15","6*$SIZE_T($sp)");
&{$z?	\&slgr:\&slr}	($out,$inp);	# difference
	la	($len,"0($inp,$len)");	# end of input minus 64
	larl	("%r7",".Lsigma");
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");

	lmg	("%r8","%r11","0($key)");	# load key
	lmg	("%r12","%r13","0($counter)");	# load counter
	lmg	("%r6","%r7","0(%r7)");	# load sigma constant

	la	("%r14","0($inp)");
&{$z?	\&stg:\&st}	($out,"$frame+3*$SIZE_T($sp)");
&{$z?	\&stg:\&st}	($len,"$frame+4*$SIZE_T($sp)");
	stmg	("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
	srlg	(@x[12],"%r12",32);	# 32-bit counter value
	j	(".Loop_outer");

ALIGN	(16);
LABEL	(".Loop_outer");
	lm	(@x[0],@x[7],"$stdframe+4*0($sp)");	# load x[0]-x[7]
	lm	(@t[0],@t[1],"$stdframe+4*10($sp)");	# load x[10]-x[11]
	lm	(@x[13],@x[15],"$stdframe+4*13($sp)");	# load x[13]-x[15]
	stm	(@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
	lm	(@t[0],@t[1],"$stdframe+4*8($sp)");	# load x[8]-x[9]
	st	(@x[12],"$stdframe+4*12($sp)");	# save counter
&{$z?	\&stg:\&st}	("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
	lhi	("%r14",10);
	j	(".Loop");

ALIGN	(4);
LABEL	(".Loop");
	ROUND	(0, 4, 8,12);
	ROUND	(0, 5,10,15);
	brct	("%r14",".Loop");

&{$z?	\&lg:\&l}	("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
	stm	(@t[0],@t[1],"$stdframe+4*8+4*8($sp)");	# offload x[8]-x[9]
&{$z?	\&lmg:\&lm}	(@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
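
	# The stored input state is added back in word by word, and each
	# word is byte-swapped with lrvr: computation runs on native
	# big-endian words, while the ChaCha20 key stream is defined on
	# little-endian ones.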
	al	(@x[0],"$stdframe+4*0($sp)");	# accumulate key schedule
	al	(@x[1],"$stdframe+4*1($sp)");
	al	(@x[2],"$stdframe+4*2($sp)");
	al	(@x[3],"$stdframe+4*3($sp)");
	al	(@x[4],"$stdframe+4*4($sp)");
	al	(@x[5],"$stdframe+4*5($sp)");
	al	(@x[6],"$stdframe+4*6($sp)");
	al	(@x[7],"$stdframe+4*7($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	lrvr	(@x[4],@x[4]);
	lrvr	(@x[5],@x[5]);
	lrvr	(@x[6],@x[6]);
	lrvr	(@x[7],@x[7]);
	al	(@x[12],"$stdframe+4*12($sp)");
	al	(@x[13],"$stdframe+4*13($sp)");
	al	(@x[14],"$stdframe+4*14($sp)");
	al	(@x[15],"$stdframe+4*15($sp)");
	lrvr	(@x[12],@x[12]);
	lrvr	(@x[13],@x[13]);
	lrvr	(@x[14],@x[14]);
	lrvr	(@x[15],@x[15]);

	la	(@t[0],"0(@t[0],%r14)");	# reconstruct output pointer
&{$z?	\&clgr:\&clr}	("%r14",@t[1]);
	jh	(".Ltail");

	x	(@x[0],"4*0(%r14)");	# xor with input
	x	(@x[1],"4*1(%r14)");
	st	(@x[0],"4*0(@t[0])");	# store output
	x	(@x[2],"4*2(%r14)");
	st	(@x[1],"4*1(@t[0])");
	x	(@x[3],"4*3(%r14)");
	st	(@x[2],"4*2(@t[0])");
	x	(@x[4],"4*4(%r14)");
	st	(@x[3],"4*3(@t[0])");
	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");	# load x[8]-x[11]
	x	(@x[5],"4*5(%r14)");
	st	(@x[4],"4*4(@t[0])");
	x	(@x[6],"4*6(%r14)");
	al	(@x[0],"$stdframe+4*8($sp)");
	st	(@x[5],"4*5(@t[0])");
	x	(@x[7],"4*7(%r14)");
	al	(@x[1],"$stdframe+4*9($sp)");
	st	(@x[6],"4*6(@t[0])");
	x	(@x[12],"4*12(%r14)");
	al	(@x[2],"$stdframe+4*10($sp)");
	st	(@x[7],"4*7(@t[0])");
	x	(@x[13],"4*13(%r14)");
	al	(@x[3],"$stdframe+4*11($sp)");
	st	(@x[12],"4*12(@t[0])");
	x	(@x[14],"4*14(%r14)");
	st	(@x[13],"4*13(@t[0])");
	x	(@x[15],"4*15(%r14)");
	st	(@x[14],"4*14(@t[0])");
	lrvr	(@x[0],@x[0]);
	st	(@x[15],"4*15(@t[0])");
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	lhi	(@x[12],1);
	x	(@x[0],"4*8(%r14)");
	al	(@x[12],"$stdframe+4*12($sp)");	# increment counter
	x	(@x[1],"4*9(%r14)");
	st	(@x[0],"4*8(@t[0])");
	x	(@x[2],"4*10(%r14)");
	st	(@x[1],"4*9(@t[0])");
	x	(@x[3],"4*11(%r14)");
	st	(@x[2],"4*10(@t[0])");
	st	(@x[3],"4*11(@t[0])");

&{$z?	\&clgr:\&clr}	("%r14",@t[1]);	# done yet?
	la	("%r14","64(%r14)");
	jl	(".Loop_outer");

LABEL	(".Ldone");
	xgr	("%r0","%r0");
	xgr	("%r1","%r1");
	xgr	("%r2","%r2");
	xgr	("%r3","%r3");
	stmg	("%r0","%r3","$stdframe+4*4($sp)");	# wipe key copy
	stmg	("%r0","%r3","$stdframe+4*12($sp)");

&{$z?	\&lmg:\&lm}	("%r6","%r15","$frame+6*$SIZE_T($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail");
	la	(@t[1],"64($t[1])");
	stm	(@x[0],@x[7],"$stdframe+4*0($sp)");
&{$z?	\&slgr:\&slr}	(@t[1],"%r14");
	lm	(@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
&{$z?	\&lghi:\&lhi}	(@x[6],0);
	stm	(@x[12],@x[15],"$stdframe+4*12($sp)");
	al	(@x[0],"$stdframe+4*8($sp)");
	al	(@x[1],"$stdframe+4*9($sp)");
	al	(@x[2],"$stdframe+4*10($sp)");
	al	(@x[3],"$stdframe+4*11($sp)");
	lrvr	(@x[0],@x[0]);
	lrvr	(@x[1],@x[1]);
	lrvr	(@x[2],@x[2]);
	lrvr	(@x[3],@x[3]);
	stm	(@x[0],@x[3],"$stdframe+4*8($sp)");
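
	# Partial-block tail: the remaining bytes are processed one at a
	# time, XORing the input against the key-stream block that has
	# just been stored to the stack scratch area.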

LABEL	(".Loop_tail");
	llgc	(@x[4],"0(@x[6],%r14)");
	llgc	(@x[5],"$stdframe(@x[6],$sp)");
	xr	(@x[5],@x[4]);
	stc	(@x[5],"0(@x[6],@t[0])");
	la	(@x[6],"1(@x[6])");
	brct	(@t[1],".Loop_tail");

	j	(".Ldone");
SIZE	("ChaCha20_ctr32",".-ChaCha20_ctr32");
}

########################################################################
# The 4x"vertical" layout minimizes the number of instructions, but the
# pipeline runs underutilized [because of vector instructions' high
# latency]. On the other hand, the minimum amount of data the
# 6x"horizontal" code path takes to fully utilize the pipeline is
# higher, so short inputs would effectively be processed slower there.
# Hence this code path, targeting lengths of at most 256 bytes.
#
{
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
my @K=map("%v$_",(16..19));
my $CTR="%v26";
my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
my $beperm="%v31";
my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

ALIGN	(32);
LABEL	("ChaCha20_ctr32_4x");
LABEL	(".LChaCha20_ctr32_4x");
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	std	("%f8","$stdframe+8*0($sp)");
	std	("%f9","$stdframe+8*1($sp)");
	std	("%f10","$stdframe+8*2($sp)");
	std	("%f11","$stdframe+8*3($sp)");
	std	("%f12","$stdframe+8*4($sp)");
	std	("%f13","$stdframe+8*5($sp)");
	std	("%f14","$stdframe+8*6($sp)");
	std	("%f15","$stdframe+8*7($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);
	lhi	("%r1",0);

	vl	(@K[0],"0(%r7)");	# load sigma
	vl	(@K[1],"0($key)");	# load key
	vl	(@K[2],"16($key)");
	vl	(@K[3],"0($counter)");	# load counter

	vl	($beperm,"0x40(%r7)");
	vl	($xt1,"0x50(%r7)");
	vrepf	($CTR,@K[3],0);
	vlvgf	(@K[3],"%r1",0);	# clear @K[3].word[0]
	vaf	($CTR,$CTR,$xt1);

#LABEL	(".Loop_outer_4x");
	vlm	($xa0,$xa3,"0x60(%r7)");	# load [smashed] sigma

	vrepf	($xb0,@K[1],0);	# smash the key
	vrepf	($xb1,@K[1],1);
	vrepf	($xb2,@K[1],2);
	vrepf	($xb3,@K[1],3);

	vrepf	($xc0,@K[2],0);
	vrepf	($xc1,@K[2],1);
	vrepf	($xc2,@K[2],2);
	vrepf	($xc3,@K[2],3);

	vlr	($xd0,$CTR);
	vrepf	($xd1,@K[3],1);
	vrepf	($xd2,@K[3],2);
	vrepf	($xd3,@K[3],3);

LABEL	(".Loop_4x");
	VX_lane_ROUND(0, 4, 8,12);
	VX_lane_ROUND(0, 5,10,15);
	brct	("%r0",".Loop_4x");

	vaf	($xd0,$xd0,$CTR);
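
	# The state is "vertical": lane i of each vector belongs to
	# block i. vmrhf/vmrlf merge high/low word pairs and vpdi picks
	# doubleword halves, so each eight-instruction group below is a
	# 4x4 word-matrix transpose turning the per-lane state back into
	# four contiguous 64-byte key-stream blocks.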
	vmrhf	($xt0,$xa0,$xa1);	# transpose data
	vmrhf	($xt1,$xa2,$xa3);
	vmrlf	($xt2,$xa0,$xa1);
	vmrlf	($xt3,$xa2,$xa3);
	vpdi	($xa0,$xt0,$xt1,0b0000);
	vpdi	($xa1,$xt0,$xt1,0b0101);
	vpdi	($xa2,$xt2,$xt3,0b0000);
	vpdi	($xa3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xb0,$xb1);
	vmrhf	($xt1,$xb2,$xb3);
	vmrlf	($xt2,$xb0,$xb1);
	vmrlf	($xt3,$xb2,$xb3);
	vpdi	($xb0,$xt0,$xt1,0b0000);
	vpdi	($xb1,$xt0,$xt1,0b0101);
	vpdi	($xb2,$xt2,$xt3,0b0000);
	vpdi	($xb3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xc0,$xc1);
	vmrhf	($xt1,$xc2,$xc3);
	vmrlf	($xt2,$xc0,$xc1);
	vmrlf	($xt3,$xc2,$xc3);
	vpdi	($xc0,$xt0,$xt1,0b0000);
	vpdi	($xc1,$xt0,$xt1,0b0101);
	vpdi	($xc2,$xt2,$xt3,0b0000);
	vpdi	($xc3,$xt2,$xt3,0b0101);

	vmrhf	($xt0,$xd0,$xd1);
	vmrhf	($xt1,$xd2,$xd3);
	vmrlf	($xt2,$xd0,$xd1);
	vmrlf	($xt3,$xd2,$xd3);
	vpdi	($xd0,$xt0,$xt1,0b0000);
	vpdi	($xd1,$xt0,$xt1,0b0101);
	vpdi	($xd2,$xt2,$xt3,0b0000);
	vpdi	($xd3,$xt2,$xt3,0b0101);

	#vrepif	($xt0,4);
	#vaf	($CTR,$CTR,$xt0);	# next counter value

	vaf	($xa0,$xa0,@K[0]);
	vaf	($xb0,$xb0,@K[1]);
	vaf	($xc0,$xc0,@K[2]);
	vaf	($xd0,$xd0,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

	#&{$z?	\&clgfi:\&clfi}	($len,0x40);
	#jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	#je	(".Ldone_4x");

	vaf	($xa0,$xa1,@K[0]);
	vaf	($xb0,$xb1,@K[1]);
	vaf	($xc0,$xc1,@K[2]);
	vaf	($xd0,$xd1,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	vaf	($xa0,$xa2,@K[0]);
	vaf	($xb0,$xb2,@K[1]);
	vaf	($xc0,$xc2,@K[2]);
	vaf	($xd0,$xd2,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_4x");

	vaf	($xa0,$xa3,@K[0]);
	vaf	($xb0,$xb3,@K[1]);
	vaf	($xc0,$xc3,@K[2]);
	vaf	($xd0,$xd3,@K[3]);

	vperm	($xa0,$xa0,$xa0,$beperm);
	vperm	($xb0,$xb0,$xb0,$beperm);
	vperm	($xc0,$xc0,$xc0,$beperm);
	vperm	($xd0,$xd0,$xd0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_4x");

	vlm	($xt0,$xt3,"0($inp)");

	vx	($xt0,$xt0,$xa0);
	vx	($xt1,$xt1,$xb0);
	vx	($xt2,$xt2,$xc0);
	vx	($xt3,$xt3,$xd0);

	vstm	($xt0,$xt3,"0($out)");

	#la	($inp,"0x40($inp)");
	#la	($out,"0x40($out)");
	#lhi	("%r0",10);
	#&{$z?	\&aghi:\&ahi}	($len,-0x40);
	#jne	(".Loop_outer_4x");
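
# %f8-%f15 are restored on 64-bit because they are call-saved and
# overlap the preserved halves of %v8-%v15 in the vector ABI; on
# 31-bit only %f4/%f6 were clobbered.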
LABEL	(".Ldone_4x");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_4x");
if (!$z) {
	vlr	($xt0,$xb0);
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xt0,"$stdframe+0x10($sp)");
	vst	($xc0,"$stdframe+0x20($sp)");
	vst	($xd0,"$stdframe+0x30($sp)");
} else {
	vlr	($xt0,$xc0);
	ld	("%f8","$stdframe+8*0($sp)");
	ld	("%f9","$stdframe+8*1($sp)");
	ld	("%f10","$stdframe+8*2($sp)");
	ld	("%f11","$stdframe+8*3($sp)");
	vlr	($xt1,$xd0);
	ld	("%f12","$stdframe+8*4($sp)");
	ld	("%f13","$stdframe+8*5($sp)");
	ld	("%f14","$stdframe+8*6($sp)");
	ld	("%f15","$stdframe+8*7($sp)");

	vst	($xa0,"$stdframe+0x00($sp)");
	vst	($xb0,"$stdframe+0x10($sp)");
	vst	($xt0,"$stdframe+0x20($sp)");
	vst	($xt1,"$stdframe+0x30($sp)");
}
	lghi	("%r1",0);

LABEL	(".Loop_tail_4x");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_4x");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
}

########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instructions'
# latency. The computational part of an 8x"vertical" layout would be
# faster still, but it consumes all vector registers, and dealing with
# that would diminish the return...
#
{
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
my @K=map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
my $beperm="%v31";
my $FRAME=$stdframe + 4*16;

GLOBL	("ChaCha20_ctr32_vx");
ALIGN	(32);
LABEL	("ChaCha20_ctr32_vx");
LABEL	(".LChaCha20_ctr32_vx");
&{$z?	\&clgfi:\&clfi}	($len,256);
	jle	(".LChaCha20_ctr32_4x");
&{$z?	\&stmg:\&stm}	("%r6","%r7","6*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
}
&{$z?	\&lghi:\&lhi}	("%r1",-$FRAME);
	lgr	("%r0",$sp);
	la	($sp,"0(%r1,$sp)");
&{$z?	\&stg:\&st}	("%r0","0($sp)");	# back-chain
if ($z) {
	std	("%f8","$FRAME-8*8($sp)");
	std	("%f9","$FRAME-8*7($sp)");
	std	("%f10","$FRAME-8*6($sp)");
	std	("%f11","$FRAME-8*5($sp)");
	std	("%f12","$FRAME-8*4($sp)");
	std	("%f13","$FRAME-8*3($sp)");
	std	("%f14","$FRAME-8*2($sp)");
	std	("%f15","$FRAME-8*1($sp)");
}
	larl	("%r7",".Lsigma");
	lhi	("%r0",10);

	vlm	(@K[1],@K[2],"0($key)");	# load key
	vl	(@K[3],"0($counter)");	# load counter

	vlm	(@K[0],"$beperm","0(%r7)");	# load sigma, increments, ...
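
# Each outer iteration computes six ChaCha20 blocks (384 bytes of key
# stream) in parallel: the a/b/c rows start out identical across the
# six states, while the d rows carry the counter in @K[3] offset by
# 0 through 5.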
LABEL	(".Loop_outer_vx");
	vlr	($a0,@K[0]);
	vlr	($b0,@K[1]);
	vlr	($a1,@K[0]);
	vlr	($b1,@K[1]);
	vlr	($a2,@K[0]);
	vlr	($b2,@K[1]);
	vlr	($a3,@K[0]);
	vlr	($b3,@K[1]);
	vlr	($a4,@K[0]);
	vlr	($b4,@K[1]);
	vlr	($a5,@K[0]);
	vlr	($b5,@K[1]);

	vlr	($d0,@K[3]);
	vaf	($d1,@K[3],$t1);	# K[3]+1
	vaf	($d2,@K[3],$t2);	# K[3]+2
	vaf	($d3,@K[3],$t3);	# K[3]+3
	vaf	($d4,$d2,$t2);	# K[3]+4
	vaf	($d5,$d2,$t3);	# K[3]+5

	vlr	($c0,@K[2]);
	vlr	($c1,@K[2]);
	vlr	($c2,@K[2]);
	vlr	($c3,@K[2]);
	vlr	($c4,@K[2]);
	vlr	($c5,@K[2]);

	vlr	($t1,$d1);
	vlr	($t2,$d2);
	vlr	($t3,$d3);

ALIGN	(4);
LABEL	(".Loop_vx");

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 0);

	VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
		 $b0,$b1,$b2,$b3,$b4,$b5,
		 $c0,$c1,$c2,$c3,$c4,$c5,
		 $d0,$d1,$d2,$d3,$d4,$d5,
		 1);

	brct	("%r0",".Loop_vx");

	vaf	($a0,$a0,@K[0]);
	vaf	($b0,$b0,@K[1]);
	vaf	($c0,$c0,@K[2]);
	vaf	($d0,$d0,@K[3]);
	vaf	($a1,$a1,@K[0]);
	vaf	($d1,$d1,$t1);	# +K[3]+1

	vperm	($a0,$a0,$a0,$beperm);
	vperm	($b0,$b0,$b0,$beperm);
	vperm	($c0,$c0,$c0,$beperm);
	vperm	($d0,$d0,$d0,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vaf	($d2,$d2,$t2);	# +K[3]+2
	vaf	($d3,$d3,$t3);	# +K[3]+3
	vlm	($t0,$t3,"0($inp)");

	vx	($a0,$a0,$t0);
	vx	($b0,$b0,$t1);
	vx	($c0,$c0,$t2);
	vx	($d0,$d0,$t3);

	vlm	(@K[0],$t3,"0(%r7)");	# re-load sigma and increments

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($b1,$b1,@K[1]);
	vaf	($c1,$c1,@K[2]);

	vperm	($a0,$a1,$a1,$beperm);
	vperm	($b0,$b1,$b1,$beperm);
	vperm	($c0,$c1,$c1,$beperm);
	vperm	($d0,$d1,$d1,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a2,$a2,@K[0]);
	vaf	($b2,$b2,@K[1]);
	vaf	($c2,$c2,@K[2]);

	vperm	($a0,$a2,$a2,$beperm);
	vperm	($b0,$b2,$b2,$beperm);
	vperm	($c0,$c2,$c2,$beperm);
	vperm	($d0,$d2,$d2,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a3,$a3,@K[0]);
	vaf	($b3,$b3,@K[1]);
	vaf	($c3,$c3,@K[2]);
	vaf	($d2,@K[3],$t3);	# K[3]+3

	vperm	($a0,$a3,$a3,$beperm);
	vperm	($b0,$b3,$b3,$beperm);
	vperm	($c0,$c3,$c3,$beperm);
	vperm	($d0,$d3,$d3,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vaf	($d3,$d2,$t1);	# K[3]+4
	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");
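
	# Fifth block; @K[3] is also advanced by 6 here ($d2 holds
	# K[3]+3, $t3 the increment 3), so the next outer iteration
	# continues the block counter where this one leaves off.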
	vaf	($a4,$a4,@K[0]);
	vaf	($b4,$b4,@K[1]);
	vaf	($c4,$c4,@K[2]);
	vaf	($d4,$d4,$d3);	# +K[3]+4
	vaf	($d3,$d3,$t1);	# K[3]+5
	vaf	(@K[3],$d2,$t3);	# K[3]+=6

	vperm	($a0,$a4,$a4,$beperm);
	vperm	($b0,$b4,$b4,$beperm);
	vperm	($c0,$c4,$c4,$beperm);
	vperm	($d0,$d4,$d4,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	je	(".Ldone_vx");

	vaf	($a5,$a5,@K[0]);
	vaf	($b5,$b5,@K[1]);
	vaf	($c5,$c5,@K[2]);
	vaf	($d5,$d5,$d3);	# +K[3]+5

	vperm	($a0,$a5,$a5,$beperm);
	vperm	($b0,$b5,$b5,$beperm);
	vperm	($c0,$c5,$c5,$beperm);
	vperm	($d0,$d5,$d5,$beperm);

&{$z?	\&clgfi:\&clfi}	($len,0x40);
	jl	(".Ltail_vx");

	vlm	($a1,$d1,"0($inp)");

	vx	($a0,$a0,$a1);
	vx	($b0,$b0,$b1);
	vx	($c0,$c0,$c1);
	vx	($d0,$d0,$d1);

	vstm	($a0,$d0,"0($out)");

	la	($inp,"0x40($inp)");
	la	($out,"0x40($out)");
	lhi	("%r0",10);
&{$z?	\&aghi:\&ahi}	($len,-0x40);
	jne	(".Loop_outer_vx");

LABEL	(".Ldone_vx");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");

ALIGN	(16);
LABEL	(".Ltail_vx");
if (!$z) {
	ld	("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
} else {
	ld	("%f8","$FRAME-8*8($sp)");
	ld	("%f9","$FRAME-8*7($sp)");
	ld	("%f10","$FRAME-8*6($sp)");
	ld	("%f11","$FRAME-8*5($sp)");
	ld	("%f12","$FRAME-8*4($sp)");
	ld	("%f13","$FRAME-8*3($sp)");
	ld	("%f14","$FRAME-8*2($sp)");
	ld	("%f15","$FRAME-8*1($sp)");
}
	vstm	($a0,$d0,"$stdframe($sp)");
	lghi	("%r1",0);

LABEL	(".Loop_tail_vx");
	llgc	("%r5","0(%r1,$inp)");
	llgc	("%r6","$stdframe(%r1,$sp)");
	xr	("%r6","%r5");
	stc	("%r6","0(%r1,$out)");
	la	("%r1","1(%r1)");
	brct	($len,".Loop_tail_vx");

&{$z?	\&lmg:\&lm}	("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
	la	($sp,"$FRAME($sp)");
	br	("%r14");
SIZE	("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################

ALIGN	(32);
LABEL	(".Lsigma");
LONG	(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# endian-neutral sigma
LONG	(1,0,0,0);
LONG	(2,0,0,0);
LONG	(3,0,0,0);
LONG	(0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);	# byte swap

LONG	(0,1,2,3);
LONG	(0x61707865,0x61707865,0x61707865,0x61707865);	# smashed sigma
LONG	(0x3320646e,0x3320646e,0x3320646e,0x3320646e);
LONG	(0x79622d32,0x79622d32,0x79622d32,0x79622d32);
LONG	(0x6b206574,0x6b206574,0x6b206574,0x6b206574);

ASCIZ	("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN	(4);

PERLASM_END();