#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compilers the improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On a side note, z13 enables a vector base 2^26 implementation...

#
# January 2019
#
# Add vx code path (base 2^26).
#
# Copyright IBM Corp. 2019
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# January 2019
#
# Add vector base 2^26 implementation. It's problematic to measure
# performance accurately, because the reference system is hardly ever
# idle. But it's sub-cycle, i.e. less than 1 cycle per processed byte,
# and it's >=20% faster than IBM's submission on long inputs, and much
# faster on short ones, because the calculation of key powers is
# postponed until we know that the input is long enough to justify the
# additional overhead.
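
# For reference, a minimal pure-Perl sketch of the Poly1305 core that the
# code below computes (an illustration only, not used by this generator;
# it assumes a 64-bit perl and the core Math::BigInt module, and the sub
# name is this sketch's own, not part of the module's interface).
#
sub _poly1305_ref_sketch {
	require Math::BigInt;
	my ($key,$msg) = @_;			# 16-byte "r" half of the key, message
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
	my ($r0,$r1) = unpack("Q<Q<",$key);	# little-endian key halves
	$r0 &= 0x0ffffffc0fffffff;		# clamp r as poly1305_init does
	$r1 &= 0x0ffffffc0ffffffc;
	my $r = Math::BigInt->new($r1)->blsft(64)->badd($r0);
	my $h = Math::BigInt->new(0);
	while (length($msg)) {
		my $blk = substr($msg,0,16,"");	# take next 1..16-byte block
		my $m = Math::BigInt->from_hex(unpack("H*",scalar reverse $blk));
		$m->badd(Math::BigInt->new(1)->blsft(8*length($blk)));	# pad bit
		$h->badd($m)->bmul($r)->bmod($p);	# h = (h+m)*r mod 2^130-5
	}
	return $h;	# poly1305_emit then adds the nonce modulo 2^128
}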

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $stdframe=16*$SIZE_T+4*8;
my $sp="%r15";

my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));

PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();

################
# static void poly1305_init(void *ctx, const unsigned char key[16])
{
GLOBL	("poly1305_init");
TYPE	("poly1305_init","\@function");
ALIGN	(16);
LABEL	("poly1305_init");
	lghi	("%r0",0);
	lghi	("%r1",-1);
	stg	("%r0","0($ctx)");	# zero hash value
	stg	("%r0","8($ctx)");
	stg	("%r0","16($ctx)");
	st	("%r0","24($ctx)");	# clear is_base2_26
	lgr	("%r5",$ctx);		# reassign $ctx
	lghi	("%r2",0);

&{$z?	\&clgr:\&clr}	($inp,"%r0");
	je	(".Lno_key");

	lrvg	("%r2","0($inp)");	# load little-endian key
	lrvg	("%r3","8($inp)");

	nihl	("%r1",0xffc0);		# 0xffffffc0ffffffff
	srlg	("%r0","%r1",4);	# 0x0ffffffc0fffffff
	srlg	("%r1","%r1",4);
	nill	("%r1",0xfffc);		# 0x0ffffffc0ffffffc

	ngr	("%r2","%r0");
	ngr	("%r3","%r1");

	stmg	("%r2","%r3","32(%r5)");

	larl	("%r1","OPENSSL_s390xcap_P");
	lg	("%r0","16(%r1)");
	srlg	("%r0","%r0",62);
	nill	("%r0",1);		# extract vx bit
	lcgr	("%r0","%r0");
	larl	("%r1",".Lpoly1305_blocks");
	larl	("%r2",".Lpoly1305_blocks_vx");
	larl	("%r3",".Lpoly1305_emit");
&{$z?	\&xgr:\&xr}	("%r2","%r1");	# select between scalar and vector
&{$z?	\&ngr:\&nr}	("%r2","%r0");
&{$z?	\&xgr:\&xr}	("%r2","%r1");
&{$z?	\&stmg:\&stm}	("%r2","%r3","0(%r4)");
	lghi	("%r2",1);
LABEL	(".Lno_key");
	br	("%r14");
SIZE	("poly1305_init",".-poly1305_init");
}

################
# static void poly1305_blocks(void *ctx, const unsigned char *inp,
#                             size_t len, u32 padbit)
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));

GLOBL	("poly1305_blocks");
TYPE	("poly1305_blocks","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks");
LABEL	(".Lpoly1305_blocks");
&{$z?	\&ltgr:\&ltr}	("%r0",$len);
	jz	(".Lno_data");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($h0,"0($ctx)");	# load hash value
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

LABEL	(".Lpoly1305_blocks_entry");
if ($z) {
	srlg	($len,$len,4);
} else {
	srl	($len,4);
}
	llgfr	($padbit,$padbit);	# clear upper half, much needed with
					# non-64-bit ABI
	lg	($r0,"32($ctx)");	# load key
	lg	($r1,"40($ctx)");

&{$z?	\&stg:\&st}	($ctx,"2*$SIZE_T($sp)");	# off-load $ctx
	srlg	($s1,$r1,2);
	algr	($s1,$r1);		# s1 = r1 + r1>>2
	j	(".Loop");

ALIGN	(16);
LABEL	(".Loop");
	lrvg	($d0lo,"0($inp)");	# load little-endian input
	lrvg	($d1lo,"8($inp)");
	la	($inp,"16($inp)");

	algr	($d0lo,$h0);		# accumulate input
	alcgr	($d1lo,$h1);
	alcgr	($h2,$padbit);

	lgr	($h0,$d0lo);
	mlgr	($d0hi,$r0);		# h0*r0   -> $d0hi:$d0lo
	lgr	($h1,$d1lo);
	mlgr	($d1hi,$s1);		# h1*5*r1 -> $d1hi:$d1lo

	mlgr	($t0,$r1);		# h0*r1   -> $t0:$h0
	mlgr	($t1,$r0);		# h1*r0   -> $t1:$h1

	algr	($d0lo,$d1lo);
	lgr	($d1lo,$h2);
	alcgr	($d0hi,$d1hi);
	lghi	($d1hi,0);

	algr	($h1,$h0);
	alcgr	($t1,$t0);

	msgr	($d1lo,$s1);		# h2*s1
	msgr	($h2,$r0);		# h2*r0

	algr	($h1,$d1lo);
	alcgr	($t1,$d1hi);		# $d1hi is zero

	algr	($h1,$d0hi);
	alcgr	($h2,$t1);

	lghi	($h0,-4);		# final reduction step
	ngr	($h0,$h2);
	srlg	($t0,$h2,2);
	algr	($h0,$t0);
	lghi	($t1,3);
	ngr	($h2,$t1);

	algr	($h0,$d0lo);
	alcgr	($h1,$d1hi);		# $d1hi is still zero
	alcgr	($h2,$d1hi);		# $d1hi is still zero

&{$z?	\&brctg:\&brct}	($len,".Loop");

&{$z?	\&lg:\&l}	($ctx,"2*$SIZE_T($sp)");	# restore $ctx

	stg	($h0,"0($ctx)");	# store hash value
	stg	($h1,"8($ctx)");
	stg	($h2,"16($ctx)");

&{$z?	\&lmg:\&lm}	("%r6","%r14","6*$SIZE_T($sp)");
LABEL	(".Lno_data");
	br	("%r14");
SIZE	("poly1305_blocks",".-poly1305_blocks");
}
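
# The vector code path below keeps the 130-bit accumulator in base 2^26,
# i.e. five 26-bit limbs, so that the 32x32->64-bit limb products of
# vmlof/vmalof can be accumulated without overflow. As a hedged pure-Perl
# illustration of the radix conversions it performs on entry and exit
# (assuming a 64-bit perl; the helper names are this sketch's own and are
# not used by the generator):
#
sub _base2_64_to_2_26_sketch {
	my ($h0,$h1,$h2) = @_;		# 64+64+2-bit value, least significant first
	return ( $h0        & 0x3ffffff,			# bits   0..25
		($h0 >> 26) & 0x3ffffff,			# bits  26..51
		(($h0 >> 52) | (($h1 & 0x3fff) << 12)),		# bits  52..77
		($h1 >> 14) & 0x3ffffff,			# bits  78..103
		(($h1 >> 40) | ($h2 << 24)) );			# bits 104..129
}

sub _base2_26_to_2_64_sketch {
	my @d = @_;			# five 26-bit limbs, least significant first
	my $h0 = $d[0] | ($d[1] << 26) | (($d[2] & 0xfff) << 52);
	my $h1 = ($d[2] >> 12) | ($d[3] << 14) | (($d[4] & 0xffffff) << 40);
	my $h2 = $d[4] >> 24;
	return ($h0,$h1,$h2);
}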

################
# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
#                                size_t len, u32 padbit)
{
my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));

my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));

TYPE	("poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks_vx");
LABEL	(".Lpoly1305_blocks_vx");
&{$z?	\&clgfi:\&clfi}	($len,128);
	jhe	("__poly1305_blocks_vx");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);		# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");

	llgf	("%r0","24($ctx)");	# is_base2_26
	lcgr	("%r0","%r0");

	xgr	($h0,$d0);		# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);

	lhi	("%r0",0);
	st	("%r0","24($ctx)");	# clear is_base2_26

	j	(".Lpoly1305_blocks_entry");
SIZE	("poly1305_blocks_vx",".-poly1305_blocks_vx");

TYPE	("__poly1305_mul","\@function");
ALIGN	(16);
LABEL	("__poly1305_mul");
	vmlof	($ACC0,$H0,$R0);
	vmlof	($ACC1,$H0,$R1);
	vmlof	($ACC2,$H0,$R2);
	vmlof	($ACC3,$H0,$R3);
	vmlof	($ACC4,$H0,$R4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4
	br	("%r14");
SIZE	("__poly1305_mul",".-__poly1305_mul");

TYPE	("__poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("__poly1305_blocks_vx");
&{$z?	\&lgr:\&lr}	("%r0",$sp);
&{$z?	\&stmg:\&stm}	("%r10","%r15","10*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
	ahi	($sp,-$stdframe);
	st	("%r0","0($sp)");	# back-chain

	llgfr	($len,$len);		# so that srlg works on $len
} else {
	aghi	($sp,"-($stdframe+8*8)");
	stg	("%r0","0($sp)");	# back-chain

	std	("%f8","$stdframe+0*8($sp)");
	std	("%f9","$stdframe+1*8($sp)");
	std	("%f10","$stdframe+2*8($sp)");
	std	("%f11","$stdframe+3*8($sp)");
	std	("%f12","$stdframe+4*8($sp)");
	std	("%f13","$stdframe+5*8($sp)");
	std	("%f14","$stdframe+6*8($sp)");
	std	("%f15","$stdframe+7*8($sp)");
}
	larl	("%r1",".Lconst");
	vgmg	($mask26,38,63);
	vlm	($bswaplo,$bswapmi,"16(%r1)");

	lt	("%r0","24($ctx)");	# is_base2_26?
	jnz	(".Lskip_init");

	lg	($h0,"32($ctx)");	# load key base 2^64
	lg	($h1,"40($ctx)");

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($R0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($R1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($R2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($R3,$d0,0);
	vlvgg	($R4,$d1,0);

	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vlr	($H0,$R0);
	vlr	($H1,$R1);
	vlr	($H2,$R2);
	vlr	($H3,$R3);
	vlr	($H4,$R4);
	vag	($S1,$S1,$R1);	# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^1:- * r^1:-

	vpdi	($R0,$H0,$R0,0);	# r^2:r^1
	vpdi	($R1,$H1,$R1,0);
	vpdi	($R2,$H2,$R2,0);
	vpdi	($R3,$H3,$R3,0);
	vpdi	($R4,$H4,$R4,0);
	vpdi	($H0,$H0,$H0,0);	# r^2:r^2
	vpdi	($H1,$H1,$H1,0);
	vpdi	($H2,$H2,$H2,0);
	vpdi	($H3,$H3,$H3,0);
	vpdi	($H4,$H4,$H4,0);
	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vag	($S1,$S1,$R1);	# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^2:r^2 * r^2:r^1

	vl	($I0,"0(%r1)");		# borrow $I0
	vperm	($R0,$R0,$H0,$I0);	# r^2:r^4:r^1:r^3
	vperm	($R1,$R1,$H1,$I0);
	vperm	($R2,$R2,$H2,$I0);
	vperm	($R3,$R3,$H3,$I0);
	vperm	($R4,$R4,$H4,$I0);
	veslf	($S1,$R1,2);
	veslf	($S2,$R2,2);
	veslf	($S3,$R3,2);
	veslf	($S4,$R4,2);
	vaf	($S1,$S1,$R1);	# * 5
	vaf	($S2,$S2,$R2);
	vaf	($S3,$S3,$R3);
	vaf	($S4,$S4,$R4);

	lg	($h0,"0($ctx)");	# load hash base 2^64
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

	vzero	($H0);
	vzero	($H1);
	vzero	($H2);
	vzero	($H3);
	vzero	($H4);

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($H0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($H1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($H2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($H3,$d0,0);
	risbg	($d1,$h2,37,39,24);
	vlvgg	($H4,$d1,0);

	lhi	("%r0",1);
	st	("%r0","24($ctx)");	# set is_base2_26

	vstm	($R0,$S4,"48($ctx)");	# save key schedule base 2^26

	vpdi	($R0,$R0,$R0,0);	# broadcast r^2:r^4
	vpdi	($R1,$R1,$R1,0);
	vpdi	($S1,$S1,$S1,0);
	vpdi	($R2,$R2,$R2,0);
	vpdi	($S2,$S2,$S2,0);
	vpdi	($R3,$R3,$R3,0);
	vpdi	($S3,$S3,$S3,0);
	vpdi	($R4,$R4,$R4,0);
	vpdi	($S4,$S4,$S4,0);

	j	(".Loaded_hash");

ALIGN	(16);
LABEL	(".Lskip_init");
	vllezf	($H0,"0($ctx)");	# load hash base 2^26
	vllezf	($H1,"4($ctx)");
	vllezf	($H2,"8($ctx)");
	vllezf	($H3,"12($ctx)");
	vllezf	($H4,"16($ctx)");

	vlrepg	($R0,"0x30($ctx)");	# broadcast r^2:r^4
	vlrepg	($R1,"0x40($ctx)");
	vlrepg	($S1,"0x50($ctx)");
	vlrepg	($R2,"0x60($ctx)");
	vlrepg	($S2,"0x70($ctx)");
	vlrepg	($R3,"0x80($ctx)");
	vlrepg	($S3,"0x90($ctx)");
	vlrepg	($R4,"0xa0($ctx)");
	vlrepg	($S4,"0xb0($ctx)");

LABEL	(".Loaded_hash");
	vzero	($I1);
	vzero	($I3);

	vlm	($T1,$T4,"0x00($inp)");	# load first input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);		# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	verimg	($I1,$I0,$mask26,6);	# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);		# >>4
	verimg	($I3,$T3,$mask26,18);	# >>14
	verimg	($I4,$T3,$mask26,58);	# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);		# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);	# >>26
	verimg	($I2,$T2,$mask26,60);	# >>4
	verimg	($I3,$T4,$mask26,50);	# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	srlg	("%r0",$len,6);
&{$z?	\&aghi:\&ahi}	("%r0",-1);

ALIGN	(16);
LABEL	(".Loop_vx");
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	vaf	($H2,$H2,$I2);
	vaf	($H0,$H0,$I0);
	vaf	($H3,$H3,$I3);
	vaf	($H1,$H1,$I1);
	vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vlm	($T1,$T4,"0x00($inp)");	# load next input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	verimg	($I1,$I0,$mask26,6);	# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);		# >>4
	verimg	($I3,$T3,$mask26,18);	# >>14

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vgmf	($I4,5,5);		# padbit<<2
	verimg	($I4,$T3,$mask26,58);	# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);		# >>2

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);	# >>26
	verimg	($I2,$T2,$mask26,60);	# >>4

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	verimg	($I3,$T4,$mask26,50);	# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4

&{$z?	\&brctg:\&brct}	("%r0",".Loop_vx");

	vlm	($R0,$S4,"48($ctx)");	# load all powers

	lghi	("%r0",0x30);
&{$z?	\&lcgr:\&lcr}	($len,$len);
&{$z?	\&ngr:\&nr}	($len,"%r0");
&{$z?	\&slgr:\&slr}	($inp,$len);

LABEL	(".Last");
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	vaf	($H0,$H0,$I0);
	vaf	($H1,$H1,$I1);
	vaf	($H2,$H2,$I2);
	vaf	($H3,$H3,$I3);
	vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# horizontal addition

	vzero	($H0);
	vsumqg	($ACC0,$ACC0,$H0);
	vsumqg	($ACC1,$ACC1,$H0);
	vsumqg	($ACC2,$ACC2,$H0);
	vsumqg	($ACC3,$ACC3,$H0);
	vsumqg	($ACC4,$ACC4,$H0);

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);	# h3 -> h4
	vag	($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg	($ACC4,$ACC4,2);	# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);	# h4 -> h0
	vag	($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);	# h0 -> h1
	vag	($H4,$H4,$ACC3);	# h3 -> h4

&{$z?	\&clgfi:\&clfi}	($len,0);
	je	(".Ldone");

	vlm	($T1,$T4,"0x00($inp)");	# load last partial block
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);		# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	vl	($ACC0,"0x30($len,%r1)");	# borrow $ACC0,1
	vl	($ACC1,"0x60($len,%r1)");

	verimg	($I1,$I0,$mask26,6);	# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);		# >>4
	verimg	($I3,$T3,$mask26,18);	# >>14
	verimg	($I4,$T3,$mask26,58);	# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);		# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);	# >>26
	verimg	($I2,$T2,$mask26,60);	# >>4
	verimg	($I3,$T4,$mask26,50);	# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vperm	($H0,$H0,$H0,$ACC0);	# move hash to right lane
	vn	($I0,$I0,$ACC1);	# mask redundant lane[s]
	vperm	($H1,$H1,$H1,$ACC0);
	vn	($I1,$I1,$ACC1);
	vperm	($H2,$H2,$H2,$ACC0);
	vn	($I2,$I2,$ACC1);
	vperm	($H3,$H3,$H3,$ACC0);
	vn	($I3,$I3,$ACC1);
	vperm	($H4,$H4,$H4,$ACC0);
	vn	($I4,$I4,$ACC1);

	vaf	($I0,$I0,$H0);		# accumulate hash
	vzero	($H0);			# wipe hash value
	vaf	($I1,$I1,$H1);
	vzero	($H1);
	vaf	($I2,$I2,$H2);
	vzero	($H2);
	vaf	($I3,$I3,$H3);
	vzero	($H3);
	vaf	($I4,$I4,$H4);
	vzero	($H4);

&{$z?	\&lghi:\&lhi}	($len,0);
	j	(".Last");
	# I don't bother to tell apart the cases when only one multiplication
	# pass is sufficient, because I argue that mispredicted branch
	# penalties are comparable to the overhead of a sometimes redundant
	# multiplication pass...

LABEL	(".Ldone");
	vstef	($H0,"0($ctx)",3);	# store hash base 2^26
	vstef	($H1,"4($ctx)",3);
	vstef	($H2,"8($ctx)",3);
	vstef	($H3,"12($ctx)",3);
	vstef	($H4,"16($ctx)",3);

if ($z) {
	ld	("%f8","$stdframe+0*8($sp)");
	ld	("%f9","$stdframe+1*8($sp)");
	ld	("%f10","$stdframe+2*8($sp)");
	ld	("%f11","$stdframe+3*8($sp)");
	ld	("%f12","$stdframe+4*8($sp)");
	ld	("%f13","$stdframe+5*8($sp)");
	ld	("%f14","$stdframe+6*8($sp)");
	ld	("%f15","$stdframe+7*8($sp)");
&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
} else {
	ld	("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
}
	br	("%r14");
SIZE	("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
}
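
# poly1305_emit below finishes the MAC without branching: it converts the
# accumulator back to base 2^64 if needed, adds 5 and uses the carry out of
# bit 130 to select between h and h-(2^130-5), then adds the nonce modulo
# 2^128. A hedged big-integer sketch of that final step (assuming core
# Math::BigInt and an accumulator as returned by the sketch near the top of
# this file; the sub name is this sketch's own):
#
sub _poly1305_emit_sketch {
	require Math::BigInt;
	my ($h,$n0,$n1) = @_;		# accumulator, nonce as two 64-bit words
	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
	my $two128 = Math::BigInt->new(2)->bpow(128);
	my $nonce = Math::BigInt->new($n1)->blsft(64)->badd($n0);
	my $tag = $h->copy()->bmod($p)->badd($nonce)->bmod($two128);
	return $tag;	# the real code stores this little-endian with strvg
}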

################
# static void poly1305_emit(void *ctx, unsigned char mac[16],
#                           const u32 nonce[4])
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));

GLOBL	("poly1305_emit");
TYPE	("poly1305_emit","\@function");
ALIGN	(16);
LABEL	("poly1305_emit");
LABEL	(".Lpoly1305_emit");
&{$z?	\&stmg:\&stm}	("%r6","%r10","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);		# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");

	llgf	("%r0","24($ctx)");	# is_base2_26
	lcgr	("%r0","%r0");

	xgr	($h0,$d0);		# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);

	lghi	("%r0",5);
	lgr	($d0,$h0);
	lgr	($d1,$h1);

	algr	($h0,"%r0");		# compare to modulus
	alcgr	($h1,"%r1");
	alcgr	($h2,"%r1");

	srlg	($h2,$h2,2);		# did it borrow/carry?
	slgr	("%r1",$h2);		# 0-$h2>>2
	lg	($d2,"0($nonce)");	# load nonce
	lg	($ctx,"8($nonce)");

	xgr	($h0,$d0);
	xgr	($h1,$d1);
	ngr	($h0,"%r1");
	ngr	($h1,"%r1");
	xgr	($h0,$d0);
	rllg	($d0,$d2,32);		# flip nonce words
	xgr	($h1,$d1);
	rllg	($d1,$ctx,32);

	algr	($h0,$d0);		# accumulate nonce
	alcgr	($h1,$d1);

	strvg	($h0,"0($mac)");	# write little-endian result
	strvg	($h1,"8($mac)");

&{$z?	\&lmg:\&lm}	("%r6","%r10","6*$SIZE_T($sp)");
	br	("%r14");
SIZE	("poly1305_emit",".-poly1305_emit");
}

################

ALIGN	(16);
LABEL	(".Lconst");
LONG	(0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f);	# merge odd
LONG	(0x07060504,0x03020100,0x17161514,0x13121110);	# byte swap masks
LONG	(0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
LONG	(0x00000000,0x09080706,0x00000000,0x19181716);

LONG	(0x00000000,0x00000000,0x00000000,0x0c0d0e0f);	# magic tail masks
LONG	(0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
LONG	(0x00000000,0x00000000,0x0c0d0e0f,0x00000000);

LONG	(0xffffffff,0x00000000,0xffffffff,0xffffffff);
LONG	(0xffffffff,0x00000000,0xffffffff,0x00000000);
LONG	(0x00000000,0x00000000,0xffffffff,0x00000000);

STRING	("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");

PERLASM_END();