#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for SPARCv9, vanilla, as well
# as VIS3 and FMA extensions.
#
# May, August 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#			IALU(*)		FMA
#
# UltraSPARC III	12.3(**)
# SPARC T3		7.92
# SPARC T4		1.70(***)	6.55
# SPARC64 X		5.60		3.64
#
# (*)	Comparison to compiler-generated code is really problematic,
#	because the latter's performance varies too much depending on too
#	many variables. For example, one can measure a 5x to 15x
#	improvement on T4 for gcc-4.6. Well, in the T4 case it's a bit of
#	an unfair comparison, because the compiler doesn't use VIS3, but
#	given the same initial conditions the coefficient varies from 3x to 9x.
# (**)	Pre-III performance should be even worse; floating-point
#	performance for UltraSPARC I-IV on the other hand is reported
#	to be 4.25 for hand-coded assembly, but those CPUs are simply too
#	old to care about.
# (***)	Multi-process benchmark saturates at ~12.5x the single-process
#	result on an 8-core processor, or ~21GBps per 2.85GHz socket.

# $output is the last argument if it looks like a file (it has an extension)
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

open STDOUT,">$output" if $output;

my ($ctx,$inp,$len,$padbit,$shl,$shr)	= map("%i$_",(0..5));
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)	= map("%l$_",(0..7));
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)	= map("%o$_",(0..5,7));
my ($d0,$d1,$d2,$d3)			= map("%g$_",(1..4));

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
# define	STPTR	stx
# define	SIZE_T	8
#else
# define	STPTR	st
# define	SIZE_T	4
#endif
#define	LOCALS	(STACK_BIAS+STACK_FRAME)

.section	".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl	poly1305_init
.align	32
poly1305_init:
	save	%sp,-STACK_FRAME-16,%sp
	nop

	SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
	ld	[%g1],%g1

	and	%g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
	cmp	%g1,SPARCV9_FMADD
	be	.Lpoly1305_init_fma
	nop

	stx	%g0,[$ctx+0]
	stx	%g0,[$ctx+8]		! zero hash value
	brz,pn	$inp,.Lno_key
	stx	%g0,[$ctx+16]

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	sll	$shr,3,$shr		! *8
	neg	$shr,$shl

	sethi	%hi(0x0ffffffc),$t0
	set	8,$h1
	or	$t0,%lo(0x0ffffffc),$t0
	set	16,$h2
	sllx	$t0,32,$t1
	or	$t0,$t1,$t1		! 0x0ffffffc0ffffffc
	or	$t1,3,$t0		! 0x0ffffffc0fffffff

	ldxa	[$inp+%g0]0x88,$h0	! load little-endian key
	brz,pt	$shr,.Lkey_aligned
	ldxa	[$inp+$h1]0x88,$h1

	ldxa	[$inp+$h2]0x88,$h2
	srlx	$h0,$shr,$h0
	sllx	$h1,$shl,$t2
	srlx	$h1,$shr,$h1
	or	$t2,$h0,$h0
	sllx	$h2,$shl,$h2
	or	$h2,$h1,$h1

.Lkey_aligned:
	and	$t0,$h0,$h0
	and	$t1,$h1,$h1
	stx	$h0,[$ctx+32+0]		! store key
	stx	$h1,[$ctx+32+8]

	andcc	%g1,SPARCV9_VIS3,%g0
	be	.Lno_key
	nop

1:	call	.+8
	add	%o7,poly1305_blocks_vis3-1b,%o7

	add	%o7,poly1305_emit-poly1305_blocks_vis3,%o5
	STPTR	%o7,[%i2]
	STPTR	%o5,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init,#function
.size	poly1305_init,.-poly1305_init

.globl	poly1305_blocks
.align	32
poly1305_blocks:
	save	%sp,-STACK_FRAME,%sp
	srln	$len,4,$len

	brz,pn	$len,.Lno_data
	nop

	ld	[$ctx+32+0],$r1		! load key
	ld	[$ctx+32+4],$r0
	ld	[$ctx+32+8],$r3
	ld	[$ctx+32+12],$r2

	ld	[$ctx+0],$h1		! load hash value
	ld	[$ctx+4],$h0
	ld	[$ctx+8],$h3
	ld	[$ctx+12],$h2
	ld	[$ctx+16],$h4

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	set	8,$d1
	sll	$shr,3,$shr		! *8
	set	16,$d2
	neg	$shr,$shl

	srl	$r1,2,$s1
	srl	$r2,2,$s2
	add	$r1,$s1,$s1
	srl	$r3,2,$s3
	add	$r2,$s2,$s2
	add	$r3,$s3,$s3

.Loop:
	ldxa	[$inp+%g0]0x88,$d0	! load little-endian input
	brz,pt	$shr,.Linp_aligned
	ldxa	[$inp+$d1]0x88,$d1

	ldxa	[$inp+$d2]0x88,$d2
	srlx	$d0,$shr,$d0
	sllx	$d1,$shl,$t1
	srlx	$d1,$shr,$d1
	or	$t1,$d0,$d0
	sllx	$d2,$shl,$d2
	or	$d2,$d1,$d1

.Linp_aligned:
	srlx	$d0,32,$t0
	addcc	$d0,$h0,$h0		! accumulate input
	srlx	$d1,32,$t1
	addccc	$t0,$h1,$h1
	addccc	$d1,$h2,$h2
	addccc	$t1,$h3,$h3
	addc	$padbit,$h4,$h4

	umul	$r0,$h0,$d0
	umul	$r1,$h0,$d1
	umul	$r2,$h0,$d2
	umul	$r3,$h0,$d3
	sub	$len,1,$len
	add	$inp,16,$inp

	umul	$s3,$h1,$t0
	umul	$r0,$h1,$t1
	umul	$r1,$h1,$t2
	add	$t0,$d0,$d0
	add	$t1,$d1,$d1
	umul	$r2,$h1,$t0
	add	$t2,$d2,$d2
	add	$t0,$d3,$d3

	umul	$s2,$h2,$t1
	umul	$s3,$h2,$t2
	umul	$r0,$h2,$t0
	add	$t1,$d0,$d0
	add	$t2,$d1,$d1
	umul	$r1,$h2,$t1
	add	$t0,$d2,$d2
	add	$t1,$d3,$d3

	umul	$s1,$h3,$t2
	umul	$s2,$h3,$t0
	umul	$s3,$h3,$t1
	add	$t2,$d0,$d0
	add	$t0,$d1,$d1
	umul	$r0,$h3,$t2
	add	$t1,$d2,$d2
	add	$t2,$d3,$d3

	umul	$s1,$h4,$t0
	umul	$s2,$h4,$t1
	umul	$s3,$h4,$t2
	umul	$r0,$h4,$h4
	add	$t0,$d1,$d1
	add	$t1,$d2,$d2
	srlx	$d0,32,$h1
	add	$t2,$d3,$d3
	srlx	$d1,32,$h2

	addcc	$d1,$h1,$h1
	srlx	$d2,32,$h3
	set	8,$d1
	addccc	$d2,$h2,$h2
	srlx	$d3,32,$t0
	set	16,$d2
	addccc	$d3,$h3,$h3
	addc	$t0,$h4,$h4

	srl	$h4,2,$t0		! final reduction step
	andn	$h4,3,$t1
	and	$h4,3,$h4
	add	$t1,$t0,$t0

	addcc	$t0,$d0,$h0
	addccc	%g0,$h1,$h1
	addccc	%g0,$h2,$h2
	addccc	%g0,$h3,$h3
	brnz,pt	$len,.Loop
	addc	%g0,$h4,$h4

	st	$h1,[$ctx+0]		! store hash value
	st	$h0,[$ctx+4]
	st	$h3,[$ctx+8]
	st	$h2,[$ctx+12]
	st	$h4,[$ctx+16]

.Lno_data:
	ret
	restore
.type	poly1305_blocks,#function
.size	poly1305_blocks,.-poly1305_blocks
___
########################################################################
# VIS3 has umulxhi and addxc...
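#
# Editorial note (added, not from the original module), sketching the
# reduction trick behind the "$S1 = $R1 + ($R1>>2)" precomputation in the
# VIS3 path below: arithmetic is modulo p = 2^130-5, so 4*2^128 = 2^130 == 5
# (mod p), i.e. a product term h1*r1 of weight 2^128 folds back into the
# lowest limb as h1*(r1*5/4). Key clamping leaves the low two bits of the
# upper key limb r1 clear, so r1*5/4 is an exact 64-bit integer.
# Hypothetical standalone Perl illustration (not used by the module):
#
#	my $r1 = 0x0ffffffc_0ffffffc;	# clamped upper key limb
#	my $s1 = $r1 + ($r1 >> 2);	# == r1*5/4, exact because r1%4 == 0
#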
{
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));

$code.=<<___;
.align	32
poly1305_blocks_vis3:
	save	%sp,-STACK_FRAME,%sp
	srln	$len,4,$len

	brz,pn	$len,.Lno_data
	nop

	ldx	[$ctx+32+0],$R0		! load key
	ldx	[$ctx+32+8],$R1

	ldx	[$ctx+0],$H0		! load hash value
	ldx	[$ctx+8],$H1
	ld	[$ctx+16],$H2

	and	$inp,7,$shr		! alignment factor
	andn	$inp,7,$inp
	set	8,$r1
	sll	$shr,3,$shr		! *8
	set	16,$r2
	neg	$shr,$shl

	srlx	$R1,2,$S1
	b	.Loop_vis3
	add	$R1,$S1,$S1

.Loop_vis3:
	ldxa	[$inp+%g0]0x88,$D0	! load little-endian input
	brz,pt	$shr,.Linp_aligned_vis3
	ldxa	[$inp+$r1]0x88,$D1

	ldxa	[$inp+$r2]0x88,$D2
	srlx	$D0,$shr,$D0
	sllx	$D1,$shl,$T1
	srlx	$D1,$shr,$D1
	or	$T1,$D0,$D0
	sllx	$D2,$shl,$D2
	or	$D2,$D1,$D1

.Linp_aligned_vis3:
	addcc	$D0,$H0,$H0		! accumulate input
	sub	$len,1,$len
	addxccc	$D1,$H1,$H1
	add	$inp,16,$inp

	mulx	$R0,$H0,$D0		! r0*h0
	addxc	$padbit,$H2,$H2
	umulxhi	$R0,$H0,$D1
	mulx	$S1,$H1,$T0		! s1*h1
	umulxhi	$S1,$H1,$T1
	addcc	$T0,$D0,$D0
	mulx	$R1,$H0,$T0		! r1*h0
	addxc	$T1,$D1,$D1
	umulxhi	$R1,$H0,$D2
	addcc	$T0,$D1,$D1
	mulx	$R0,$H1,$T0		! r0*h1
	addxc	%g0,$D2,$D2
	umulxhi	$R0,$H1,$T1
	addcc	$T0,$D1,$D1
	mulx	$S1,$H2,$T0		! s1*h2
	addxc	$T1,$D2,$D2
	mulx	$R0,$H2,$T1		! r0*h2
	addcc	$T0,$D1,$D1
	addxc	$T1,$D2,$D2

	srlx	$D2,2,$T0		! final reduction step
	andn	$D2,3,$T1
	and	$D2,3,$H2
	add	$T1,$T0,$T0

	addcc	$T0,$D0,$H0
	addxccc	%g0,$D1,$H1
	brnz,pt	$len,.Loop_vis3
	addxc	%g0,$H2,$H2

	stx	$H0,[$ctx+0]		! store hash value
	stx	$H1,[$ctx+8]
	st	$H2,[$ctx+16]

	ret
	restore
.type	poly1305_blocks_vis3,#function
.size	poly1305_blocks_vis3,.-poly1305_blocks_vis3
___
}
my ($mac,$nonce) = ($inp,$len);

$code.=<<___;
.globl	poly1305_emit
.align	32
poly1305_emit:
	save	%sp,-STACK_FRAME,%sp

	ld	[$ctx+0],$h1		! load hash value
	ld	[$ctx+4],$h0
	ld	[$ctx+8],$h3
	ld	[$ctx+12],$h2
	ld	[$ctx+16],$h4

	addcc	$h0,5,$r0		! compare to modulus
	addccc	$h1,0,$r1
	addccc	$h2,0,$r2
	addccc	$h3,0,$r3
	addc	$h4,0,$h4
	andcc	$h4,4,%g0		! did it carry/borrow?

	movnz	%icc,$r0,$h0
	ld	[$nonce+0],$r0		! load nonce
	movnz	%icc,$r1,$h1
	ld	[$nonce+4],$r1
	movnz	%icc,$r2,$h2
	ld	[$nonce+8],$r2
	movnz	%icc,$r3,$h3
	ld	[$nonce+12],$r3

	addcc	$r0,$h0,$h0		! accumulate nonce
	addccc	$r1,$h1,$h1
	addccc	$r2,$h2,$h2
	addc	$r3,$h3,$h3

	srl	$h0,8,$r0
	stb	$h0,[$mac+0]		! store little-endian result
	srl	$h0,16,$r1
	stb	$r0,[$mac+1]
	srl	$h0,24,$r2
	stb	$r1,[$mac+2]
	stb	$r2,[$mac+3]

	srl	$h1,8,$r0
	stb	$h1,[$mac+4]
	srl	$h1,16,$r1
	stb	$r0,[$mac+5]
	srl	$h1,24,$r2
	stb	$r1,[$mac+6]
	stb	$r2,[$mac+7]

	srl	$h2,8,$r0
	stb	$h2,[$mac+8]
	srl	$h2,16,$r1
	stb	$r0,[$mac+9]
	srl	$h2,24,$r2
	stb	$r1,[$mac+10]
	stb	$r2,[$mac+11]

	srl	$h3,8,$r0
	stb	$h3,[$mac+12]
	srl	$h3,16,$r1
	stb	$r0,[$mac+13]
	srl	$h3,24,$r2
	stb	$r1,[$mac+14]
	stb	$r2,[$mac+15]

	ret
	restore
.type	poly1305_emit,#function
.size	poly1305_emit,.-poly1305_emit
___

{
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
my $i2=$step;

my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);

$code.=<<___;
.align	32
poly1305_init_fma:
	save	%sp,-STACK_FRAME-16,%sp
	nop

.Lpoly1305_init_fma:
1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],$two0		! load constants
	ldd	[%o7+8*1],$two32
	ldd	[%o7+8*2],$two64
	ldd	[%o7+8*3],$two96
	ldd	[%o7+8*5],$five_two130

	std	$two0,[$ctx+8*0]	! initial hash value, biased 0
	std	$two32,[$ctx+8*1]
	std	$two64,[$ctx+8*2]
	std	$two96,[$ctx+8*3]

	brz,pn	$inp,.Lno_key_fma
	nop

	stx	%fsr,[%sp+LOCALS]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr

	std	$two0,[$ctx+8*4]	! key "template"
	std	$two32,[$ctx+8*5]
	std	$two64,[$ctx+8*6]
	std	$two96,[$ctx+8*7]

	and	$inp,7,$shr
	andn	$inp,7,$inp		! align pointer
	mov	8,$i1
	sll	$shr,3,$shr
	mov	16,$i2
	neg	$shr,$shl

	ldxa	[$inp+%g0]0x88,$in0	! load little-endian key
	ldxa	[$inp+$i1]0x88,$in2

	brz	$shr,.Lkey_aligned_fma
	sethi	%hi(0xf0000000),$i1	! 0xf0000000

	ldxa	[$inp+$i2]0x88,$in4

	srlx	$in0,$shr,$in0		! align data
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in2
	or	$in1,$in0,$in0
	sllx	$in4,$shl,$in3
	or	$in3,$in2,$in2

.Lkey_aligned_fma:
	or	$i1,3,$i2		! 0xf0000003
	srlx	$in0,32,$in1
	andn	$in0,$i1,$in0		! &=0x0fffffff
	andn	$in1,$i2,$in1		! &=0x0ffffffc
	srlx	$in2,32,$in3
	andn	$in2,$i2,$in2
	andn	$in3,$i2,$in3

	st	$in0,[$ctx+`8*4+4`]	! fill "template"
	st	$in1,[$ctx+`8*5+4`]
	st	$in2,[$ctx+`8*6+4`]
	st	$in3,[$ctx+`8*7+4`]

	ldd	[$ctx+8*4],$h0lo	! load [biased] key
	ldd	[$ctx+8*5],$h1lo
	ldd	[$ctx+8*6],$h2lo
	ldd	[$ctx+8*7],$h3lo

	fsubd	$h0lo,$two0, $h0lo	! r0
	ldd	[%o7+8*7],$two0		! more constants
	fsubd	$h1lo,$two32,$h1lo	! r1
	ldd	[%o7+8*8],$two32
	fsubd	$h2lo,$two64,$h2lo	! r2
	ldd	[%o7+8*9],$two64
	fsubd	$h3lo,$two96,$h3lo	! r3
	ldd	[%o7+8*10],$two96

	fmuld	$five_two130,$h1lo,$s1lo ! s1
	fmuld	$five_two130,$h2lo,$s2lo ! s2
	fmuld	$five_two130,$h3lo,$s3lo ! s3

	faddd	$h0lo,$two0, $h0hi
	faddd	$h1lo,$two32,$h1hi
	faddd	$h2lo,$two64,$h2hi
	faddd	$h3lo,$two96,$h3hi

	fsubd	$h0hi,$two0, $h0hi
	ldd	[%o7+8*11],$two0	! more constants
	fsubd	$h1hi,$two32,$h1hi
	ldd	[%o7+8*12],$two32
	fsubd	$h2hi,$two64,$h2hi
	ldd	[%o7+8*13],$two64
	fsubd	$h3hi,$two96,$h3hi

	fsubd	$h0lo,$h0hi,$h0lo
	std	$h0hi,[$ctx+8*5]	! r0hi
	fsubd	$h1lo,$h1hi,$h1lo
	std	$h1hi,[$ctx+8*7]	! r1hi
	fsubd	$h2lo,$h2hi,$h2lo
	std	$h2hi,[$ctx+8*9]	! r2hi
	fsubd	$h3lo,$h3hi,$h3lo
	std	$h3hi,[$ctx+8*11]	! r3hi

	faddd	$s1lo,$two0, $s1hi
	faddd	$s2lo,$two32,$s2hi
	faddd	$s3lo,$two64,$s3hi

	fsubd	$s1hi,$two0, $s1hi
	fsubd	$s2hi,$two32,$s2hi
	fsubd	$s3hi,$two64,$s3hi

	fsubd	$s1lo,$s1hi,$s1lo
	fsubd	$s2lo,$s2hi,$s2lo
	fsubd	$s3lo,$s3hi,$s3lo

	ldx	[%sp+LOCALS],%fsr	! restore %fsr

	std	$h0lo,[$ctx+8*4]	! r0lo
	std	$h1lo,[$ctx+8*6]	! r1lo
	std	$h2lo,[$ctx+8*8]	! r2lo
	std	$h3lo,[$ctx+8*10]	! r3lo

	std	$s1hi,[$ctx+8*13]
	std	$s2hi,[$ctx+8*15]
	std	$s3hi,[$ctx+8*17]

	std	$s1lo,[$ctx+8*12]
	std	$s2lo,[$ctx+8*14]
	std	$s3lo,[$ctx+8*16]

	add	%o7,poly1305_blocks_fma-.Lconsts_fma,%o0
	add	%o7,poly1305_emit_fma-.Lconsts_fma,%o1
	STPTR	%o0,[%i2]
	STPTR	%o1,[%i2+SIZE_T]

	ret
	restore	%g0,1,%o0		! return 1

.Lno_key_fma:
	ret
	restore	%g0,%g0,%o0		! return 0
.type	poly1305_init_fma,#function
.size	poly1305_init_fma,.-poly1305_init_fma

.align	32
poly1305_blocks_fma:
	save	%sp,-STACK_FRAME-48,%sp
	srln	$len,4,$len

	brz,pn	$len,.Labort
	sub	$len,1,$len

1:	call	.+8
	add	%o7,.Lconsts_fma-1b,%o7

	ldd	[%o7+8*0],$two0		! load constants
	ldd	[%o7+8*1],$two32
	ldd	[%o7+8*2],$two64
	ldd	[%o7+8*3],$two96
	ldd	[%o7+8*4],$two130
	ldd	[%o7+8*5],$five_two130

	ldd	[$ctx+8*0],$h0lo	! load [biased] hash value
	ldd	[$ctx+8*1],$h1lo
	ldd	[$ctx+8*2],$h2lo
	ldd	[$ctx+8*3],$h3lo

	std	$two0,[%sp+LOCALS+8*0]	! input "template"
	sethi	%hi((1023+52+96)<<20),$in3
	std	$two32,[%sp+LOCALS+8*1]
	or	$padbit,$in3,$in3
	std	$two64,[%sp+LOCALS+8*2]
	st	$in3,[%sp+LOCALS+8*3]

	and	$inp,7,$shr
	andn	$inp,7,$inp		! align pointer
	mov	8,$i1
	sll	$shr,3,$shr
	mov	16,$step
	neg	$shr,$shl

	ldxa	[$inp+%g0]0x88,$in0	! load little-endian input
	brz	$shr,.Linp_aligned_fma
	ldxa	[$inp+$i1]0x88,$in2

	ldxa	[$inp+$step]0x88,$in4
	add	$inp,8,$inp

	srlx	$in0,$shr,$in0		! align data
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in2
	or	$in1,$in0,$in0
	sllx	$in4,$shl,$in3
	srlx	$in4,$shr,$in4		! pre-shift
	or	$in3,$in2,$in2

.Linp_aligned_fma:
	srlx	$in0,32,$in1
	movrz	$len,0,$step
	srlx	$in2,32,$in3
	add	$step,$inp,$inp		! conditional advance

	st	$in0,[%sp+LOCALS+8*0+4]	! fill "template"
	st	$in1,[%sp+LOCALS+8*1+4]
	st	$in2,[%sp+LOCALS+8*2+4]
	st	$in3,[%sp+LOCALS+8*3+4]

	ldd	[$ctx+8*4],$r0lo	! load key
	ldd	[$ctx+8*5],$r0hi
	ldd	[$ctx+8*6],$r1lo
	ldd	[$ctx+8*7],$r1hi
	ldd	[$ctx+8*8],$r2lo
	ldd	[$ctx+8*9],$r2hi
	ldd	[$ctx+8*10],$r3lo
	ldd	[$ctx+8*11],$r3hi
	ldd	[$ctx+8*12],$s1lo
	ldd	[$ctx+8*13],$s1hi
	ldd	[$ctx+8*14],$s2lo
	ldd	[$ctx+8*15],$s2hi
	ldd	[$ctx+8*16],$s3lo
	ldd	[$ctx+8*17],$s3hi

	stx	%fsr,[%sp+LOCALS+8*4]	! save original %fsr
	ldx	[%o7+8*6],%fsr		! load new %fsr

	subcc	$len,1,$len
	movrz	$len,0,$step

	ldd	[%sp+LOCALS+8*0],$x0	! load biased input
	ldd	[%sp+LOCALS+8*1],$x1
	ldd	[%sp+LOCALS+8*2],$x2
	ldd	[%sp+LOCALS+8*3],$x3

	fsubd	$h0lo,$two0, $h0lo	! de-bias hash value
	fsubd	$h1lo,$two32,$h1lo
	ldxa	[$inp+%g0]0x88,$in0	! modulo-scheduled input load
	fsubd	$h2lo,$two64,$h2lo
	fsubd	$h3lo,$two96,$h3lo
	ldxa	[$inp+$i1]0x88,$in2

	fsubd	$x0,$two0, $x0		! de-bias input
	fsubd	$x1,$two32,$x1
	fsubd	$x2,$two64,$x2
	fsubd	$x3,$two96,$x3

	brz	$shr,.Linp_aligned_fma2
	add	$step,$inp,$inp		! conditional advance

	sllx	$in0,$shl,$in1		! align data
	srlx	$in0,$shr,$in3
	or	$in1,$in4,$in0
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in4		! pre-shift
	or	$in3,$in1,$in2
.Linp_aligned_fma2:
	srlx	$in0,32,$in1
	srlx	$in2,32,$in3

	faddd	$h0lo,$x0,$x0		! accumulate input
	stw	$in0,[%sp+LOCALS+8*0+4]
	faddd	$h1lo,$x1,$x1
	stw	$in1,[%sp+LOCALS+8*1+4]
	faddd	$h2lo,$x2,$x2
	stw	$in2,[%sp+LOCALS+8*2+4]
	faddd	$h3lo,$x3,$x3
	stw	$in3,[%sp+LOCALS+8*3+4]

	b	.Lentry_fma
	nop

.align	16
.Loop_fma:
	ldxa	[$inp+%g0]0x88,$in0	! modulo-scheduled input load
	ldxa	[$inp+$i1]0x88,$in2
	movrz	$len,0,$step

	faddd	$y0,$h0lo,$h0lo		! accumulate input
	faddd	$y1,$h0hi,$h0hi
	faddd	$y2,$h2lo,$h2lo
	faddd	$y3,$h2hi,$h2hi

	brz,pn	$shr,.Linp_aligned_fma3
	add	$step,$inp,$inp		! conditional advance

	sllx	$in0,$shl,$in1		! align data
	srlx	$in0,$shr,$in3
	or	$in1,$in4,$in0
	sllx	$in2,$shl,$in1
	srlx	$in2,$shr,$in4		! pre-shift
	or	$in3,$in1,$in2

.Linp_aligned_fma3:
	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	$two64,$h1lo,$c1lo
	srlx	$in0,32,$in1
	faddd	$two64,$h1hi,$c1hi
	srlx	$in2,32,$in3
	faddd	$two130,$h3lo,$c3lo
	st	$in0,[%sp+LOCALS+8*0+4]	! fill "template"
	faddd	$two130,$h3hi,$c3hi
	st	$in1,[%sp+LOCALS+8*1+4]
	faddd	$two32,$h0lo,$c0lo
	st	$in2,[%sp+LOCALS+8*2+4]
	faddd	$two32,$h0hi,$c0hi
	st	$in3,[%sp+LOCALS+8*3+4]
	faddd	$two96,$h2lo,$c2lo
	faddd	$two96,$h2hi,$c2hi

	fsubd	$c1lo,$two64,$c1lo
	fsubd	$c1hi,$two64,$c1hi
	fsubd	$c3lo,$two130,$c3lo
	fsubd	$c3hi,$two130,$c3hi
	fsubd	$c0lo,$two32,$c0lo
	fsubd	$c0hi,$two32,$c0hi
	fsubd	$c2lo,$two96,$c2lo
	fsubd	$c2hi,$two96,$c2hi

	fsubd	$h1lo,$c1lo,$h1lo
	fsubd	$h1hi,$c1hi,$h1hi
	fsubd	$h3lo,$c3lo,$h3lo
	fsubd	$h3hi,$c3hi,$h3hi
	fsubd	$h2lo,$c2lo,$h2lo
	fsubd	$h2hi,$c2hi,$h2hi
	fsubd	$h0lo,$c0lo,$h0lo
	fsubd	$h0hi,$c0hi,$h0hi

	faddd	$h1lo,$c0lo,$h1lo
	faddd	$h1hi,$c0hi,$h1hi
	faddd	$h3lo,$c2lo,$h3lo
	faddd	$h3hi,$c2hi,$h3hi
	faddd	$h2lo,$c1lo,$h2lo
	faddd	$h2hi,$c1hi,$h2hi
	fmaddd	$five_two130,$c3lo,$h0lo,$h0lo
	fmaddd	$five_two130,$c3hi,$h0hi,$h0hi

	faddd	$h1lo,$h1hi,$x1
	ldd	[$ctx+8*12],$s1lo	! reload constants
	faddd	$h3lo,$h3hi,$x3
	ldd	[$ctx+8*13],$s1hi
	faddd	$h2lo,$h2hi,$x2
	ldd	[$ctx+8*10],$r3lo
	faddd	$h0lo,$h0hi,$x0
	ldd	[$ctx+8*11],$r3hi

.Lentry_fma:
	fmuld	$x1,$s3lo,$h0lo
	fmuld	$x1,$s3hi,$h0hi
	fmuld	$x1,$r1lo,$h2lo
	fmuld	$x1,$r1hi,$h2hi
	fmuld	$x1,$r0lo,$h1lo
	fmuld	$x1,$r0hi,$h1hi
	fmuld	$x1,$r2lo,$h3lo
	fmuld	$x1,$r2hi,$h3hi

	fmaddd	$x3,$s1lo,$h0lo,$h0lo
	fmaddd	$x3,$s1hi,$h0hi,$h0hi
	fmaddd	$x3,$s3lo,$h2lo,$h2lo
	fmaddd	$x3,$s3hi,$h2hi,$h2hi
	fmaddd	$x3,$s2lo,$h1lo,$h1lo
	fmaddd	$x3,$s2hi,$h1hi,$h1hi
	fmaddd	$x3,$r0lo,$h3lo,$h3lo
	fmaddd	$x3,$r0hi,$h3hi,$h3hi

	fmaddd	$x2,$s2lo,$h0lo,$h0lo
	fmaddd	$x2,$s2hi,$h0hi,$h0hi
	fmaddd	$x2,$r0lo,$h2lo,$h2lo
	fmaddd	$x2,$r0hi,$h2hi,$h2hi
	fmaddd	$x2,$s3lo,$h1lo,$h1lo
	ldd	[%sp+LOCALS+8*0],$y0	! load [biased] input
	fmaddd	$x2,$s3hi,$h1hi,$h1hi
	ldd	[%sp+LOCALS+8*1],$y1
	fmaddd	$x2,$r1lo,$h3lo,$h3lo
	ldd	[%sp+LOCALS+8*2],$y2
	fmaddd	$x2,$r1hi,$h3hi,$h3hi
	ldd	[%sp+LOCALS+8*3],$y3

	fmaddd	$x0,$r0lo,$h0lo,$h0lo
	fsubd	$y0,$two0, $y0		! de-bias input
	fmaddd	$x0,$r0hi,$h0hi,$h0hi
	fsubd	$y1,$two32,$y1
	fmaddd	$x0,$r2lo,$h2lo,$h2lo
	fsubd	$y2,$two64,$y2
	fmaddd	$x0,$r2hi,$h2hi,$h2hi
	fsubd	$y3,$two96,$y3
	fmaddd	$x0,$r1lo,$h1lo,$h1lo
	fmaddd	$x0,$r1hi,$h1hi,$h1hi
	fmaddd	$x0,$r3lo,$h3lo,$h3lo
	fmaddd	$x0,$r3hi,$h3hi,$h3hi

	bcc	SIZE_T_CC,.Loop_fma
	subcc	$len,1,$len

	!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
	faddd	$h0lo,$two32,$c0lo
	faddd	$h0hi,$two32,$c0hi
	faddd	$h2lo,$two96,$c2lo
	faddd	$h2hi,$two96,$c2hi
	faddd	$h1lo,$two64,$c1lo
	faddd	$h1hi,$two64,$c1hi
	faddd	$h3lo,$two130,$c3lo
	faddd	$h3hi,$two130,$c3hi

	fsubd	$c0lo,$two32,$c0lo
	fsubd	$c0hi,$two32,$c0hi
	fsubd	$c2lo,$two96,$c2lo
	fsubd	$c2hi,$two96,$c2hi
	fsubd	$c1lo,$two64,$c1lo
	fsubd	$c1hi,$two64,$c1hi
	fsubd	$c3lo,$two130,$c3lo
	fsubd	$c3hi,$two130,$c3hi

	fsubd	$h1lo,$c1lo,$h1lo
	fsubd	$h1hi,$c1hi,$h1hi
	fsubd	$h3lo,$c3lo,$h3lo
	fsubd	$h3hi,$c3hi,$h3hi
	fsubd	$h2lo,$c2lo,$h2lo
	fsubd	$h2hi,$c2hi,$h2hi
	fsubd	$h0lo,$c0lo,$h0lo
	fsubd	$h0hi,$c0hi,$h0hi

	faddd	$h1lo,$c0lo,$h1lo
	faddd	$h1hi,$c0hi,$h1hi
	faddd	$h3lo,$c2lo,$h3lo
	faddd	$h3hi,$c2hi,$h3hi
	faddd	$h2lo,$c1lo,$h2lo
	faddd	$h2hi,$c1hi,$h2hi
	fmaddd	$five_two130,$c3lo,$h0lo,$h0lo
	fmaddd	$five_two130,$c3hi,$h0hi,$h0hi

	faddd	$h1lo,$h1hi,$x1
	faddd	$h3lo,$h3hi,$x3
	faddd	$h2lo,$h2hi,$x2
	faddd	$h0lo,$h0hi,$x0

	faddd	$x1,$two32,$x1		! bias
	faddd	$x3,$two96,$x3
	faddd	$x2,$two64,$x2
	faddd	$x0,$two0, $x0

	ldx	[%sp+LOCALS+8*4],%fsr	! restore saved %fsr

	std	$x1,[$ctx+8*1]		! store [biased] hash value
	std	$x3,[$ctx+8*3]
	std	$x2,[$ctx+8*2]
	std	$x0,[$ctx+8*0]

.Labort:
	ret
	restore
.type	poly1305_blocks_fma,#function
.size	poly1305_blocks_fma,.-poly1305_blocks_fma
___
{
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
   ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));

$code.=<<___;
.align	32
poly1305_emit_fma:
	save	%sp,-STACK_FRAME,%sp

	ld	[$ctx+8*0+0],$d0	! load hash
	ld	[$ctx+8*0+4],$h0
	ld	[$ctx+8*1+0],$d1
	ld	[$ctx+8*1+4],$h1
	ld	[$ctx+8*2+0],$d2
	ld	[$ctx+8*2+4],$h2
	ld	[$ctx+8*3+0],$d3
	ld	[$ctx+8*3+4],$h3

	sethi	%hi(0xfff00000),$mask
	andn	$d0,$mask,$d0		! mask exponent
	andn	$d1,$mask,$d1
	andn	$d2,$mask,$d2
	andn	$d3,$mask,$d3		! can be partially reduced...
	mov	3,$mask

	srl	$d3,2,$padbit		! ... so reduce
	and	$d3,$mask,$h4
	andn	$d3,$mask,$d3
	add	$padbit,$d3,$d3

	addcc	$d3,$h0,$h0
	addccc	$d0,$h1,$h1
	addccc	$d1,$h2,$h2
	addccc	$d2,$h3,$h3
	addc	%g0,$h4,$h4

	addcc	$h0,5,$d0		! compare to modulus
	addccc	$h1,0,$d1
	addccc	$h2,0,$d2
	addccc	$h3,0,$d3
	addc	$h4,0,$mask

	srl	$mask,2,$mask		! did it carry/borrow?
	neg	$mask,$mask
	sra	$mask,31,$mask		! mask

	andn	$h0,$mask,$h0
	and	$d0,$mask,$d0
	andn	$h1,$mask,$h1
	and	$d1,$mask,$d1
	or	$d0,$h0,$h0
	ld	[$nonce+0],$d0		! load nonce
	andn	$h2,$mask,$h2
	and	$d2,$mask,$d2
	or	$d1,$h1,$h1
	ld	[$nonce+4],$d1
	andn	$h3,$mask,$h3
	and	$d3,$mask,$d3
	or	$d2,$h2,$h2
	ld	[$nonce+8],$d2
	or	$d3,$h3,$h3
	ld	[$nonce+12],$d3

	addcc	$d0,$h0,$h0		! accumulate nonce
	addccc	$d1,$h1,$h1
	addccc	$d2,$h2,$h2
	addc	$d3,$h3,$h3

	stb	$h0,[$mac+0]		! write little-endian result
	srl	$h0,8,$h0
	stb	$h1,[$mac+4]
	srl	$h1,8,$h1
	stb	$h2,[$mac+8]
	srl	$h2,8,$h2
	stb	$h3,[$mac+12]
	srl	$h3,8,$h3

	stb	$h0,[$mac+1]
	srl	$h0,8,$h0
	stb	$h1,[$mac+5]
	srl	$h1,8,$h1
	stb	$h2,[$mac+9]
	srl	$h2,8,$h2
	stb	$h3,[$mac+13]
	srl	$h3,8,$h3

	stb	$h0,[$mac+2]
	srl	$h0,8,$h0
	stb	$h1,[$mac+6]
	srl	$h1,8,$h1
	stb	$h2,[$mac+10]
	srl	$h2,8,$h2
	stb	$h3,[$mac+14]
	srl	$h3,8,$h3

	stb	$h0,[$mac+3]
	stb	$h1,[$mac+7]
	stb	$h2,[$mac+11]
	stb	$h3,[$mac+15]

	ret
	restore
.type	poly1305_emit_fma,#function
.size	poly1305_emit_fma,.-poly1305_emit_fma
___
}

$code.=<<___;
.align	64
.Lconsts_fma:
.word	0x43300000,0x00000000		! 2^(52+0)
.word	0x45300000,0x00000000		! 2^(52+32)
.word	0x47300000,0x00000000		! 2^(52+64)
.word	0x49300000,0x00000000		! 2^(52+96)
.word	0x4b500000,0x00000000		! 2^(52+130)

.word	0x37f40000,0x00000000		! 5/2^130
.word	0,1<<30				! fsr: truncate, no exceptions

.word	0x44300000,0x00000000		! 2^(52+16+0)
.word	0x46300000,0x00000000		! 2^(52+16+32)
.word	0x48300000,0x00000000		! 2^(52+16+64)
.word	0x4a300000,0x00000000		! 2^(52+16+96)
.word	0x3e300000,0x00000000		! 2^(52+16+0-96)
.word	0x40300000,0x00000000		! 2^(52+16+32-96)
.word	0x42300000,0x00000000		! 2^(52+16+64-96)
.asciz	"Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}

# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to retain the option of producing a "universal" binary and
# letting the program detect at run-time whether the current CPU is VIS-capable.
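#
# Illustrative usage (an added note, not part of the original code): a
# recognized VIS3 mnemonic is rewritten by unvis3() below into a raw .word
# that any pre-VIS3 assembler accepts, while anything it does not recognize
# is returned as plain text for the assembler to handle. For example (value
# computed from the encoding formula in the sub):
#
#	unvis3("addxc","%o1","%o2","%o3")
#	#   -> ".word\t0x97b2422a !addxc\t%o1,%o2,%o3"
#	#      i.e. 0x81b00000 | 11<<25 | 9<<14 | 0x011<<5 | 10
#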
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfma {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %fmaopf = (	"fmadds"	=> 0x1,
		"fmaddd"	=> 0x2,
		"fmsubs"	=> 0x5,
		"fmsubd"	=> 0x6	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if ($opf=$fmaopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge	or
	s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
		&unfma($1,$2,$3,$4,$5)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";