1#! /usr/bin/env perl 2# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# March 2010 18# 19# The module implements "4-bit" GCM GHASH function and underlying 20# single multiplication operation in GF(2^128). "4-bit" means that it 21# uses 256 bytes per-key table [+128 bytes shared table]. Performance 22# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU 23# and are expressed in cycles per processed byte, less is better: 24# 25# gcc 3.3.x cc 5.2 this assembler 26# 27# 32-bit build 81.4 43.3 12.6 (+546%/+244%) 28# 64-bit build 20.2 21.2 12.6 (+60%/+68%) 29# 30# Here is data collected on UltraSPARC T1 system running Linux: 31# 32# gcc 4.4.1 this assembler 33# 34# 32-bit build 566 50 (+1000%) 35# 64-bit build 56 50 (+12%) 36# 37# I don't quite understand why difference between 32-bit and 64-bit 38# compiler-generated code is so big. Compilers *were* instructed to 39# generate code for UltraSPARC and should have used 64-bit registers 40# for Z vector (see C code) even in 32-bit build... Oh well, it only 41# means more impressive improvement coefficients for this assembler 42# module;-) Loops are aggressively modulo-scheduled in respect to 43# references to input data and Z.hi updates to achieve 12 cycles 44# timing. 
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.

# Last command-line argument, if present, is the output file; redirect
# STDOUT there.  Use checked three-arg open so a bad path fails loudly
# instead of silently discarding the generated assembly.
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Stack geometry macros, expanded by crypto/sparc_arch.h at assembly time.
$frame="STACK_FRAME";
$bias="STACK_BIAS";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

# 4-bit table-driven GHASH: rem_4bit holds the reduction constants,
# gcm_ghash_4bit streams $len bytes from $inp into the Xi accumulator.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

# gcm_gmult_4bit takes no input/length arguments; release those register
# names so the VIS3 section below can rebind them.
undef $inp;
undef $len;

# Single-block multiplication: same inner machinery as gcm_ghash_4bit,
# but the data byte comes from Xi itself.
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___


# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# Encode a VIS3 instruction as a raw .word directive so the module can
# be assembled even when the assembler does not know the VIS3 mnemonics.
# Only the mnemonics listed in %opf_of with three plain integer-register
# operands are translated; anything else is returned verbatim.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # Architectural register number = register-window group bias + index.
    my %window_bias = ("g" => 0, "o" => 8, "l" => 16, "i" => 24);

    # opf field values for the VIS3 instructions this module emits.
    my %opf_of = (
        "addxc"   => 0x011,
        "addxccc" => 0x013,
        "xmulx"   => 0x115,
        "xmulxhi" => 0x116,
    );

    my $asis = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $opf_of{$mnemonic};
    return $asis unless $opf;

    # Translate each operand to its architectural register number,
    # falling back to the untouched mnemonic on anything unexpected.
    my @regs;
    for my $operand ($rs1, $rs2, $rd) {
        return $asis unless $operand =~ /%([goli])([0-9])/;
        push @regs, $window_bias{$1} + $2;
    }
    my ($nrs1, $nrs2, $nrd) = @regs;

    my $word = 0x81b00000 | $nrd << 25 | $nrs1 << 14 | $opf << 5 | $nrs2;
    return sprintf ".word\t0x%08x !%s", $word, $asis;
}

# Post-process the generated code: evaluate the `...` constant
# expressions, encode VIS3 instructions, and emit the result.
for my $line (split("\n", $code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1, $2, $3, $4)
    /ge;

    print $line, "\n";
}

close STDOUT or die "error closing STDOUT: $!";