#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes shared
# table]. Even though the loops are aggressively modulo-scheduled with
# respect to references to Htbl and Z.hi updates for 8 cycles per
# byte, measured performance is ~12 cycles per processed byte on a
# 21264 CPU. This appears to be a dynamic scheduling "glitch," because
# uprofile(1) indicates a uniform sample distribution, as if all
# instruction bundles execute in 1.5 cycles. In other words it could
# have been even faster; still, 12 cycles is ~60% better than
# gcc-generated code and ~80% better than code generated by the vendor
# compiler.

$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp
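
	# Bridge between the two halves: the final low-half table words
	# are folded in below while extbl starts picking bytes out of
	# the high half for the .Loophi loop.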

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop

.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

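	# Outer loop: each iteration gathers one possibly misaligned
	# 16-byte block with the ldq_u/extql/extqh pairs, XORs it into
	# Xi and runs the same 4-bit multiply as gcm_gmult_4bit.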
.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";
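
# For reference, below is a minimal Perl model of the multiply the
# generated code performs (Shoup's 4-bit table method). It is purely
# illustrative and not part of the build; the sub and variable names
# are local to this sketch. It assumes a 64-bit perl, Htbl given as 16
# [hi,lo] pairs holding the nibble multiples of H (the layout the
# C-side gcm_init_4bit produces), and rem_4bit entries pre-shifted
# into the top 16 bits of a 64-bit word, matching the table above:
#
#   my @rem_4bit = map { $_ << 48 }
#       0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
#       0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0;
#
#   sub gmult_4bit_ref {
#       my ($Xhi, $Xlo, @Htbl) = @_;      # Xi halves as the ldq's see them
#       my ($Zhi, $Zlo) = (0, 0);
#       for my $half ($Xlo, $Xhi) {       # low register half first
#           for my $j (reverse 0 .. 7) {  # byte 7 first, as extbl does
#               my $byte = ($half >> 8*$j) & 0xff;
#               for my $nib ($byte & 0x0f, $byte >> 4) {  # low nibble first
#                   my $rem = $Zlo & 0x0f;   # bits dropped off the low end
#                   $Zlo = (($Zhi & 0xf) << 60) | ($Zlo >> 4);
#                   $Zhi = ($Zhi >> 4) ^ $rem_4bit[$rem];  # reduce mod GCM poly
#                   $Zhi ^= $Htbl[$nib][0];  # hi word, offset 0 in the asm
#                   $Zlo ^= $Htbl[$nib][1];  # lo word, offset 8
#               }
#           }
#       }
#       return ($Zhi, $Zlo);  # still needs the byte-swap epilogue above
#   }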