#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC FPU.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300	9.78/+30%
# PPC74x0		6.92/+50%
# PPC970		6.03/+80%
# POWER7		3.50/+30%
# POWER8		3.75/+10%

# Overview of this generator
# --------------------------
# The script appends PowerPC assembly text to $code through interpolating
# heredocs, evaluates every `...` span in the accumulated text as a Perl
# expression, and prints the result to STDOUT, which is re-opened below
# as a pipe into ppc-xlate.pl.
#
# Heredoc variables are interpolated at the moment each heredoc is
# appended, so by the time the final eval pass runs the `...` spans hold
# plain arithmetic (e.g. `8*4+(4^0)`).
#
# Invocation: <flavour> <output>, both optional as far as the parsing
# below is concerned, but a flavour matching neither /64/ nor /32/ is
# rejected with die.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Select pointer size and the matching compare/store-update/load/store
# mnemonics.  $LRSAVE is the offset, relative to the caller's stack
# pointer, at which the generated code stores the link register
# (see the `$FRAME+$LRSAVE` / `$LOCALS+$LRSAVE` stores below).
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# 4 for flavours ending in "le", 0 otherwise.  Besides acting as a
# boolean it is XOR-ed into byte offsets -- (0^$LITTLE_ENDIAN) and
# (4^$LITTLE_ENDIAN) -- to pick one 32-bit half of an 8-byte slot, so
# the same stw/lwz lines address opposite halves on the two endiannesses.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

# Indexed word load used for key and input: plain lwzx on little-endian
# flavours, byte-reversing lwbrx otherwise.
$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";

# Locate ppc-xlate.pl next to this script or in ../../perlasm relative
# to it; $dir is the directory part of $0 (including the trailing
# separator) when $0 contains one.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is fed to ppc-xlate.pl, which is given
# the flavour and the (quoted) output name.
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

# $LOCALS is the whole (minimal) frame of poly1305_init_fpu and the
# offset of the scratch area inside poly1305_blocks_fpu's frame.
# $FRAME is the poly1305_blocks_fpu frame: $LOCALS, then 6*8 bytes of
# scratch (slots `$LOCALS+8*0` .. `$LOCALS+8*4` are the ones referenced
# below), then 18*8 bytes in which f14..f31 are saved.
$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+6*8+18*8;

my $sp="r1";

# Integer registers.  The four arguments are r3..r6.  $in0..$in3 are
# r7..r10, $i1/$i2 are r11/r12, and $i3 is r6, i.e. the same register
# as $padbit: poly1305_blocks_fpu reads $padbit (oris) before it
# overwrites r6 with `li $i3,12`.
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));

# Floating-point registers, f0..f31 in declaration order.
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
# borrowings
# These names alias the $c* registers above rather than owning registers
# of their own.  $r3lo/$r3hi/$s1lo/$s1hi share registers with
# $c0lo/$c0hi/$c1lo/$c1hi, which is why the loop body re-loads them from
# the context ("reload constants") after the $c* values have been used.
# $x0..$x3 and $y0..$y3 likewise overlap $c1*/$c2*/$c3*.
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);

# poly1305_init_fpu and poly1305_blocks_fpu.
#
# Context layout as used by the stores/loads below, in 8-byte slots
# (labels are the ones used in the assembly comments):
#    0.. 3  [biased] hash value
#    4..11  r0lo,r0hi,r1lo,r1hi,r2lo,r2hi,r3lo,r3hi
#   12..17  s1lo,s1hi,s2lo,s2hi,s3lo,s3hi
#   18..23  copies of the first six LPICmeup constants
# During init, slots 4..7 and 12..15 are also used as temporaries before
# receiving their final contents.
#
# init: LPICmeup leaves the address of the constant table in $len; the
# key pointer in $inp may be NULL, in which case only the hash value is
# initialised (branch to Lno_key).  Always returns 0 in r3.
#
# blocks: $len is a byte count; it is shifted right by 4 to get the
# number of 16-byte blocks and the function returns immediately if that
# is zero.  FPSCR is replaced for the duration of the computation and
# restored before return.
$code.=<<___;
.machine	"any"
.text

.globl	.poly1305_init_fpu
.align	6
.poly1305_init_fpu:
	$STU	$sp,-$LOCALS($sp)		# minimal frame
	mflr	$padbit
	$PUSH	$padbit,`$LOCALS+$LRSAVE`($sp)

	bl	LPICmeup

	xor	r0,r0,r0
	mtlr	$padbit				# restore lr

	lfd	$two0,8*0($len)			# load constants
	lfd	$two32,8*1($len)
	lfd	$two64,8*2($len)
	lfd	$two96,8*3($len)
	lfd	$two130,8*4($len)
	lfd	$five_two130,8*5($len)

	stfd	$two0,8*0($ctx)			# initial hash value, biased 0
	stfd	$two32,8*1($ctx)
	stfd	$two64,8*2($ctx)
	stfd	$two96,8*3($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key

	lfd	$h3lo,8*13($len)		# new fpscr
	mffs	$h3hi				# old fpscr

	stfd	$two0,8*4($ctx)			# key "template"
	stfd	$two32,8*5($ctx)
	stfd	$two64,8*6($ctx)
	stfd	$two96,8*7($ctx)

	li	$in1,4
	li	$in2,8
	li	$in3,12
	$LWXLE	$in0,0,$inp			# load key
	$LWXLE	$in1,$in1,$inp
	$LWXLE	$in2,$in2,$inp
	$LWXLE	$in3,$in3,$inp

	lis	$i1,0xf000			#   0xf0000000
	ori	$i2,$i1,3			#   0xf0000003
	andc	$in0,$in0,$i1			# &=0x0fffffff
	andc	$in1,$in1,$i2			# &=0x0ffffffc
	andc	$in2,$in2,$i2
	andc	$in3,$in3,$i2

	stw	$in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)	# fill "template"
	stw	$in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)

	mtfsf	255,$h3lo			# fpscr
	stfd	$two0,8*18($ctx)		# copy constants to context
	stfd	$two32,8*19($ctx)
	stfd	$two64,8*20($ctx)
	stfd	$two96,8*21($ctx)
	stfd	$two130,8*22($ctx)
	stfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*4($ctx)			# load [biased] key
	lfd	$h1lo,8*5($ctx)
	lfd	$h2lo,8*6($ctx)
	lfd	$h3lo,8*7($ctx)

	fsub	$h0lo,$h0lo,$two0		# r0
	fsub	$h1lo,$h1lo,$two32		# r1
	fsub	$h2lo,$h2lo,$two64		# r2
	fsub	$h3lo,$h3lo,$two96		# r3

	lfd	$two0,8*6($len)			# more constants
	lfd	$two32,8*7($len)
	lfd	$two64,8*8($len)
	lfd	$two96,8*9($len)

	fmul	$h1hi,$h1lo,$five_two130	# s1
	fmul	$h2hi,$h2lo,$five_two130	# s2
	stfd	$h3hi,8*15($ctx)		# borrow slot for original fpscr
	fmul	$h3hi,$h3lo,$five_two130	# s3

	fadd	$h0hi,$h0lo,$two0
	stfd	$h1hi,8*12($ctx)		# put aside for now
	fadd	$h1hi,$h1lo,$two32
	stfd	$h2hi,8*13($ctx)
	fadd	$h2hi,$h2lo,$two64
	stfd	$h3hi,8*14($ctx)
	fadd	$h3hi,$h3lo,$two96

	fsub	$h0hi,$h0hi,$two0
	fsub	$h1hi,$h1hi,$two32
	fsub	$h2hi,$h2hi,$two64
	fsub	$h3hi,$h3hi,$two96

	lfd	$two0,8*10($len)		# more constants
	lfd	$two32,8*11($len)
	lfd	$two64,8*12($len)

	fsub	$h0lo,$h0lo,$h0hi
	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h0hi,8*5($ctx)			# r0hi
	stfd	$h1hi,8*7($ctx)			# r1hi
	stfd	$h2hi,8*9($ctx)			# r2hi
	stfd	$h3hi,8*11($ctx)		# r3hi

	stfd	$h0lo,8*4($ctx)			# r0lo
	stfd	$h1lo,8*6($ctx)			# r1lo
	stfd	$h2lo,8*8($ctx)			# r2lo
	stfd	$h3lo,8*10($ctx)		# r3lo

	lfd	$h1lo,8*12($ctx)		# s1
	lfd	$h2lo,8*13($ctx)		# s2
	lfd	$h3lo,8*14($ctx)		# s3
	lfd	$h0lo,8*15($ctx)		# pull original fpscr

	fadd	$h1hi,$h1lo,$two0
	fadd	$h2hi,$h2lo,$two32
	fadd	$h3hi,$h3lo,$two64

	fsub	$h1hi,$h1hi,$two0
	fsub	$h2hi,$h2hi,$two32
	fsub	$h3hi,$h3hi,$two64

	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h1hi,8*13($ctx)		# s1hi
	stfd	$h2hi,8*15($ctx)		# s2hi
	stfd	$h3hi,8*17($ctx)		# s3hi

	stfd	$h1lo,8*12($ctx)		# s1lo
	stfd	$h2lo,8*14($ctx)		# s2lo
	stfd	$h3lo,8*16($ctx)		# s3lo

	mtfsf	255,$h0lo			# restore fpscr
Lno_key:
	xor	r3,r3,r3
	addi	$sp,$sp,$LOCALS
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,2,0
.size	.poly1305_init_fpu,.-.poly1305_init_fpu

.globl	.poly1305_blocks_fpu
.align	4
.poly1305_blocks_fpu:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	stfd	f14,`$FRAME-8*18`($sp)
	stfd	f15,`$FRAME-8*17`($sp)
	stfd	f16,`$FRAME-8*16`($sp)
	stfd	f17,`$FRAME-8*15`($sp)
	stfd	f18,`$FRAME-8*14`($sp)
	stfd	f19,`$FRAME-8*13`($sp)
	stfd	f20,`$FRAME-8*12`($sp)
	stfd	f21,`$FRAME-8*11`($sp)
	stfd	f22,`$FRAME-8*10`($sp)
	stfd	f23,`$FRAME-8*9`($sp)
	stfd	f24,`$FRAME-8*8`($sp)
	stfd	f25,`$FRAME-8*7`($sp)
	stfd	f26,`$FRAME-8*6`($sp)
	stfd	f27,`$FRAME-8*5`($sp)
	stfd	f28,`$FRAME-8*4`($sp)
	stfd	f29,`$FRAME-8*3`($sp)
	stfd	f30,`$FRAME-8*2`($sp)
	stfd	f31,`$FRAME-8*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	xor	r0,r0,r0
	li	$in3,1
	mtctr	$len
	neg	$len,$len
	stw	r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)

	lfd	$two0,8*18($ctx)		# load constants
	lfd	$two32,8*19($ctx)
	lfd	$two64,8*20($ctx)
	lfd	$two96,8*21($ctx)
	lfd	$two130,8*22($ctx)
	lfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*0($ctx)			# load [biased] hash value
	lfd	$h1lo,8*1($ctx)
	lfd	$h2lo,8*2($ctx)
	lfd	$h3lo,8*3($ctx)

	stfd	$two0,`$LOCALS+8*0`($sp)	# input "template"
	oris	$in3,$padbit,`(1023+52+96)<<4`
	stfd	$two32,`$LOCALS+8*1`($sp)
	stfd	$two64,`$LOCALS+8*2`($sp)
	stw	$in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)

	li	$i1,4
	li	$i2,8
	li	$i3,12
	$LWXLE	$in0,0,$inp			# load input
	$LWXLE	$in1,$i1,$inp
	$LWXLE	$in2,$i2,$inp
	$LWXLE	$in3,$i3,$inp
	addi	$inp,$inp,16

	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	mffs	$x0				# original fpscr
	lfd	$x1,`$LOCALS+8*4`($sp)		# new fpscr
	lfd	$r0lo,8*4($ctx)			# load key
	lfd	$r0hi,8*5($ctx)
	lfd	$r1lo,8*6($ctx)
	lfd	$r1hi,8*7($ctx)
	lfd	$r2lo,8*8($ctx)
	lfd	$r2hi,8*9($ctx)
	lfd	$r3lo,8*10($ctx)
	lfd	$r3hi,8*11($ctx)
	lfd	$s1lo,8*12($ctx)
	lfd	$s1hi,8*13($ctx)
	lfd	$s2lo,8*14($ctx)
	lfd	$s2hi,8*15($ctx)
	lfd	$s3lo,8*16($ctx)
	lfd	$s3hi,8*17($ctx)

	stfd	$x0,`$LOCALS+8*4`($sp)		# save original fpscr
	mtfsf	255,$x1

	addic	$len,$len,1
	addze	r0,r0
	slwi.	r0,r0,4
	sub	$inp,$inp,r0			# conditional rewind

	lfd	$x0,`$LOCALS+8*0`($sp)
	lfd	$x1,`$LOCALS+8*1`($sp)
	lfd	$x2,`$LOCALS+8*2`($sp)
	lfd	$x3,`$LOCALS+8*3`($sp)

	fsub	$h0lo,$h0lo,$two0		# de-bias hash value
	$LWXLE	$in0,0,$inp			# modulo-scheduled input load
	fsub	$h1lo,$h1lo,$two32
	$LWXLE	$in1,$i1,$inp
	fsub	$h2lo,$h2lo,$two64
	$LWXLE	$in2,$i2,$inp
	fsub	$h3lo,$h3lo,$two96
	$LWXLE	$in3,$i3,$inp

	fsub	$x0,$x0,$two0			# de-bias input
	addi	$inp,$inp,16
	fsub	$x1,$x1,$two32
	fsub	$x2,$x2,$two64
	fsub	$x3,$x3,$two96

	fadd	$x0,$x0,$h0lo			# accumulate input
	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x1,$x1,$h1lo
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x2,$x2,$h2lo
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x3,$x3,$h3lo
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	b	Lentry

.align	4
Loop:
	fsub	$y0,$y0,$two0			# de-bias input
	addic	$len,$len,1
	fsub	$y1,$y1,$two32
	addze	r0,r0
	fsub	$y2,$y2,$two64
	slwi.	r0,r0,4
	fsub	$y3,$y3,$two96
	sub	$inp,$inp,r0			# conditional rewind

	fadd	$h0lo,$h0lo,$y0			# accumulate input
	fadd	$h0hi,$h0hi,$y1
	fadd	$h2lo,$h2lo,$y2
	fadd	$h2hi,$h2hi,$y3

	######################################### base 2^48 -> base 2^32
	fadd	$c1lo,$h1lo,$two64
	$LWXLE	$in0,0,$inp			# modulo-scheduled input load
	fadd	$c1hi,$h1hi,$two64
	$LWXLE	$in1,$i1,$inp
	fadd	$c3lo,$h3lo,$two130
	$LWXLE	$in2,$i2,$inp
	fadd	$c3hi,$h3hi,$two130
	$LWXLE	$in3,$i3,$inp
	fadd	$c0lo,$h0lo,$two32
	addi	$inp,$inp,16
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96

	fsub	$c1lo,$c1lo,$two64
	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	fsub	$c1hi,$c1hi,$two64
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3lo,$c3lo,$two130
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3hi,$c3hi,$two130
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	lfd	$s1lo,8*12($ctx)		# reload constants
	fadd	$x3,$h3lo,$h3hi
	lfd	$s1hi,8*13($ctx)
	fadd	$x2,$h2lo,$h2hi
	lfd	$r3lo,8*10($ctx)
	fadd	$x0,$h0lo,$h0hi
	lfd	$r3hi,8*11($ctx)
Lentry:
	fmul	$h0lo,$s3lo,$x1
	fmul	$h0hi,$s3hi,$x1
	fmul	$h2lo,$r1lo,$x1
	fmul	$h2hi,$r1hi,$x1
	fmul	$h1lo,$r0lo,$x1
	fmul	$h1hi,$r0hi,$x1
	fmul	$h3lo,$r2lo,$x1
	fmul	$h3hi,$r2hi,$x1

	fmadd	$h0lo,$s1lo,$x3,$h0lo
	fmadd	$h0hi,$s1hi,$x3,$h0hi
	fmadd	$h2lo,$s3lo,$x3,$h2lo
	fmadd	$h2hi,$s3hi,$x3,$h2hi
	fmadd	$h1lo,$s2lo,$x3,$h1lo
	fmadd	$h1hi,$s2hi,$x3,$h1hi
	fmadd	$h3lo,$r0lo,$x3,$h3lo
	fmadd	$h3hi,$r0hi,$x3,$h3hi

	fmadd	$h0lo,$s2lo,$x2,$h0lo
	fmadd	$h0hi,$s2hi,$x2,$h0hi
	fmadd	$h2lo,$r0lo,$x2,$h2lo
	fmadd	$h2hi,$r0hi,$x2,$h2hi
	fmadd	$h1lo,$s3lo,$x2,$h1lo
	fmadd	$h1hi,$s3hi,$x2,$h1hi
	fmadd	$h3lo,$r1lo,$x2,$h3lo
	fmadd	$h3hi,$r1hi,$x2,$h3hi

	fmadd	$h0lo,$r0lo,$x0,$h0lo
	lfd	$y0,`$LOCALS+8*0`($sp)		# load [biased] input
	fmadd	$h0hi,$r0hi,$x0,$h0hi
	lfd	$y1,`$LOCALS+8*1`($sp)
	fmadd	$h2lo,$r2lo,$x0,$h2lo
	lfd	$y2,`$LOCALS+8*2`($sp)
	fmadd	$h2hi,$r2hi,$x0,$h2hi
	lfd	$y3,`$LOCALS+8*3`($sp)
	fmadd	$h1lo,$r1lo,$x0,$h1lo
	fmadd	$h1hi,$r1hi,$x0,$h1hi
	fmadd	$h3lo,$r3lo,$x0,$h3lo
	fmadd	$h3hi,$r3hi,$x0,$h3hi

	bdnz	Loop

	######################################### base 2^48 -> base 2^32
	fadd	$c0lo,$h0lo,$two32
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96
	fadd	$c1lo,$h1lo,$two64
	fadd	$c1hi,$h1hi,$two64
	fadd	$c3lo,$h3lo,$two130
	fadd	$c3hi,$h3hi,$two130

	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96
	fsub	$c1lo,$c1lo,$two64
	fsub	$c1hi,$c1hi,$two64
	fsub	$c3lo,$c3lo,$two130
	fsub	$c3hi,$c3hi,$two130

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	fadd	$x3,$h3lo,$h3hi
	fadd	$x2,$h2lo,$h2hi
	fadd	$x0,$h0lo,$h0hi

	lfd	$h0lo,`$LOCALS+8*4`($sp)	# pull saved fpscr
	fadd	$x1,$x1,$two32			# bias
	fadd	$x3,$x3,$two96
	fadd	$x2,$x2,$two64
	fadd	$x0,$x0,$two0

	stfd	$x1,8*1($ctx)			# store [biased] hash value
	stfd	$x3,8*3($ctx)
	stfd	$x2,8*2($ctx)
	stfd	$x0,8*0($ctx)

	mtfsf	255,$h0lo			# restore original fpscr
	lfd	f14,`$FRAME-8*18`($sp)
	lfd	f15,`$FRAME-8*17`($sp)
	lfd	f16,`$FRAME-8*16`($sp)
	lfd	f17,`$FRAME-8*15`($sp)
	lfd	f18,`$FRAME-8*14`($sp)
	lfd	f19,`$FRAME-8*13`($sp)
	lfd	f20,`$FRAME-8*12`($sp)
	lfd	f21,`$FRAME-8*11`($sp)
	lfd	f22,`$FRAME-8*10`($sp)
	lfd	f23,`$FRAME-8*9`($sp)
	lfd	f24,`$FRAME-8*8`($sp)
	lfd	f25,`$FRAME-8*7`($sp)
	lfd	f26,`$FRAME-8*6`($sp)
	lfd	f27,`$FRAME-8*5`($sp)
	lfd	f28,`$FRAME-8*4`($sp)
	lfd	f29,`$FRAME-8*3`($sp)
	lfd	f30,`$FRAME-8*2`($sp)
	lfd	f31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,4,0
.size	.poly1305_blocks_fpu,.-.poly1305_blocks_fpu
___
# poly1305_emit_fpu, generated inside its own Perl block so that the
# names below stay local to it.
#
# $mac and $nonce are simply the second and third argument registers
# ($inp and $len, i.e. r4 and r5) under new names.  $h0..$h4 are r7..r11
# and $d0..$d3 are r28..r31; the generated code saves r28..r31 on entry
# and restores them before returning.  $mask is r0.
#
# The lexical $FRAME declared here shadows the file-level $FRAME for the
# heredocs in this block only; because heredocs interpolate when they
# are appended, the emitted text already contains this smaller value.
#
# Shape of the generated code: load the four hash slots from the
# context as 32-bit halves, mask off the exponent bits, fold the bits
# above the low two of the top word back in, then take one of two
# carry-propagation/compare/select/add-nonce sequences depending on
# $SIZE_T, and finally store four 32-bit words to $mac (plain stw on
# little-endian flavours, byte-reversing stwbrx otherwise).
{
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..11,28..31));
my $mask = "r0";
my $FRAME = (6+4)*$SIZE_T;

$code.=<<___;
.globl	.poly1305_emit_fpu
.align	4
.poly1305_emit_fpu:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)	# load hash
	lwz	$h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)

	lis	$mask,0xfff0
	andc	$d0,$d0,$mask			# mask exponent
	andc	$d1,$d1,$mask
	andc	$d2,$d2,$mask
	andc	$d3,$d3,$mask			# can be partially reduced...
	li	$mask,3

	srwi	$padbit,$d3,2			# ... so reduce
	and	$h4,$d3,$mask
	andc	$d3,$d3,$mask
	add	$d3,$d3,$padbit
___
						if ($SIZE_T==4) {
$code.=<<___;
	addc	$h0,$h0,$d3
	adde	$h1,$h1,$d0
	adde	$h2,$h2,$d1
	adde	$h3,$h3,$d2
	addze	$h4,$h4

	addic	$d0,$h0,5			# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2			# did it carry/borrow?
	neg	$mask,$mask
	srawi	$mask,$mask,31			# mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)			# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0			# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
						} else {
$code.=<<___;
	add	$h0,$h0,$d3
	add	$h1,$h1,$d0
	add	$h2,$h2,$d1
	add	$h3,$h3,$d2

	srdi	$d0,$h0,32
	add	$h1,$h1,$d0
	srdi	$d1,$h1,32
	add	$h2,$h2,$d1
	srdi	$d2,$h2,32
	add	$h3,$h3,$d2
	srdi	$d3,$h3,32
	add	$h4,$h4,$d3

	insrdi	$h0,$h1,32,0
	insrdi	$h2,$h3,32,0

	addic	$d0,$h0,5			# compare to modulus
	addze	$d1,$h2
	addze	$d2,$h4

	srdi	$mask,$d2,2			# did it carry/borrow?
	neg	$mask,$mask
	sradi	$mask,$mask,63			# mask
	ld	$d2,0($nonce)			# load nonce
	ld	$d3,8($nonce)

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h2,$h2,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h2,$h2,$d1
___
# Only for flavours not ending in "le": swap the 32-bit halves of each
# 64-bit nonce load before the addition.
$code.=<<___	if (!$LITTLE_ENDIAN);
	rotldi	$d2,$d2,32			# flip nonce words
	rotldi	$d3,$d3,32
___
$code.=<<___;
	addc	$h0,$h0,$d2			# accumulate nonce
	adde	$h2,$h2,$d3

	srdi	$h1,$h0,32
	srdi	$h3,$h2,32
___
						}
# Exactly one of the next two heredocs is appended, depending on
# endianness; both store $h0..$h3 at offsets 0,4,8,12 from $mac.
$code.=<<___	if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)			# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac			# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit_fpu,.-.poly1305_emit_fpu
___
}
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup returns, in $len, the address of the constant table that
# follows it.  The code and its trailer occupy 9 words (six instructions,
# one .long and eight .byte values) and `.space 64-9*4` pads that to 64
# bytes, so the table starts 64 bytes after the LPICmeup label.  The
# second mflr reads the address set by bcl, which is 8 bytes after the
# label, hence the `64-8` added to it.  The caller's link register is
# carried in r0 across the bcl and put back with mtlr before returning.
#
# Table entries are 8 bytes each; poly1305_init_fpu reads entries 0..13,
# entry 13 being the value it loads into FPSCR.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$len	# vvvvvv "distance" between . and 1st data entry
	addi	$len,$len,`64-8`	# borrow $len
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x4330000000000000		# 2^(52+0)
.quad	0x4530000000000000		# 2^(52+32)
.quad	0x4730000000000000		# 2^(52+64)
.quad	0x4930000000000000		# 2^(52+96)
.quad	0x4b50000000000000		# 2^(52+130)

.quad	0x37f4000000000000		# 5/2^130

.quad	0x4430000000000000		# 2^(52+16+0)
.quad	0x4630000000000000		# 2^(52+16+32)
.quad	0x4830000000000000		# 2^(52+16+64)
.quad	0x4a30000000000000		# 2^(52+16+96)
.quad	0x3e30000000000000		# 2^(52+16+0-96)
.quad	0x4030000000000000		# 2^(52+16+32-96)
.quad	0x4230000000000000		# 2^(52+16+64-96)

.quad	0x0000000000000001		# fpscr: truncate, no exceptions
.asciz	"Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Replace every `expr` span in the accumulated text with the result of
# evaluating expr as Perl, then hand the text to ppc-xlate.pl through the
# STDOUT pipe.  The close is checked so that a failure of the pipe is
# reported rather than silently ignored.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";