#! /usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8 on the other hand seems to trip on its
# own shoelaces when handling longer carry chains. As base 2^51 has
# just single-carry pairs, it's 25% faster than base 2^64. Since
# PPC970 is pretty old, the base 2^64 implementation is not engaged.
# Comparison to compiler-generated code is complicated by the fact
# that not all compilers support 128-bit integers. When the compiler
# doesn't, as with xlc, this module delivers more than 2x improvement,
# and when it does, improvements from 12% to 30% were measured...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
if (0) {
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
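# The loop below adds a[0..3]*b[i] for i=1..3 on top of the b[0]
# products above, alternating low and high halves so that every adde
# folds the carry of the previous one. For reference only -- a minimal
# C sketch of the same schoolbook multiplication, assuming a compiler
# with gcc-style unsigned __int128 (names are illustrative, not part
# of this module):
#
#	#include <stdint.h>
#
#	static void fe64_mul_ref(uint64_t acc[8],
#	                         const uint64_t a[4], const uint64_t b[4])
#	{
#	    int i, j;
#	    for (i = 0; i < 8; i++) acc[i] = 0;
#	    for (i = 0; i < 4; i++) {
#	        uint64_t carry = 0;
#	        for (j = 0; j < 4; j++) {
#	            unsigned __int128 t = (unsigned __int128)a[j]*b[i]
#	                                + acc[i+j] + carry;
#	            acc[i+j] = (uint64_t)t;
#	            carry = (uint64_t)(t >> 64);
#	        }
#	        acc[i+4] = carry;
#	    }
#	}
#
# The assembly computes the same acc[0..7], but interleaves mulld and
# mulhdu with the carry chain, presumably to keep the multiplier busy.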
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}
$code.=<<___;
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	#  | | | | | |a1*a0| |
	#  | | | | |a2*a0| | |
	#  | |a3*a2|a3*a0| | | |
	#  | | | |a2*a1| | | |
	#  | | |a3*a1| | | | |
	# *| | | | | | | | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes acc[x]
	#
	# The "can't overflow" remarks below mark carries into the high
	# part of a multiplication result, which cannot overflow because
	# that high part can never be all ones.
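	#
	# Concretely, the largest possible product of two 64-bit limbs
	# is (2^64-1)*(2^64-1) = 2^128 - 2^65 + 1, whose upper half is
	# at most 2^64-2, so absorbing one extra carry bit cannot wrap.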

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap,$ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666
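
# All fe64 reductions above and below rely on the same identity: with
# p = 2^255-19, 2^256 == 2*19 == 38 (mod p), so the upper 256 bits of
# a 512-bit product are folded back in as acc[4..7]*38, and a leftover
# carry becomes a masked final addition of 38. x25519_fe64_add and
# x25519_fe64_sub handle their single carry or borrow the same way,
# while x25519_fe64_tobytes performs the final freeze by adding 19,
# dropping bit 255 and subtracting 19 back under a mask.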

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1

	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1

	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)

	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared

	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

___
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}
####################################################### base 2^51
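# Here a field element is represented by five 51-bit limbs f[0..4],
# f = f[0] + f[1]*2^51 + f[2]*2^102 + f[3]*2^153 + f[4]*2^204, and
# limb products are accumulated as full 128-bit (lo,hi) register
# pairs until the common .Lfe51_reduce tail squeezes them back to
# 51 bits. A product that lands past limb 4 wraps around multiplied
# by 19, because 2^255 == 19 (mod 2^255-19); that is why the upper
# limbs of a[] are multiplied by 19 on the fly below.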
{
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	mulhdu	$h4hi,$a4,$bi
	ld	$ap,8($bp)
	mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,@a[3],$bi
	mulhdu	$t1,@a[3],$bi
	ld	$ap,`8*($i+1)`($bp)
	mulli	@a[3],@a[3],19
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	srdi	$t1,$h0lo,51
	and	$a0,$h0lo,$mask
	insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	addc	$h1lo,$h1lo,$t1
	addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	srdi	$t1,$h1lo,51
	and	$a1,$h1lo,$mask
	insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
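# x25519_fe51_sqr specializes the above: each cross product a[i]*a[j],
# i<j, is needed twice, so it is computed once against a pre-doubled
# operand (a[0]*2, a[1]*2, ...), the squares land directly, and a[3],
# a[4] are folded with the factor 19 just as in multiplication. The
# accumulated sum is then handed to the same .Lfe51_reduce tail.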
{
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
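
# A typical perlasm invocation, assuming this file is saved as
# x25519-ppc64.pl alongside ppc-xlate.pl (flavour first, output file
# last, matching the argument handling at the top):
#
#	perl x25519-ppc64.pl linux64le x25519-ppc64.s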