#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# POWER7		+260-530%
# POWER8		+220-340%

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp="r1";

{
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64-byte interval, similar to
#	1111222233334444
#	1234123412341234
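#
# In other words (a worked example, for orientation only): within each
# 64-point table, byte i of point j ends up at offset i*64 + j, so
# ecp_nistz_gather_w7 can assemble one point by reading a single byte
# from each of 64 consecutive 64-byte rows. The table is held as 16
# 32-bit words per 64-byte point, hence row $i of the loops below emits
# for every point $j the byte
#
#	($tbl[$j*16 + $i/4] >> (($i%4)*8)) & 0xff
#
# ("use integer" above makes $i/4 an integer division).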
for(1..37) {
    @tbl = splice(@arr,0,64*16);
    for($i=0;$i<64;$i++) {
	undef @line;
	for($j=0;$j<64;$j++) {
	    push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
	}
	$code.=".byte\t";
	$code.=join(',',map { sprintf "0x%02x",$_} @line);
	$code.="\n";
    }
}

$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0, 0($bp)
	ld	$acc1,8($ap)
	ld	$t1, 8($bp)
	ld	$acc2,16($ap)
	ld	$t2, 16($bp)
	ld	$acc3,24($ap)
	ld	$t3, 24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add
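
# A note ahead of the halving below (an informal summary of
# __ecp_nistz256_div_by_2): a/2 mod p is computed branch-free as
# (a + (a odd ? p : 0)) >> 1. Adding the odd modulus makes an odd
# input even, and the possible carry out of that addition is shifted
# back in as the 257th bit.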

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub
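
# Negation reuses the subtraction helper: the wrapper below zeroes the
# accumulator and points b at the input, so __ecp_nistz256_sub_from
# computes 0 - a and its borrow correction folds the modulus back in,
# yielding p - a (and 0 for a == 0).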

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0	# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
for($i=1;$i<4;$i++) {
	################################################################
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------
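	#
	# A worked restatement of the two subtractions this turns
	# into (with m = acc[0]): the top limb of the modulus is
	# 0xffffffff00000001 = 2^64 - 2^32 + 1, so
	#
	#	m*0xffffffff00000001 = (m<<64) + m - (m<<32),
	#
	# whose exact 128-bit value is what
	#
	#	subfc	t2,t0,m		# low  half, t0 = m<<32
	#	subfe	t3,t1,m		# high half, t1 = m>>32
	#
	# computes, to be added at limbs 2-3; the remaining m*2^96
	# term is the same t0/t1 pair added at limbs 0-1 of the
	# shifted accumulator ("+=acc[0]<<96").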

$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0	# accumulate low parts of multiplication
	mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
$code.=<<___	if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0	# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below mark carrying into high part of
	# multiplication result, which can't overflow, because it
	# can never be all ones.
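	#
	# (Spelled out: for 64-bit a and b the high half of a*b is at
	# most 0xfffffffffffffffe, reached only for a = b = 2^64-1,
	# so adding a carry-in of 1 to it can never wrap around.)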

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1	# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2	# accumulate high parts of multiplication
	addze	$t2,$t3		# can't overflow

	addc	$acc3,$acc3,$t0	# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $a0-$a3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.
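	#
	# Both steps are branch-free: the net borrow is expanded into
	# an all-ones/all-zero mask (t0 below), and "ret += modulus
	# AND mask" either undoes the subtraction exactly or adds
	# zero. Since p[0] = -1, adding the mask itself supplies the
	# low limb, and p[2] = 0 reduces to a plain addze.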

	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0		# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	not	$t0,$t0
	addze	$acc2,$acc2
	li	$t2,0
	adde	$acc3,$acc3,$poly3
	and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap,$t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void	ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
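# For reference, the resulting frame layout (a sketch, offsets in
# bytes from the callee's stack pointer; $FRAME = 64+32*4+12*8 = 288):
#
#	  0- 63		back chain and linkage area
#	 64- 95		S
#	 96-127		M
#	128-159		Zsqr
#	160-191		tmp0
#	192-287		saved r20-r31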
my ($rp_real,$ap_real) = map("r$_",(20,21));

$code.=<<___;
.globl	ecp_nistz256_point_double
.align	5
ecp_nistz256_point_double:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
.Ldouble_shortcut:
	ld	$acc0,32($ap)
	ld	$acc1,40($ap)
	ld	$acc2,48($ap)
	ld	$acc3,56($ap)
	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,64($ap)		# forward load for p256_sqr_mont
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);

	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);

	ld	$t0,0($ap_real)
	ld	$t1,8($ap_real)
	ld	$t2,16($ap_real)
	ld	$t3,24($ap_real)
	mr	$a0,$acc0		# put Zsqr aside for p256_sub
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);

	addi	$bp,$ap_real,0
	mr	$acc0,$a0		# restore Zsqr
	mr	$acc1,$a1
	mr	$acc2,$a2
	mr	$acc3,$a3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);

	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);

	ld	$bi,32($ap_real)
	ld	$a0,64($ap_real)
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$rp_real,64
	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);

	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);

	ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
	ld	$a0,$M+0($sp)
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);

	addi	$bp,$sp,$Zsqr
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);

	mr	$t0,$acc0		# duplicate M
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	mr	$a0,$acc0		# put M aside
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add
	mr	$t0,$a0			# restore M
	mr	$t1,$a1
	mr	$t2,$a2
	mr	$t3,$a3
	ld	$bi,0($ap_real)		# forward load for p256_mul_mont
	ld	$a0,$S+0($sp)
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);

	addi	$bp,$ap_real,0
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);

	addi	$rp,$rp_real,0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);

	addi	$bp,$sp,$tmp0
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);

	addi	$bp,$sp,$S
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);

	ld	$bi,$M($sp)
	mr	$a0,$acc0		# copy S
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$bp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);

	addi	$bp,$rp_real,32
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);

	mtlr	r0
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,2,0
	.long	0
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void	ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#				       const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	ld	$a0,0($ap_real)
	ld	$a1,8($ap_real)
	ld	$a2,16($ap_real)
	ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?
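
	# The three checks above implement, in ecp_nistz256.c terms:
	#
	#	if (is_equal(U1, U2) && !in1infty && !in2infty) {
	#		if (is_equal(S1, S2))
	#			goto .Ladd_double;	# same point
	#		else
	#			return infinity;	# P plus -P
	#	}
	#
	# so the fall-through below stores the point at infinity,
	# all zeros.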

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
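# What follows is a constant-time selection of the final result,
#
#	res = in2infty ? in1 : (in1infty ? in2 : computed result)
#
# done with the all-ones/all-zero masks prepared above (recall that
# $in1infty/$in2infty are -1 for *finite* inputs and 0 for points at
# infinity), i.e. per limb
#
#	t   = (in2 & ~$in1infty) | (res & $in1infty)
#	out = (in1 & ~$in2infty) | (t   & $in2infty)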
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void	ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#					      const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
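# $Z1sqr shares a slot with $S2 deliberately: Z1sqr's last use is as a
# source operand of the very multiplication that produces S2, so the
# alias saves one 256-bit temporary without an overlap hazard.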
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Hcub+0($sp)
	ld	$a1,$Hcub+8($sp)
	ld	$a2,$Hcub+16($sp)
	ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
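# The $i==32 case substitutes the implicit in2_z = 1 in Montgomery
# form, i.e. 2^256 mod p, which conveniently is expressible via the
# poly registers as (1, ~p[1], -1, ~p[3]) =
# (0x0000000000000001, 0xffffffff00000000,
#  0xffffffffffffffff, 0x00000000fffffffe).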
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#				 uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0	# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
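	#
	# Unlike the p256 modulus, the group order
	# 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
	# has no special form in its two low limbs, so a conventional
	# Montgomery multiplier ordk = -n^-1 mod 2^64 =
	# 0xccd1c8aaee00bc4f is used: each iteration computes
	# t4 = acc0*ordk, making acc + t4*n divisible by 2^64. The
	# "addic ...,-1 (discarded)" below only recreates the carry
	# out of the dropped low limb: acc0 + (t4*n mod 2^64) is
	# either 0 or 2^64, i.e. it carries exactly when acc0 != 0.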
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#				 uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	# "can't overflow" below mark carrying into high part of
	# multiplication result, which can't overflow, because it
	# can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1	# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2	# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0	# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
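
########################################################################
# The gather routines below are constant-time: index 0 denotes the
# point at infinity, and instead of branching on it they derive an
# all-ones/all-zero mask from the index,
#
#	neg	r0,index
#	sradi	r0,r0,63	# -1 if index>0, 0 if index==0
#
# fetch entry index-1 unconditionally, and AND the loaded words with
# the mask, so that index 0 returns all zeros.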

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush