1#! /usr/bin/env perl 2# Copyright 2014-2024 The OpenSSL Project Authors. All Rights Reserved. 3# Copyright (c) 2014, Intel Corporation. All Rights Reserved. 4# Copyright (c) 2015 CloudFlare, Inc. 5# 6# Licensed under the Apache License 2.0 (the "License"). You may not use 7# this file except in compliance with the License. You can obtain a copy 8# in the file LICENSE in the source distribution or at 9# https://www.openssl.org/source/license.html 10# 11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) 12# (1) Intel Corporation, Israel Development Center, Haifa, Israel 13# (2) University of Haifa, Israel 14# (3) CloudFlare, Inc. 15# 16# Reference: 17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with 18# 256 Bit Primes" 19 20# Further optimization by <appro@openssl.org>: 21# 22# this/original with/without -DECP_NISTZ256_ASM(*) 23# Opteron +15-49% +150-195% 24# Bulldozer +18-45% +175-240% 25# P4 +24-46% +100-150% 26# Westmere +18-34% +87-160% 27# Sandy Bridge +14-35% +120-185% 28# Ivy Bridge +11-35% +125-180% 29# Haswell +10-37% +160-200% 30# Broadwell +24-58% +210-270% 31# Atom +20-50% +180-240% 32# VIA Nano +50-160% +480-480% 33# 34# (*) "without -DECP_NISTZ256_ASM" refers to build with 35# "enable-ec_nistp_64_gcc_128"; 36# 37# Ranges denote minimum and maximum improvement coefficients depending 38# on benchmark. In "this/original" column lower coefficient is for 39# ECDSA sign, while in "with/without" - for ECDH key agreement, and 40# higher - for ECDSA sign, relatively fastest server-side operation. 41# Keep in mind that +100% means 2x improvement. 42 43# $output is the last argument if it looks like a file (it has an extension) 44# $flavour is the first argument if it doesn't look like a file 45$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 46$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 47 48$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 49 50$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 51( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 52( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 53die "can't locate x86_64-xlate.pl"; 54 55open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 56 or die "can't call $xlate: $!"; 57*STDOUT=*OUT; 58 59if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 60 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 61 $avx = ($1>=2.19) + ($1>=2.22); 62 $addx = ($1>=2.23); 63} 64 65if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 66 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 67 $avx = ($1>=2.09) + ($1>=2.10); 68 $addx = ($1>=2.10); 69} 70 71if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 72 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 73 $avx = ($1>=10) + ($1>=11); 74 $addx = ($1>=12); 75} 76 77if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { 78 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 79 $avx = ($ver>=3.0) + ($ver>=3.01); 80 $addx = ($ver>=3.03); 81} 82 83$code.=<<___; 84.text 85.extern OPENSSL_ia32cap_P 86 87# The polynomial 88.section .rodata align=4096 89.align 64 90.Lpoly: 91.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 92 93# 2^512 mod P precomputed for NIST P256 polynomial 94.LRR: 95.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd 96 97.LOne: 98.long 1,1,1,1,1,1,1,1 99.LTwo: 100.long 2,2,2,2,2,2,2,2 101.LThree: 102.long 3,3,3,3,3,3,3,3 103.LONE_mont: 104.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe 105 106# Constants for computations modulo ord(p256) 107.Lord: 108.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 109.LordK: 110.quad 0xccd1c8aaee00bc4f 111.previous 112___ 113 114{ 115################################################################################ 116# void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); 117 118my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); 119my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); 120my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); 121 122$code.=<<___; 123 124.globl ecp_nistz256_mul_by_2 125.type ecp_nistz256_mul_by_2,\@function,2 126.align 64 127ecp_nistz256_mul_by_2: 128.cfi_startproc 129 push %r12 130.cfi_push %r12 131 push %r13 132.cfi_push %r13 133.Lmul_by_2_body: 134 135 mov 8*0($a_ptr), $a0 136 xor $t4,$t4 137 mov 8*1($a_ptr), $a1 138 add $a0, $a0 # a0:a3+a0:a3 139 mov 8*2($a_ptr), $a2 140 adc $a1, $a1 141 mov 8*3($a_ptr), $a3 142 lea .Lpoly(%rip), $a_ptr 143 mov $a0, $t0 144 adc $a2, $a2 145 adc $a3, $a3 146 mov $a1, $t1 147 adc \$0, $t4 148 149 sub 8*0($a_ptr), $a0 150 mov $a2, $t2 151 sbb 8*1($a_ptr), $a1 152 sbb 8*2($a_ptr), $a2 153 mov $a3, $t3 154 sbb 8*3($a_ptr), $a3 155 sbb \$0, $t4 156 157 cmovc $t0, $a0 158 cmovc $t1, $a1 159 mov $a0, 8*0($r_ptr) 160 cmovc $t2, $a2 161 mov $a1, 8*1($r_ptr) 162 cmovc $t3, $a3 163 mov $a2, 8*2($r_ptr) 164 mov $a3, 8*3($r_ptr) 165 166 mov 0(%rsp),%r13 167.cfi_restore %r13 168 mov 8(%rsp),%r12 169.cfi_restore %r12 170 lea 16(%rsp),%rsp 171.cfi_adjust_cfa_offset -16 172.Lmul_by_2_epilogue: 173 ret 174.cfi_endproc 175.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 176 177################################################################################ 178# void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]); 179.globl ecp_nistz256_div_by_2 180.type ecp_nistz256_div_by_2,\@function,2 181.align 32 182ecp_nistz256_div_by_2: 183.cfi_startproc 184 push %r12 185.cfi_push %r12 186 push %r13 187.cfi_push %r13 188.Ldiv_by_2_body: 189 190 mov 8*0($a_ptr), $a0 191 mov 8*1($a_ptr), $a1 192 mov 8*2($a_ptr), $a2 193 mov $a0, $t0 194 mov 8*3($a_ptr), $a3 195 lea .Lpoly(%rip), $a_ptr 196 197 mov $a1, $t1 198 xor $t4, $t4 199 add 8*0($a_ptr), $a0 200 mov $a2, $t2 201 adc 8*1($a_ptr), $a1 202 adc 8*2($a_ptr), $a2 203 mov $a3, $t3 204 adc 8*3($a_ptr), $a3 205 adc \$0, $t4 206 xor $a_ptr, $a_ptr # borrow $a_ptr 207 test \$1, $t0 208 209 cmovz $t0, $a0 210 cmovz $t1, $a1 211 cmovz $t2, $a2 212 cmovz $t3, $a3 213 cmovz $a_ptr, $t4 214 215 mov $a1, $t0 # a0:a3>>1 216 shr \$1, $a0 217 shl \$63, $t0 218 mov $a2, $t1 219 shr \$1, $a1 220 or $t0, $a0 221 shl \$63, $t1 222 mov $a3, $t2 223 shr \$1, $a2 224 or $t1, $a1 225 shl \$63, $t2 226 shr \$1, $a3 227 shl \$63, $t4 228 or $t2, $a2 229 or $t4, $a3 230 231 mov $a0, 8*0($r_ptr) 232 mov $a1, 8*1($r_ptr) 233 mov $a2, 8*2($r_ptr) 234 mov $a3, 8*3($r_ptr) 235 236 mov 0(%rsp),%r13 237.cfi_restore %r13 238 mov 8(%rsp),%r12 239.cfi_restore %r12 240 lea 16(%rsp),%rsp 241.cfi_adjust_cfa_offset -16 242.Ldiv_by_2_epilogue: 243 ret 244.cfi_endproc 245.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 246 247################################################################################ 248# void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]); 249.globl ecp_nistz256_mul_by_3 250.type ecp_nistz256_mul_by_3,\@function,2 251.align 32 252ecp_nistz256_mul_by_3: 253.cfi_startproc 254 push %r12 255.cfi_push %r12 256 push %r13 257.cfi_push %r13 258.Lmul_by_3_body: 259 260 mov 8*0($a_ptr), $a0 261 xor $t4, $t4 262 mov 8*1($a_ptr), $a1 263 add $a0, $a0 # a0:a3+a0:a3 264 mov 8*2($a_ptr), $a2 265 adc $a1, $a1 266 mov 8*3($a_ptr), $a3 267 mov $a0, $t0 268 adc $a2, $a2 269 adc $a3, $a3 270 mov $a1, $t1 271 adc \$0, $t4 272 273 sub \$-1, $a0 274 mov $a2, $t2 275 sbb .Lpoly+8*1(%rip), $a1 276 sbb \$0, $a2 277 mov $a3, $t3 278 sbb .Lpoly+8*3(%rip), $a3 279 sbb \$0, $t4 280 281 cmovc $t0, $a0 282 cmovc $t1, $a1 283 cmovc $t2, $a2 284 cmovc $t3, $a3 285 286 xor $t4, $t4 287 add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3] 288 adc 8*1($a_ptr), $a1 289 mov $a0, $t0 290 adc 8*2($a_ptr), $a2 291 adc 8*3($a_ptr), $a3 292 mov $a1, $t1 293 adc \$0, $t4 294 295 sub \$-1, $a0 296 mov $a2, $t2 297 sbb .Lpoly+8*1(%rip), $a1 298 sbb \$0, $a2 299 mov $a3, $t3 300 sbb .Lpoly+8*3(%rip), $a3 301 sbb \$0, $t4 302 303 cmovc $t0, $a0 304 cmovc $t1, $a1 305 mov $a0, 8*0($r_ptr) 306 cmovc $t2, $a2 307 mov $a1, 8*1($r_ptr) 308 cmovc $t3, $a3 309 mov $a2, 8*2($r_ptr) 310 mov $a3, 8*3($r_ptr) 311 312 mov 0(%rsp),%r13 313.cfi_restore %r13 314 mov 8(%rsp),%r12 315.cfi_restore %r12 316 lea 16(%rsp),%rsp 317.cfi_adjust_cfa_offset -16 318.Lmul_by_3_epilogue: 319 ret 320.cfi_endproc 321.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 322 323################################################################################ 324# void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]); 325.globl ecp_nistz256_add 326.type ecp_nistz256_add,\@function,3 327.align 32 328ecp_nistz256_add: 329.cfi_startproc 330 push %r12 331.cfi_push %r12 332 push %r13 333.cfi_push %r13 334.Ladd_body: 335 336 mov 8*0($a_ptr), $a0 337 xor $t4, $t4 338 mov 8*1($a_ptr), $a1 339 mov 8*2($a_ptr), $a2 340 mov 8*3($a_ptr), $a3 341 lea .Lpoly(%rip), $a_ptr 342 343 add 8*0($b_ptr), $a0 344 adc 8*1($b_ptr), $a1 345 mov $a0, $t0 346 adc 8*2($b_ptr), $a2 347 adc 8*3($b_ptr), $a3 348 mov $a1, $t1 349 adc \$0, $t4 350 351 sub 8*0($a_ptr), $a0 352 mov $a2, $t2 353 sbb 8*1($a_ptr), $a1 354 sbb 8*2($a_ptr), $a2 355 mov $a3, $t3 356 sbb 8*3($a_ptr), $a3 357 sbb \$0, $t4 358 359 cmovc $t0, $a0 360 cmovc $t1, $a1 361 mov $a0, 8*0($r_ptr) 362 cmovc $t2, $a2 363 mov $a1, 8*1($r_ptr) 364 cmovc $t3, $a3 365 mov $a2, 8*2($r_ptr) 366 mov $a3, 8*3($r_ptr) 367 368 mov 0(%rsp),%r13 369.cfi_restore %r13 370 mov 8(%rsp),%r12 371.cfi_restore %r12 372 lea 16(%rsp),%rsp 373.cfi_adjust_cfa_offset -16 374.Ladd_epilogue: 375 ret 376.cfi_endproc 377.size ecp_nistz256_add,.-ecp_nistz256_add 378 379################################################################################ 380# void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]); 381.globl ecp_nistz256_sub 382.type ecp_nistz256_sub,\@function,3 383.align 32 384ecp_nistz256_sub: 385.cfi_startproc 386 push %r12 387.cfi_push %r12 388 push %r13 389.cfi_push %r13 390.Lsub_body: 391 392 mov 8*0($a_ptr), $a0 393 xor $t4, $t4 394 mov 8*1($a_ptr), $a1 395 mov 8*2($a_ptr), $a2 396 mov 8*3($a_ptr), $a3 397 lea .Lpoly(%rip), $a_ptr 398 399 sub 8*0($b_ptr), $a0 400 sbb 8*1($b_ptr), $a1 401 mov $a0, $t0 402 sbb 8*2($b_ptr), $a2 403 sbb 8*3($b_ptr), $a3 404 mov $a1, $t1 405 sbb \$0, $t4 406 407 add 8*0($a_ptr), $a0 408 mov $a2, $t2 409 adc 8*1($a_ptr), $a1 410 adc 8*2($a_ptr), $a2 411 mov $a3, $t3 412 adc 8*3($a_ptr), $a3 413 test $t4, $t4 414 415 cmovz $t0, $a0 416 cmovz $t1, $a1 417 mov $a0, 8*0($r_ptr) 418 cmovz $t2, $a2 419 mov $a1, 8*1($r_ptr) 420 cmovz $t3, $a3 421 mov $a2, 8*2($r_ptr) 422 mov $a3, 8*3($r_ptr) 423 424 mov 0(%rsp),%r13 425.cfi_restore %r13 426 mov 8(%rsp),%r12 427.cfi_restore %r12 428 lea 16(%rsp),%rsp 429.cfi_adjust_cfa_offset -16 430.Lsub_epilogue: 431 ret 432.cfi_endproc 433.size ecp_nistz256_sub,.-ecp_nistz256_sub 434 435################################################################################ 436# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); 437.globl ecp_nistz256_neg 438.type ecp_nistz256_neg,\@function,2 439.align 32 440ecp_nistz256_neg: 441.cfi_startproc 442 push %r12 443.cfi_push %r12 444 push %r13 445.cfi_push %r13 446.Lneg_body: 447 448 xor $a0, $a0 449 xor $a1, $a1 450 xor $a2, $a2 451 xor $a3, $a3 452 xor $t4, $t4 453 454 sub 8*0($a_ptr), $a0 455 sbb 8*1($a_ptr), $a1 456 sbb 8*2($a_ptr), $a2 457 mov $a0, $t0 458 sbb 8*3($a_ptr), $a3 459 lea .Lpoly(%rip), $a_ptr 460 mov $a1, $t1 461 sbb \$0, $t4 462 463 add 8*0($a_ptr), $a0 464 mov $a2, $t2 465 adc 8*1($a_ptr), $a1 466 adc 8*2($a_ptr), $a2 467 mov $a3, $t3 468 adc 8*3($a_ptr), $a3 469 test $t4, $t4 470 471 cmovz $t0, $a0 472 cmovz $t1, $a1 473 mov $a0, 8*0($r_ptr) 474 cmovz $t2, $a2 475 mov $a1, 8*1($r_ptr) 476 cmovz $t3, $a3 477 mov $a2, 8*2($r_ptr) 478 mov $a3, 8*3($r_ptr) 479 480 mov 0(%rsp),%r13 481.cfi_restore %r13 482 mov 8(%rsp),%r12 483.cfi_restore %r12 484 lea 16(%rsp),%rsp 485.cfi_adjust_cfa_offset -16 486.Lneg_epilogue: 487 ret 488.cfi_endproc 489.size ecp_nistz256_neg,.-ecp_nistz256_neg 490___ 491} 492{ 493my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 494my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 495my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); 496my ($poly1,$poly3)=($acc6,$acc7); 497 498$code.=<<___; 499################################################################################ 500# void ecp_nistz256_ord_mul_mont( 501# uint64_t res[4], 502# uint64_t a[4], 503# uint64_t b[4]); 504 505.globl ecp_nistz256_ord_mul_mont 506.type ecp_nistz256_ord_mul_mont,\@function,3 507.align 32 508ecp_nistz256_ord_mul_mont: 509.cfi_startproc 510___ 511$code.=<<___ if ($addx); 512 mov \$0x80100, %ecx 513 and OPENSSL_ia32cap_P+8(%rip), %ecx 514 cmp \$0x80100, %ecx 515 je .Lecp_nistz256_ord_mul_montx 516___ 517$code.=<<___; 518 push %rbp 519.cfi_push %rbp 520 push %rbx 521.cfi_push %rbx 522 push %r12 523.cfi_push %r12 524 push %r13 525.cfi_push %r13 526 push %r14 527.cfi_push %r14 528 push %r15 529.cfi_push %r15 530.Lord_mul_body: 531 532 mov 8*0($b_org), %rax 533 mov $b_org, $b_ptr 534 lea .Lord(%rip), %r14 535 mov .LordK(%rip), %r15 536 537 ################################# * b[0] 538 mov %rax, $t0 539 mulq 8*0($a_ptr) 540 mov %rax, $acc0 541 mov $t0, %rax 542 mov %rdx, $acc1 543 544 mulq 8*1($a_ptr) 545 add %rax, $acc1 546 mov $t0, %rax 547 adc \$0, %rdx 548 mov %rdx, $acc2 549 550 mulq 8*2($a_ptr) 551 add %rax, $acc2 552 mov $t0, %rax 553 adc \$0, %rdx 554 555 mov $acc0, $acc5 556 imulq %r15,$acc0 557 558 mov %rdx, $acc3 559 mulq 8*3($a_ptr) 560 add %rax, $acc3 561 mov $acc0, %rax 562 adc \$0, %rdx 563 mov %rdx, $acc4 564 565 ################################# First reduction step 566 mulq 8*0(%r14) 567 mov $acc0, $t1 568 add %rax, $acc5 # guaranteed to be zero 569 mov $acc0, %rax 570 adc \$0, %rdx 571 mov %rdx, $t0 572 573 sub $acc0, $acc2 574 sbb \$0, $acc0 # can't borrow 575 576 mulq 8*1(%r14) 577 add $t0, $acc1 578 adc \$0, %rdx 579 add %rax, $acc1 580 mov $t1, %rax 581 adc %rdx, $acc2 582 mov $t1, %rdx 583 adc \$0, $acc0 # can't overflow 584 585 shl \$32, %rax 586 shr \$32, %rdx 587 sub %rax, $acc3 588 mov 8*1($b_ptr), %rax 589 sbb %rdx, $t1 # can't borrow 590 591 add $acc0, $acc3 592 adc $t1, $acc4 593 adc \$0, $acc5 594 595 ################################# * b[1] 596 mov %rax, $t0 597 mulq 8*0($a_ptr) 598 add %rax, $acc1 599 mov $t0, %rax 600 adc \$0, %rdx 601 mov %rdx, $t1 602 603 mulq 8*1($a_ptr) 604 add $t1, $acc2 605 adc \$0, %rdx 606 add %rax, $acc2 607 mov $t0, %rax 608 adc \$0, %rdx 609 mov %rdx, $t1 610 611 mulq 8*2($a_ptr) 612 add $t1, $acc3 613 adc \$0, %rdx 614 add %rax, $acc3 615 mov $t0, %rax 616 adc \$0, %rdx 617 618 mov $acc1, $t0 619 imulq %r15, $acc1 620 621 mov %rdx, $t1 622 mulq 8*3($a_ptr) 623 add $t1, $acc4 624 adc \$0, %rdx 625 xor $acc0, $acc0 626 add %rax, $acc4 627 mov $acc1, %rax 628 adc %rdx, $acc5 629 adc \$0, $acc0 630 631 ################################# Second reduction step 632 mulq 8*0(%r14) 633 mov $acc1, $t1 634 add %rax, $t0 # guaranteed to be zero 635 mov $acc1, %rax 636 adc %rdx, $t0 637 638 sub $acc1, $acc3 639 sbb \$0, $acc1 # can't borrow 640 641 mulq 8*1(%r14) 642 add $t0, $acc2 643 adc \$0, %rdx 644 add %rax, $acc2 645 mov $t1, %rax 646 adc %rdx, $acc3 647 mov $t1, %rdx 648 adc \$0, $acc1 # can't overflow 649 650 shl \$32, %rax 651 shr \$32, %rdx 652 sub %rax, $acc4 653 mov 8*2($b_ptr), %rax 654 sbb %rdx, $t1 # can't borrow 655 656 add $acc1, $acc4 657 adc $t1, $acc5 658 adc \$0, $acc0 659 660 ################################## * b[2] 661 mov %rax, $t0 662 mulq 8*0($a_ptr) 663 add %rax, $acc2 664 mov $t0, %rax 665 adc \$0, %rdx 666 mov %rdx, $t1 667 668 mulq 8*1($a_ptr) 669 add $t1, $acc3 670 adc \$0, %rdx 671 add %rax, $acc3 672 mov $t0, %rax 673 adc \$0, %rdx 674 mov %rdx, $t1 675 676 mulq 8*2($a_ptr) 677 add $t1, $acc4 678 adc \$0, %rdx 679 add %rax, $acc4 680 mov $t0, %rax 681 adc \$0, %rdx 682 683 mov $acc2, $t0 684 imulq %r15, $acc2 685 686 mov %rdx, $t1 687 mulq 8*3($a_ptr) 688 add $t1, $acc5 689 adc \$0, %rdx 690 xor $acc1, $acc1 691 add %rax, $acc5 692 mov $acc2, %rax 693 adc %rdx, $acc0 694 adc \$0, $acc1 695 696 ################################# Third reduction step 697 mulq 8*0(%r14) 698 mov $acc2, $t1 699 add %rax, $t0 # guaranteed to be zero 700 mov $acc2, %rax 701 adc %rdx, $t0 702 703 sub $acc2, $acc4 704 sbb \$0, $acc2 # can't borrow 705 706 mulq 8*1(%r14) 707 add $t0, $acc3 708 adc \$0, %rdx 709 add %rax, $acc3 710 mov $t1, %rax 711 adc %rdx, $acc4 712 mov $t1, %rdx 713 adc \$0, $acc2 # can't overflow 714 715 shl \$32, %rax 716 shr \$32, %rdx 717 sub %rax, $acc5 718 mov 8*3($b_ptr), %rax 719 sbb %rdx, $t1 # can't borrow 720 721 add $acc2, $acc5 722 adc $t1, $acc0 723 adc \$0, $acc1 724 725 ################################# * b[3] 726 mov %rax, $t0 727 mulq 8*0($a_ptr) 728 add %rax, $acc3 729 mov $t0, %rax 730 adc \$0, %rdx 731 mov %rdx, $t1 732 733 mulq 8*1($a_ptr) 734 add $t1, $acc4 735 adc \$0, %rdx 736 add %rax, $acc4 737 mov $t0, %rax 738 adc \$0, %rdx 739 mov %rdx, $t1 740 741 mulq 8*2($a_ptr) 742 add $t1, $acc5 743 adc \$0, %rdx 744 add %rax, $acc5 745 mov $t0, %rax 746 adc \$0, %rdx 747 748 mov $acc3, $t0 749 imulq %r15, $acc3 750 751 mov %rdx, $t1 752 mulq 8*3($a_ptr) 753 add $t1, $acc0 754 adc \$0, %rdx 755 xor $acc2, $acc2 756 add %rax, $acc0 757 mov $acc3, %rax 758 adc %rdx, $acc1 759 adc \$0, $acc2 760 761 ################################# Last reduction step 762 mulq 8*0(%r14) 763 mov $acc3, $t1 764 add %rax, $t0 # guaranteed to be zero 765 mov $acc3, %rax 766 adc %rdx, $t0 767 768 sub $acc3, $acc5 769 sbb \$0, $acc3 # can't borrow 770 771 mulq 8*1(%r14) 772 add $t0, $acc4 773 adc \$0, %rdx 774 add %rax, $acc4 775 mov $t1, %rax 776 adc %rdx, $acc5 777 mov $t1, %rdx 778 adc \$0, $acc3 # can't overflow 779 780 shl \$32, %rax 781 shr \$32, %rdx 782 sub %rax, $acc0 783 sbb %rdx, $t1 # can't borrow 784 785 add $acc3, $acc0 786 adc $t1, $acc1 787 adc \$0, $acc2 788 789 ################################# Subtract ord 790 mov $acc4, $a_ptr 791 sub 8*0(%r14), $acc4 792 mov $acc5, $acc3 793 sbb 8*1(%r14), $acc5 794 mov $acc0, $t0 795 sbb 8*2(%r14), $acc0 796 mov $acc1, $t1 797 sbb 8*3(%r14), $acc1 798 sbb \$0, $acc2 799 800 cmovc $a_ptr, $acc4 801 cmovc $acc3, $acc5 802 cmovc $t0, $acc0 803 cmovc $t1, $acc1 804 805 mov $acc4, 8*0($r_ptr) 806 mov $acc5, 8*1($r_ptr) 807 mov $acc0, 8*2($r_ptr) 808 mov $acc1, 8*3($r_ptr) 809 810 mov 0(%rsp),%r15 811.cfi_restore %r15 812 mov 8(%rsp),%r14 813.cfi_restore %r14 814 mov 16(%rsp),%r13 815.cfi_restore %r13 816 mov 24(%rsp),%r12 817.cfi_restore %r12 818 mov 32(%rsp),%rbx 819.cfi_restore %rbx 820 mov 40(%rsp),%rbp 821.cfi_restore %rbp 822 lea 48(%rsp),%rsp 823.cfi_adjust_cfa_offset -48 824.Lord_mul_epilogue: 825 ret 826.cfi_endproc 827.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 828 829################################################################################ 830# void ecp_nistz256_ord_sqr_mont( 831# uint64_t res[4], 832# uint64_t a[4], 833# uint64_t rep); 834 835.globl ecp_nistz256_ord_sqr_mont 836.type ecp_nistz256_ord_sqr_mont,\@function,3 837.align 32 838ecp_nistz256_ord_sqr_mont: 839.cfi_startproc 840___ 841$code.=<<___ if ($addx); 842 mov \$0x80100, %ecx 843 and OPENSSL_ia32cap_P+8(%rip), %ecx 844 cmp \$0x80100, %ecx 845 je .Lecp_nistz256_ord_sqr_montx 846___ 847$code.=<<___; 848 push %rbp 849.cfi_push %rbp 850 push %rbx 851.cfi_push %rbx 852 push %r12 853.cfi_push %r12 854 push %r13 855.cfi_push %r13 856 push %r14 857.cfi_push %r14 858 push %r15 859.cfi_push %r15 860.Lord_sqr_body: 861 862 mov 8*0($a_ptr), $acc0 863 mov 8*1($a_ptr), %rax 864 mov 8*2($a_ptr), $acc6 865 mov 8*3($a_ptr), $acc7 866 lea .Lord(%rip), $a_ptr # pointer to modulus 867 mov $b_org, $b_ptr 868 jmp .Loop_ord_sqr 869 870.align 32 871.Loop_ord_sqr: 872 ################################# a[1:] * a[0] 873 mov %rax, $t1 # put aside a[1] 874 mul $acc0 # a[1] * a[0] 875 mov %rax, $acc1 876 movq $t1, %xmm1 # offload a[1] 877 mov $acc6, %rax 878 mov %rdx, $acc2 879 880 mul $acc0 # a[2] * a[0] 881 add %rax, $acc2 882 mov $acc7, %rax 883 movq $acc6, %xmm2 # offload a[2] 884 adc \$0, %rdx 885 mov %rdx, $acc3 886 887 mul $acc0 # a[3] * a[0] 888 add %rax, $acc3 889 mov $acc7, %rax 890 movq $acc7, %xmm3 # offload a[3] 891 adc \$0, %rdx 892 mov %rdx, $acc4 893 894 ################################# a[3] * a[2] 895 mul $acc6 # a[3] * a[2] 896 mov %rax, $acc5 897 mov $acc6, %rax 898 mov %rdx, $acc6 899 900 ################################# a[2:] * a[1] 901 mul $t1 # a[2] * a[1] 902 add %rax, $acc3 903 mov $acc7, %rax 904 adc \$0, %rdx 905 mov %rdx, $acc7 906 907 mul $t1 # a[3] * a[1] 908 add %rax, $acc4 909 adc \$0, %rdx 910 911 add $acc7, $acc4 912 adc %rdx, $acc5 913 adc \$0, $acc6 # can't overflow 914 915 ################################# *2 916 xor $acc7, $acc7 917 mov $acc0, %rax 918 add $acc1, $acc1 919 adc $acc2, $acc2 920 adc $acc3, $acc3 921 adc $acc4, $acc4 922 adc $acc5, $acc5 923 adc $acc6, $acc6 924 adc \$0, $acc7 925 926 ################################# Missing products 927 mul %rax # a[0] * a[0] 928 mov %rax, $acc0 929 movq %xmm1, %rax 930 mov %rdx, $t1 931 932 mul %rax # a[1] * a[1] 933 add $t1, $acc1 934 adc %rax, $acc2 935 movq %xmm2, %rax 936 adc \$0, %rdx 937 mov %rdx, $t1 938 939 mul %rax # a[2] * a[2] 940 add $t1, $acc3 941 adc %rax, $acc4 942 movq %xmm3, %rax 943 adc \$0, %rdx 944 mov %rdx, $t1 945 946 mov $acc0, $t0 947 imulq 8*4($a_ptr), $acc0 # *= .LordK 948 949 mul %rax # a[3] * a[3] 950 add $t1, $acc5 951 adc %rax, $acc6 952 mov 8*0($a_ptr), %rax # modulus[0] 953 adc %rdx, $acc7 # can't overflow 954 955 ################################# First reduction step 956 mul $acc0 957 mov $acc0, $t1 958 add %rax, $t0 # guaranteed to be zero 959 mov 8*1($a_ptr), %rax # modulus[1] 960 adc %rdx, $t0 961 962 sub $acc0, $acc2 963 sbb \$0, $t1 # can't borrow 964 965 mul $acc0 966 add $t0, $acc1 967 adc \$0, %rdx 968 add %rax, $acc1 969 mov $acc0, %rax 970 adc %rdx, $acc2 971 mov $acc0, %rdx 972 adc \$0, $t1 # can't overflow 973 974 mov $acc1, $t0 975 imulq 8*4($a_ptr), $acc1 # *= .LordK 976 977 shl \$32, %rax 978 shr \$32, %rdx 979 sub %rax, $acc3 980 mov 8*0($a_ptr), %rax 981 sbb %rdx, $acc0 # can't borrow 982 983 add $t1, $acc3 984 adc \$0, $acc0 # can't overflow 985 986 ################################# Second reduction step 987 mul $acc1 988 mov $acc1, $t1 989 add %rax, $t0 # guaranteed to be zero 990 mov 8*1($a_ptr), %rax 991 adc %rdx, $t0 992 993 sub $acc1, $acc3 994 sbb \$0, $t1 # can't borrow 995 996 mul $acc1 997 add $t0, $acc2 998 adc \$0, %rdx 999 add %rax, $acc2 1000 mov $acc1, %rax 1001 adc %rdx, $acc3 1002 mov $acc1, %rdx 1003 adc \$0, $t1 # can't overflow 1004 1005 mov $acc2, $t0 1006 imulq 8*4($a_ptr), $acc2 # *= .LordK 1007 1008 shl \$32, %rax 1009 shr \$32, %rdx 1010 sub %rax, $acc0 1011 mov 8*0($a_ptr), %rax 1012 sbb %rdx, $acc1 # can't borrow 1013 1014 add $t1, $acc0 1015 adc \$0, $acc1 # can't overflow 1016 1017 ################################# Third reduction step 1018 mul $acc2 1019 mov $acc2, $t1 1020 add %rax, $t0 # guaranteed to be zero 1021 mov 8*1($a_ptr), %rax 1022 adc %rdx, $t0 1023 1024 sub $acc2, $acc0 1025 sbb \$0, $t1 # can't borrow 1026 1027 mul $acc2 1028 add $t0, $acc3 1029 adc \$0, %rdx 1030 add %rax, $acc3 1031 mov $acc2, %rax 1032 adc %rdx, $acc0 1033 mov $acc2, %rdx 1034 adc \$0, $t1 # can't overflow 1035 1036 mov $acc3, $t0 1037 imulq 8*4($a_ptr), $acc3 # *= .LordK 1038 1039 shl \$32, %rax 1040 shr \$32, %rdx 1041 sub %rax, $acc1 1042 mov 8*0($a_ptr), %rax 1043 sbb %rdx, $acc2 # can't borrow 1044 1045 add $t1, $acc1 1046 adc \$0, $acc2 # can't overflow 1047 1048 ################################# Last reduction step 1049 mul $acc3 1050 mov $acc3, $t1 1051 add %rax, $t0 # guaranteed to be zero 1052 mov 8*1($a_ptr), %rax 1053 adc %rdx, $t0 1054 1055 sub $acc3, $acc1 1056 sbb \$0, $t1 # can't borrow 1057 1058 mul $acc3 1059 add $t0, $acc0 1060 adc \$0, %rdx 1061 add %rax, $acc0 1062 mov $acc3, %rax 1063 adc %rdx, $acc1 1064 mov $acc3, %rdx 1065 adc \$0, $t1 # can't overflow 1066 1067 shl \$32, %rax 1068 shr \$32, %rdx 1069 sub %rax, $acc2 1070 sbb %rdx, $acc3 # can't borrow 1071 1072 add $t1, $acc2 1073 adc \$0, $acc3 # can't overflow 1074 1075 ################################# Add bits [511:256] of the sqr result 1076 xor %rdx, %rdx 1077 add $acc4, $acc0 1078 adc $acc5, $acc1 1079 mov $acc0, $acc4 1080 adc $acc6, $acc2 1081 adc $acc7, $acc3 1082 mov $acc1, %rax 1083 adc \$0, %rdx 1084 1085 ################################# Compare to modulus 1086 sub 8*0($a_ptr), $acc0 1087 mov $acc2, $acc6 1088 sbb 8*1($a_ptr), $acc1 1089 sbb 8*2($a_ptr), $acc2 1090 mov $acc3, $acc7 1091 sbb 8*3($a_ptr), $acc3 1092 sbb \$0, %rdx 1093 1094 cmovc $acc4, $acc0 1095 cmovnc $acc1, %rax 1096 cmovnc $acc2, $acc6 1097 cmovnc $acc3, $acc7 1098 1099 dec $b_ptr 1100 jnz .Loop_ord_sqr 1101 1102 mov $acc0, 8*0($r_ptr) 1103 mov %rax, 8*1($r_ptr) 1104 pxor %xmm1, %xmm1 1105 mov $acc6, 8*2($r_ptr) 1106 pxor %xmm2, %xmm2 1107 mov $acc7, 8*3($r_ptr) 1108 pxor %xmm3, %xmm3 1109 1110 mov 0(%rsp),%r15 1111.cfi_restore %r15 1112 mov 8(%rsp),%r14 1113.cfi_restore %r14 1114 mov 16(%rsp),%r13 1115.cfi_restore %r13 1116 mov 24(%rsp),%r12 1117.cfi_restore %r12 1118 mov 32(%rsp),%rbx 1119.cfi_restore %rbx 1120 mov 40(%rsp),%rbp 1121.cfi_restore %rbp 1122 lea 48(%rsp),%rsp 1123.cfi_adjust_cfa_offset -48 1124.Lord_sqr_epilogue: 1125 ret 1126.cfi_endproc 1127.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont 1128___ 1129 1130$code.=<<___ if ($addx); 1131################################################################################ 1132.type ecp_nistz256_ord_mul_montx,\@function,3 1133.align 32 1134ecp_nistz256_ord_mul_montx: 1135.cfi_startproc 1136.Lecp_nistz256_ord_mul_montx: 1137 push %rbp 1138.cfi_push %rbp 1139 push %rbx 1140.cfi_push %rbx 1141 push %r12 1142.cfi_push %r12 1143 push %r13 1144.cfi_push %r13 1145 push %r14 1146.cfi_push %r14 1147 push %r15 1148.cfi_push %r15 1149.Lord_mulx_body: 1150 1151 mov $b_org, $b_ptr 1152 mov 8*0($b_org), %rdx 1153 mov 8*0($a_ptr), $acc1 1154 mov 8*1($a_ptr), $acc2 1155 mov 8*2($a_ptr), $acc3 1156 mov 8*3($a_ptr), $acc4 1157 lea -128($a_ptr), $a_ptr # control u-op density 1158 lea .Lord-128(%rip), %r14 1159 mov .LordK(%rip), %r15 1160 1161 ################################# Multiply by b[0] 1162 mulx $acc1, $acc0, $acc1 1163 mulx $acc2, $t0, $acc2 1164 mulx $acc3, $t1, $acc3 1165 add $t0, $acc1 1166 mulx $acc4, $t0, $acc4 1167 mov $acc0, %rdx 1168 mulx %r15, %rdx, %rax 1169 adc $t1, $acc2 1170 adc $t0, $acc3 1171 adc \$0, $acc4 1172 1173 ################################# reduction 1174 xor $acc5, $acc5 # $acc5=0, cf=0, of=0 1175 mulx 8*0+128(%r14), $t0, $t1 1176 adcx $t0, $acc0 # guaranteed to be zero 1177 adox $t1, $acc1 1178 1179 mulx 8*1+128(%r14), $t0, $t1 1180 adcx $t0, $acc1 1181 adox $t1, $acc2 1182 1183 mulx 8*2+128(%r14), $t0, $t1 1184 adcx $t0, $acc2 1185 adox $t1, $acc3 1186 1187 mulx 8*3+128(%r14), $t0, $t1 1188 mov 8*1($b_ptr), %rdx 1189 adcx $t0, $acc3 1190 adox $t1, $acc4 1191 adcx $acc0, $acc4 1192 adox $acc0, $acc5 1193 adc \$0, $acc5 # cf=0, of=0 1194 1195 ################################# Multiply by b[1] 1196 mulx 8*0+128($a_ptr), $t0, $t1 1197 adcx $t0, $acc1 1198 adox $t1, $acc2 1199 1200 mulx 8*1+128($a_ptr), $t0, $t1 1201 adcx $t0, $acc2 1202 adox $t1, $acc3 1203 1204 mulx 8*2+128($a_ptr), $t0, $t1 1205 adcx $t0, $acc3 1206 adox $t1, $acc4 1207 1208 mulx 8*3+128($a_ptr), $t0, $t1 1209 mov $acc1, %rdx 1210 mulx %r15, %rdx, %rax 1211 adcx $t0, $acc4 1212 adox $t1, $acc5 1213 1214 adcx $acc0, $acc5 1215 adox $acc0, $acc0 1216 adc \$0, $acc0 # cf=0, of=0 1217 1218 ################################# reduction 1219 mulx 8*0+128(%r14), $t0, $t1 1220 adcx $t0, $acc1 # guaranteed to be zero 1221 adox $t1, $acc2 1222 1223 mulx 8*1+128(%r14), $t0, $t1 1224 adcx $t0, $acc2 1225 adox $t1, $acc3 1226 1227 mulx 8*2+128(%r14), $t0, $t1 1228 adcx $t0, $acc3 1229 adox $t1, $acc4 1230 1231 mulx 8*3+128(%r14), $t0, $t1 1232 mov 8*2($b_ptr), %rdx 1233 adcx $t0, $acc4 1234 adox $t1, $acc5 1235 adcx $acc1, $acc5 1236 adox $acc1, $acc0 1237 adc \$0, $acc0 # cf=0, of=0 1238 1239 ################################# Multiply by b[2] 1240 mulx 8*0+128($a_ptr), $t0, $t1 1241 adcx $t0, $acc2 1242 adox $t1, $acc3 1243 1244 mulx 8*1+128($a_ptr), $t0, $t1 1245 adcx $t0, $acc3 1246 adox $t1, $acc4 1247 1248 mulx 8*2+128($a_ptr), $t0, $t1 1249 adcx $t0, $acc4 1250 adox $t1, $acc5 1251 1252 mulx 8*3+128($a_ptr), $t0, $t1 1253 mov $acc2, %rdx 1254 mulx %r15, %rdx, %rax 1255 adcx $t0, $acc5 1256 adox $t1, $acc0 1257 1258 adcx $acc1, $acc0 1259 adox $acc1, $acc1 1260 adc \$0, $acc1 # cf=0, of=0 1261 1262 ################################# reduction 1263 mulx 8*0+128(%r14), $t0, $t1 1264 adcx $t0, $acc2 # guaranteed to be zero 1265 adox $t1, $acc3 1266 1267 mulx 8*1+128(%r14), $t0, $t1 1268 adcx $t0, $acc3 1269 adox $t1, $acc4 1270 1271 mulx 8*2+128(%r14), $t0, $t1 1272 adcx $t0, $acc4 1273 adox $t1, $acc5 1274 1275 mulx 8*3+128(%r14), $t0, $t1 1276 mov 8*3($b_ptr), %rdx 1277 adcx $t0, $acc5 1278 adox $t1, $acc0 1279 adcx $acc2, $acc0 1280 adox $acc2, $acc1 1281 adc \$0, $acc1 # cf=0, of=0 1282 1283 ################################# Multiply by b[3] 1284 mulx 8*0+128($a_ptr), $t0, $t1 1285 adcx $t0, $acc3 1286 adox $t1, $acc4 1287 1288 mulx 8*1+128($a_ptr), $t0, $t1 1289 adcx $t0, $acc4 1290 adox $t1, $acc5 1291 1292 mulx 8*2+128($a_ptr), $t0, $t1 1293 adcx $t0, $acc5 1294 adox $t1, $acc0 1295 1296 mulx 8*3+128($a_ptr), $t0, $t1 1297 mov $acc3, %rdx 1298 mulx %r15, %rdx, %rax 1299 adcx $t0, $acc0 1300 adox $t1, $acc1 1301 1302 adcx $acc2, $acc1 1303 adox $acc2, $acc2 1304 adc \$0, $acc2 # cf=0, of=0 1305 1306 ################################# reduction 1307 mulx 8*0+128(%r14), $t0, $t1 1308 adcx $t0, $acc3 # guaranteed to be zero 1309 adox $t1, $acc4 1310 1311 mulx 8*1+128(%r14), $t0, $t1 1312 adcx $t0, $acc4 1313 adox $t1, $acc5 1314 1315 mulx 8*2+128(%r14), $t0, $t1 1316 adcx $t0, $acc5 1317 adox $t1, $acc0 1318 1319 mulx 8*3+128(%r14), $t0, $t1 1320 lea 128(%r14),%r14 1321 mov $acc4, $t2 1322 adcx $t0, $acc0 1323 adox $t1, $acc1 1324 mov $acc5, $t3 1325 adcx $acc3, $acc1 1326 adox $acc3, $acc2 1327 adc \$0, $acc2 1328 1329 ################################# 1330 # Branch-less conditional subtraction of P 1331 mov $acc0, $t0 1332 sub 8*0(%r14), $acc4 1333 sbb 8*1(%r14), $acc5 1334 sbb 8*2(%r14), $acc0 1335 mov $acc1, $t1 1336 sbb 8*3(%r14), $acc1 1337 sbb \$0, $acc2 1338 1339 cmovc $t2, $acc4 1340 cmovc $t3, $acc5 1341 cmovc $t0, $acc0 1342 cmovc $t1, $acc1 1343 1344 mov $acc4, 8*0($r_ptr) 1345 mov $acc5, 8*1($r_ptr) 1346 mov $acc0, 8*2($r_ptr) 1347 mov $acc1, 8*3($r_ptr) 1348 1349 mov 0(%rsp),%r15 1350.cfi_restore %r15 1351 mov 8(%rsp),%r14 1352.cfi_restore %r14 1353 mov 16(%rsp),%r13 1354.cfi_restore %r13 1355 mov 24(%rsp),%r12 1356.cfi_restore %r12 1357 mov 32(%rsp),%rbx 1358.cfi_restore %rbx 1359 mov 40(%rsp),%rbp 1360.cfi_restore %rbp 1361 lea 48(%rsp),%rsp 1362.cfi_adjust_cfa_offset -48 1363.Lord_mulx_epilogue: 1364 ret 1365.cfi_endproc 1366.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx 1367 1368.type ecp_nistz256_ord_sqr_montx,\@function,3 1369.align 32 1370ecp_nistz256_ord_sqr_montx: 1371.cfi_startproc 1372.Lecp_nistz256_ord_sqr_montx: 1373 push %rbp 1374.cfi_push %rbp 1375 push %rbx 1376.cfi_push %rbx 1377 push %r12 1378.cfi_push %r12 1379 push %r13 1380.cfi_push %r13 1381 push %r14 1382.cfi_push %r14 1383 push %r15 1384.cfi_push %r15 1385.Lord_sqrx_body: 1386 1387 mov $b_org, $b_ptr 1388 mov 8*0($a_ptr), %rdx 1389 mov 8*1($a_ptr), $acc6 1390 mov 8*2($a_ptr), $acc7 1391 mov 8*3($a_ptr), $acc0 1392 lea .Lord(%rip), $a_ptr 1393 jmp .Loop_ord_sqrx 1394 1395.align 32 1396.Loop_ord_sqrx: 1397 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 1398 mulx $acc7, $t0, $acc3 # a[0]*a[2] 1399 mov %rdx, %rax # offload a[0] 1400 movq $acc6, %xmm1 # offload a[1] 1401 mulx $acc0, $t1, $acc4 # a[0]*a[3] 1402 mov $acc6, %rdx 1403 add $t0, $acc2 1404 movq $acc7, %xmm2 # offload a[2] 1405 adc $t1, $acc3 1406 adc \$0, $acc4 1407 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 1408 ################################# 1409 mulx $acc7, $t0, $t1 # a[1]*a[2] 1410 adcx $t0, $acc3 1411 adox $t1, $acc4 1412 1413 mulx $acc0, $t0, $t1 # a[1]*a[3] 1414 mov $acc7, %rdx 1415 adcx $t0, $acc4 1416 adox $t1, $acc5 1417 adc \$0, $acc5 1418 ################################# 1419 mulx $acc0, $t0, $acc6 # a[2]*a[3] 1420 mov %rax, %rdx 1421 movq $acc0, %xmm3 # offload a[3] 1422 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 1423 adcx $acc1, $acc1 # acc1:6<<1 1424 adox $t0, $acc5 1425 adcx $acc2, $acc2 1426 adox $acc7, $acc6 # of=0 1427 1428 ################################# a[i]*a[i] 1429 mulx %rdx, $acc0, $t1 1430 movq %xmm1, %rdx 1431 adcx $acc3, $acc3 1432 adox $t1, $acc1 1433 adcx $acc4, $acc4 1434 mulx %rdx, $t0, $t4 1435 movq %xmm2, %rdx 1436 adcx $acc5, $acc5 1437 adox $t0, $acc2 1438 adcx $acc6, $acc6 1439 mulx %rdx, $t0, $t1 1440 .byte 0x67 1441 movq %xmm3, %rdx 1442 adox $t4, $acc3 1443 adcx $acc7, $acc7 1444 adox $t0, $acc4 1445 adox $t1, $acc5 1446 mulx %rdx, $t0, $t4 1447 adox $t0, $acc6 1448 adox $t4, $acc7 1449 1450 ################################# reduction 1451 mov $acc0, %rdx 1452 mulx 8*4($a_ptr), %rdx, $t0 1453 1454 xor %rax, %rax # cf=0, of=0 1455 mulx 8*0($a_ptr), $t0, $t1 1456 adcx $t0, $acc0 # guaranteed to be zero 1457 adox $t1, $acc1 1458 mulx 8*1($a_ptr), $t0, $t1 1459 adcx $t0, $acc1 1460 adox $t1, $acc2 1461 mulx 8*2($a_ptr), $t0, $t1 1462 adcx $t0, $acc2 1463 adox $t1, $acc3 1464 mulx 8*3($a_ptr), $t0, $t1 1465 adcx $t0, $acc3 1466 adox $t1, $acc0 # of=0 1467 adcx %rax, $acc0 # cf=0 1468 1469 ################################# 1470 mov $acc1, %rdx 1471 mulx 8*4($a_ptr), %rdx, $t0 1472 1473 mulx 8*0($a_ptr), $t0, $t1 1474 adox $t0, $acc1 # guaranteed to be zero 1475 adcx $t1, $acc2 1476 mulx 8*1($a_ptr), $t0, $t1 1477 adox $t0, $acc2 1478 adcx $t1, $acc3 1479 mulx 8*2($a_ptr), $t0, $t1 1480 adox $t0, $acc3 1481 adcx $t1, $acc0 1482 mulx 8*3($a_ptr), $t0, $t1 1483 adox $t0, $acc0 1484 adcx $t1, $acc1 # cf=0 1485 adox %rax, $acc1 # of=0 1486 1487 ################################# 1488 mov $acc2, %rdx 1489 mulx 8*4($a_ptr), %rdx, $t0 1490 1491 mulx 8*0($a_ptr), $t0, $t1 1492 adcx $t0, $acc2 # guaranteed to be zero 1493 adox $t1, $acc3 1494 mulx 8*1($a_ptr), $t0, $t1 1495 adcx $t0, $acc3 1496 adox $t1, $acc0 1497 mulx 8*2($a_ptr), $t0, $t1 1498 adcx $t0, $acc0 1499 adox $t1, $acc1 1500 mulx 8*3($a_ptr), $t0, $t1 1501 adcx $t0, $acc1 1502 adox $t1, $acc2 # of=0 1503 adcx %rax, $acc2 # cf=0 1504 1505 ################################# 1506 mov $acc3, %rdx 1507 mulx 8*4($a_ptr), %rdx, $t0 1508 1509 mulx 8*0($a_ptr), $t0, $t1 1510 adox $t0, $acc3 # guaranteed to be zero 1511 adcx $t1, $acc0 1512 mulx 8*1($a_ptr), $t0, $t1 1513 adox $t0, $acc0 1514 adcx $t1, $acc1 1515 mulx 8*2($a_ptr), $t0, $t1 1516 adox $t0, $acc1 1517 adcx $t1, $acc2 1518 mulx 8*3($a_ptr), $t0, $t1 1519 adox $t0, $acc2 1520 adcx $t1, $acc3 1521 adox %rax, $acc3 1522 1523 ################################# accumulate upper half 1524 add $acc0, $acc4 # add $acc4, $acc0 1525 adc $acc5, $acc1 1526 mov $acc4, %rdx 1527 adc $acc6, $acc2 1528 adc $acc7, $acc3 1529 mov $acc1, $acc6 1530 adc \$0, %rax 1531 1532 ################################# compare to modulus 1533 sub 8*0($a_ptr), $acc4 1534 mov $acc2, $acc7 1535 sbb 8*1($a_ptr), $acc1 1536 sbb 8*2($a_ptr), $acc2 1537 mov $acc3, $acc0 1538 sbb 8*3($a_ptr), $acc3 1539 sbb \$0, %rax 1540 1541 cmovnc $acc4, %rdx 1542 cmovnc $acc1, $acc6 1543 cmovnc $acc2, $acc7 1544 cmovnc $acc3, $acc0 1545 1546 dec $b_ptr 1547 jnz .Loop_ord_sqrx 1548 1549 mov %rdx, 8*0($r_ptr) 1550 mov $acc6, 8*1($r_ptr) 1551 pxor %xmm1, %xmm1 1552 mov $acc7, 8*2($r_ptr) 1553 pxor %xmm2, %xmm2 1554 mov $acc0, 8*3($r_ptr) 1555 pxor %xmm3, %xmm3 1556 1557 mov 0(%rsp),%r15 1558.cfi_restore %r15 1559 mov 8(%rsp),%r14 1560.cfi_restore %r14 1561 mov 16(%rsp),%r13 1562.cfi_restore %r13 1563 mov 24(%rsp),%r12 1564.cfi_restore %r12 1565 mov 32(%rsp),%rbx 1566.cfi_restore %rbx 1567 mov 40(%rsp),%rbp 1568.cfi_restore %rbp 1569 lea 48(%rsp),%rsp 1570.cfi_adjust_cfa_offset -48 1571.Lord_sqrx_epilogue: 1572 ret 1573.cfi_endproc 1574.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx 1575___ 1576 1577$code.=<<___; 1578################################################################################ 1579# void ecp_nistz256_to_mont( 1580# uint64_t res[4], 1581# uint64_t in[4]); 1582.globl ecp_nistz256_to_mont 1583.type ecp_nistz256_to_mont,\@function,2 1584.align 32 1585ecp_nistz256_to_mont: 1586.cfi_startproc 1587___ 1588$code.=<<___ if ($addx); 1589 mov \$0x80100, %ecx 1590 and OPENSSL_ia32cap_P+8(%rip), %ecx 1591___ 1592$code.=<<___; 1593 lea .LRR(%rip), $b_org 1594 jmp .Lmul_mont 1595.cfi_endproc 1596.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont 1597 1598################################################################################ 1599# void ecp_nistz256_mul_mont( 1600# uint64_t res[4], 1601# uint64_t a[4], 1602# uint64_t b[4]); 1603 1604.globl ecp_nistz256_mul_mont 1605.type ecp_nistz256_mul_mont,\@function,3 1606.align 32 1607ecp_nistz256_mul_mont: 1608.cfi_startproc 1609___ 1610$code.=<<___ if ($addx); 1611 mov \$0x80100, %ecx 1612 and OPENSSL_ia32cap_P+8(%rip), %ecx 1613___ 1614$code.=<<___; 1615.Lmul_mont: 1616 push %rbp 1617.cfi_push %rbp 1618 push %rbx 1619.cfi_push %rbx 1620 push %r12 1621.cfi_push %r12 1622 push %r13 1623.cfi_push %r13 1624 push %r14 1625.cfi_push %r14 1626 push %r15 1627.cfi_push %r15 1628.Lmul_body: 1629___ 1630$code.=<<___ if ($addx); 1631 cmp \$0x80100, %ecx 1632 je .Lmul_montx 1633___ 1634$code.=<<___; 1635 mov $b_org, $b_ptr 1636 mov 8*0($b_org), %rax 1637 mov 8*0($a_ptr), $acc1 1638 mov 8*1($a_ptr), $acc2 1639 mov 8*2($a_ptr), $acc3 1640 mov 8*3($a_ptr), $acc4 1641 1642 call __ecp_nistz256_mul_montq 1643___ 1644$code.=<<___ if ($addx); 1645 jmp .Lmul_mont_done 1646 1647.align 32 1648.Lmul_montx: 1649 mov $b_org, $b_ptr 1650 mov 8*0($b_org), %rdx 1651 mov 8*0($a_ptr), $acc1 1652 mov 8*1($a_ptr), $acc2 1653 mov 8*2($a_ptr), $acc3 1654 mov 8*3($a_ptr), $acc4 1655 lea -128($a_ptr), $a_ptr # control u-op density 1656 1657 call __ecp_nistz256_mul_montx 1658___ 1659$code.=<<___; 1660.Lmul_mont_done: 1661 mov 0(%rsp),%r15 1662.cfi_restore %r15 1663 mov 8(%rsp),%r14 1664.cfi_restore %r14 1665 mov 16(%rsp),%r13 1666.cfi_restore %r13 1667 mov 24(%rsp),%r12 1668.cfi_restore %r12 1669 mov 32(%rsp),%rbx 1670.cfi_restore %rbx 1671 mov 40(%rsp),%rbp 1672.cfi_restore %rbp 1673 lea 48(%rsp),%rsp 1674.cfi_adjust_cfa_offset -48 1675.Lmul_epilogue: 1676 ret 1677.cfi_endproc 1678.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 1679 1680.type __ecp_nistz256_mul_montq,\@abi-omnipotent 1681.align 32 1682__ecp_nistz256_mul_montq: 1683.cfi_startproc 1684 ######################################################################## 1685 # Multiply a by b[0] 1686 mov %rax, $t1 1687 mulq $acc1 1688 mov .Lpoly+8*1(%rip),$poly1 1689 mov %rax, $acc0 1690 mov $t1, %rax 1691 mov %rdx, $acc1 1692 1693 mulq $acc2 1694 mov .Lpoly+8*3(%rip),$poly3 1695 add %rax, $acc1 1696 mov $t1, %rax 1697 adc \$0, %rdx 1698 mov %rdx, $acc2 1699 1700 mulq $acc3 1701 add %rax, $acc2 1702 mov $t1, %rax 1703 adc \$0, %rdx 1704 mov %rdx, $acc3 1705 1706 mulq $acc4 1707 add %rax, $acc3 1708 mov $acc0, %rax 1709 adc \$0, %rdx 1710 xor $acc5, $acc5 1711 mov %rdx, $acc4 1712 1713 ######################################################################## 1714 # First reduction step 1715 # Basically now we want to multiply acc[0] by p256, 1716 # and add the result to the acc. 1717 # Due to the special form of p256 we do some optimizations 1718 # 1719 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] 1720 # then we add acc[0] and get acc[0] x 2^96 1721 1722 mov $acc0, $t1 1723 shl \$32, $acc0 1724 mulq $poly3 1725 shr \$32, $t1 1726 add $acc0, $acc1 # +=acc[0]<<96 1727 adc $t1, $acc2 1728 adc %rax, $acc3 1729 mov 8*1($b_ptr), %rax 1730 adc %rdx, $acc4 1731 adc \$0, $acc5 1732 xor $acc0, $acc0 1733 1734 ######################################################################## 1735 # Multiply by b[1] 1736 mov %rax, $t1 1737 mulq 8*0($a_ptr) 1738 add %rax, $acc1 1739 mov $t1, %rax 1740 adc \$0, %rdx 1741 mov %rdx, $t0 1742 1743 mulq 8*1($a_ptr) 1744 add $t0, $acc2 1745 adc \$0, %rdx 1746 add %rax, $acc2 1747 mov $t1, %rax 1748 adc \$0, %rdx 1749 mov %rdx, $t0 1750 1751 mulq 8*2($a_ptr) 1752 add $t0, $acc3 1753 adc \$0, %rdx 1754 add %rax, $acc3 1755 mov $t1, %rax 1756 adc \$0, %rdx 1757 mov %rdx, $t0 1758 1759 mulq 8*3($a_ptr) 1760 add $t0, $acc4 1761 adc \$0, %rdx 1762 add %rax, $acc4 1763 mov $acc1, %rax 1764 adc %rdx, $acc5 1765 adc \$0, $acc0 1766 1767 ######################################################################## 1768 # Second reduction step 1769 mov $acc1, $t1 1770 shl \$32, $acc1 1771 mulq $poly3 1772 shr \$32, $t1 1773 add $acc1, $acc2 1774 adc $t1, $acc3 1775 adc %rax, $acc4 1776 mov 8*2($b_ptr), %rax 1777 adc %rdx, $acc5 1778 adc \$0, $acc0 1779 xor $acc1, $acc1 1780 1781 ######################################################################## 1782 # Multiply by b[2] 1783 mov %rax, $t1 1784 mulq 8*0($a_ptr) 1785 add %rax, $acc2 1786 mov $t1, %rax 1787 adc \$0, %rdx 1788 mov %rdx, $t0 1789 1790 mulq 8*1($a_ptr) 1791 add $t0, $acc3 1792 adc \$0, %rdx 1793 add %rax, $acc3 1794 mov $t1, %rax 1795 adc \$0, %rdx 1796 mov %rdx, $t0 1797 1798 mulq 8*2($a_ptr) 1799 add $t0, $acc4 1800 adc \$0, %rdx 1801 add %rax, $acc4 1802 mov $t1, %rax 1803 adc \$0, %rdx 1804 mov %rdx, $t0 1805 1806 mulq 8*3($a_ptr) 1807 add $t0, $acc5 1808 adc \$0, %rdx 1809 add %rax, $acc5 1810 mov $acc2, %rax 1811 adc %rdx, $acc0 1812 adc \$0, $acc1 1813 1814 ######################################################################## 1815 # Third reduction step 1816 mov $acc2, $t1 1817 shl \$32, $acc2 1818 mulq $poly3 1819 shr \$32, $t1 1820 add $acc2, $acc3 1821 adc $t1, $acc4 1822 adc %rax, $acc5 1823 mov 8*3($b_ptr), %rax 1824 adc %rdx, $acc0 1825 adc \$0, $acc1 1826 xor $acc2, $acc2 1827 1828 ######################################################################## 1829 # Multiply by b[3] 1830 mov %rax, $t1 1831 mulq 8*0($a_ptr) 1832 add %rax, $acc3 1833 mov $t1, %rax 1834 adc \$0, %rdx 1835 mov %rdx, $t0 1836 1837 mulq 8*1($a_ptr) 1838 add $t0, $acc4 1839 adc \$0, %rdx 1840 add %rax, $acc4 1841 mov $t1, %rax 1842 adc \$0, %rdx 1843 mov %rdx, $t0 1844 1845 mulq 8*2($a_ptr) 1846 add $t0, $acc5 1847 adc \$0, %rdx 1848 add %rax, $acc5 1849 mov $t1, %rax 1850 adc \$0, %rdx 1851 mov %rdx, $t0 1852 1853 mulq 8*3($a_ptr) 1854 add $t0, $acc0 1855 adc \$0, %rdx 1856 add %rax, $acc0 1857 mov $acc3, %rax 1858 adc %rdx, $acc1 1859 adc \$0, $acc2 1860 1861 ######################################################################## 1862 # Final reduction step 1863 mov $acc3, $t1 1864 shl \$32, $acc3 1865 mulq $poly3 1866 shr \$32, $t1 1867 add $acc3, $acc4 1868 adc $t1, $acc5 1869 mov $acc4, $t0 1870 adc %rax, $acc0 1871 adc %rdx, $acc1 1872 mov $acc5, $t1 1873 adc \$0, $acc2 1874 1875 ######################################################################## 1876 # Branch-less conditional subtraction of P 1877 sub \$-1, $acc4 # .Lpoly[0] 1878 mov $acc0, $t2 1879 sbb $poly1, $acc5 # .Lpoly[1] 1880 sbb \$0, $acc0 # .Lpoly[2] 1881 mov $acc1, $t3 1882 sbb $poly3, $acc1 # .Lpoly[3] 1883 sbb \$0, $acc2 1884 1885 cmovc $t0, $acc4 1886 cmovc $t1, $acc5 1887 mov $acc4, 8*0($r_ptr) 1888 cmovc $t2, $acc0 1889 mov $acc5, 8*1($r_ptr) 1890 cmovc $t3, $acc1 1891 mov $acc0, 8*2($r_ptr) 1892 mov $acc1, 8*3($r_ptr) 1893 1894 ret 1895.cfi_endproc 1896.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq 1897 1898################################################################################ 1899# void ecp_nistz256_sqr_mont( 1900# uint64_t res[4], 1901# uint64_t a[4]); 1902 1903# we optimize the square according to S.Gueron and V.Krasnov, 1904# "Speeding up Big-Number Squaring" 1905.globl ecp_nistz256_sqr_mont 1906.type ecp_nistz256_sqr_mont,\@function,2 1907.align 32 1908ecp_nistz256_sqr_mont: 1909.cfi_startproc 1910___ 1911$code.=<<___ if ($addx); 1912 mov \$0x80100, %ecx 1913 and OPENSSL_ia32cap_P+8(%rip), %ecx 1914___ 1915$code.=<<___; 1916 push %rbp 1917.cfi_push %rbp 1918 push %rbx 1919.cfi_push %rbx 1920 push %r12 1921.cfi_push %r12 1922 push %r13 1923.cfi_push %r13 1924 push %r14 1925.cfi_push %r14 1926 push %r15 1927.cfi_push %r15 1928.Lsqr_body: 1929___ 1930$code.=<<___ if ($addx); 1931 cmp \$0x80100, %ecx 1932 je .Lsqr_montx 1933___ 1934$code.=<<___; 1935 mov 8*0($a_ptr), %rax 1936 mov 8*1($a_ptr), $acc6 1937 mov 8*2($a_ptr), $acc7 1938 mov 8*3($a_ptr), $acc0 1939 1940 call __ecp_nistz256_sqr_montq 1941___ 1942$code.=<<___ if ($addx); 1943 jmp .Lsqr_mont_done 1944 1945.align 32 1946.Lsqr_montx: 1947 mov 8*0($a_ptr), %rdx 1948 mov 8*1($a_ptr), $acc6 1949 mov 8*2($a_ptr), $acc7 1950 mov 8*3($a_ptr), $acc0 1951 lea -128($a_ptr), $a_ptr # control u-op density 1952 1953 call __ecp_nistz256_sqr_montx 1954___ 1955$code.=<<___; 1956.Lsqr_mont_done: 1957 mov 0(%rsp),%r15 1958.cfi_restore %r15 1959 mov 8(%rsp),%r14 1960.cfi_restore %r14 1961 mov 16(%rsp),%r13 1962.cfi_restore %r13 1963 mov 24(%rsp),%r12 1964.cfi_restore %r12 1965 mov 32(%rsp),%rbx 1966.cfi_restore %rbx 1967 mov 40(%rsp),%rbp 1968.cfi_restore %rbp 1969 lea 48(%rsp),%rsp 1970.cfi_adjust_cfa_offset -48 1971.Lsqr_epilogue: 1972 ret 1973.cfi_endproc 1974.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 1975 1976.type __ecp_nistz256_sqr_montq,\@abi-omnipotent 1977.align 32 1978__ecp_nistz256_sqr_montq: 1979.cfi_startproc 1980 mov %rax, $acc5 1981 mulq $acc6 # a[1]*a[0] 1982 mov %rax, $acc1 1983 mov $acc7, %rax 1984 mov %rdx, $acc2 1985 1986 mulq $acc5 # a[0]*a[2] 1987 add %rax, $acc2 1988 mov $acc0, %rax 1989 adc \$0, %rdx 1990 mov %rdx, $acc3 1991 1992 mulq $acc5 # a[0]*a[3] 1993 add %rax, $acc3 1994 mov $acc7, %rax 1995 adc \$0, %rdx 1996 mov %rdx, $acc4 1997 1998 ################################# 1999 mulq $acc6 # a[1]*a[2] 2000 add %rax, $acc3 2001 mov $acc0, %rax 2002 adc \$0, %rdx 2003 mov %rdx, $t1 2004 2005 mulq $acc6 # a[1]*a[3] 2006 add %rax, $acc4 2007 mov $acc0, %rax 2008 adc \$0, %rdx 2009 add $t1, $acc4 2010 mov %rdx, $acc5 2011 adc \$0, $acc5 2012 2013 ################################# 2014 mulq $acc7 # a[2]*a[3] 2015 xor $acc7, $acc7 2016 add %rax, $acc5 2017 mov 8*0($a_ptr), %rax 2018 mov %rdx, $acc6 2019 adc \$0, $acc6 2020 2021 add $acc1, $acc1 # acc1:6<<1 2022 adc $acc2, $acc2 2023 adc $acc3, $acc3 2024 adc $acc4, $acc4 2025 adc $acc5, $acc5 2026 adc $acc6, $acc6 2027 adc \$0, $acc7 2028 2029 mulq %rax 2030 mov %rax, $acc0 2031 mov 8*1($a_ptr), %rax 2032 mov %rdx, $t0 2033 2034 mulq %rax 2035 add $t0, $acc1 2036 adc %rax, $acc2 2037 mov 8*2($a_ptr), %rax 2038 adc \$0, %rdx 2039 mov %rdx, $t0 2040 2041 mulq %rax 2042 add $t0, $acc3 2043 adc %rax, $acc4 2044 mov 8*3($a_ptr), %rax 2045 adc \$0, %rdx 2046 mov %rdx, $t0 2047 2048 mulq %rax 2049 add $t0, $acc5 2050 adc %rax, $acc6 2051 mov $acc0, %rax 2052 adc %rdx, $acc7 2053 2054 mov .Lpoly+8*1(%rip), $a_ptr 2055 mov .Lpoly+8*3(%rip), $t1 2056 2057 ########################################## 2058 # Now the reduction 2059 # First iteration 2060 mov $acc0, $t0 2061 shl \$32, $acc0 2062 mulq $t1 2063 shr \$32, $t0 2064 add $acc0, $acc1 # +=acc[0]<<96 2065 adc $t0, $acc2 2066 adc %rax, $acc3 2067 mov $acc1, %rax 2068 adc \$0, %rdx 2069 2070 ########################################## 2071 # Second iteration 2072 mov $acc1, $t0 2073 shl \$32, $acc1 2074 mov %rdx, $acc0 2075 mulq $t1 2076 shr \$32, $t0 2077 add $acc1, $acc2 2078 adc $t0, $acc3 2079 adc %rax, $acc0 2080 mov $acc2, %rax 2081 adc \$0, %rdx 2082 2083 ########################################## 2084 # Third iteration 2085 mov $acc2, $t0 2086 shl \$32, $acc2 2087 mov %rdx, $acc1 2088 mulq $t1 2089 shr \$32, $t0 2090 add $acc2, $acc3 2091 adc $t0, $acc0 2092 adc %rax, $acc1 2093 mov $acc3, %rax 2094 adc \$0, %rdx 2095 2096 ########################################### 2097 # Last iteration 2098 mov $acc3, $t0 2099 shl \$32, $acc3 2100 mov %rdx, $acc2 2101 mulq $t1 2102 shr \$32, $t0 2103 add $acc3, $acc0 2104 adc $t0, $acc1 2105 adc %rax, $acc2 2106 adc \$0, %rdx 2107 xor $acc3, $acc3 2108 2109 ############################################ 2110 # Add the rest of the acc 2111 add $acc0, $acc4 2112 adc $acc1, $acc5 2113 mov $acc4, $acc0 2114 adc $acc2, $acc6 2115 adc %rdx, $acc7 2116 mov $acc5, $acc1 2117 adc \$0, $acc3 2118 2119 sub \$-1, $acc4 # .Lpoly[0] 2120 mov $acc6, $acc2 2121 sbb $a_ptr, $acc5 # .Lpoly[1] 2122 sbb \$0, $acc6 # .Lpoly[2] 2123 mov $acc7, $t0 2124 sbb $t1, $acc7 # .Lpoly[3] 2125 sbb \$0, $acc3 2126 2127 cmovc $acc0, $acc4 2128 cmovc $acc1, $acc5 2129 mov $acc4, 8*0($r_ptr) 2130 cmovc $acc2, $acc6 2131 mov $acc5, 8*1($r_ptr) 2132 cmovc $t0, $acc7 2133 mov $acc6, 8*2($r_ptr) 2134 mov $acc7, 8*3($r_ptr) 2135 2136 ret 2137.cfi_endproc 2138.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq 2139___ 2140 2141if ($addx) { 2142$code.=<<___; 2143.type __ecp_nistz256_mul_montx,\@abi-omnipotent 2144.align 32 2145__ecp_nistz256_mul_montx: 2146.cfi_startproc 2147 ######################################################################## 2148 # Multiply by b[0] 2149 mulx $acc1, $acc0, $acc1 2150 mulx $acc2, $t0, $acc2 2151 mov \$32, $poly1 2152 xor $acc5, $acc5 # cf=0 2153 mulx $acc3, $t1, $acc3 2154 mov .Lpoly+8*3(%rip), $poly3 2155 adc $t0, $acc1 2156 mulx $acc4, $t0, $acc4 2157 mov $acc0, %rdx 2158 adc $t1, $acc2 2159 shlx $poly1,$acc0,$t1 2160 adc $t0, $acc3 2161 shrx $poly1,$acc0,$t0 2162 adc \$0, $acc4 2163 2164 ######################################################################## 2165 # First reduction step 2166 add $t1, $acc1 2167 adc $t0, $acc2 2168 2169 mulx $poly3, $t0, $t1 2170 mov 8*1($b_ptr), %rdx 2171 adc $t0, $acc3 2172 adc $t1, $acc4 2173 adc \$0, $acc5 2174 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 2175 2176 ######################################################################## 2177 # Multiply by b[1] 2178 mulx 8*0+128($a_ptr), $t0, $t1 2179 adcx $t0, $acc1 2180 adox $t1, $acc2 2181 2182 mulx 8*1+128($a_ptr), $t0, $t1 2183 adcx $t0, $acc2 2184 adox $t1, $acc3 2185 2186 mulx 8*2+128($a_ptr), $t0, $t1 2187 adcx $t0, $acc3 2188 adox $t1, $acc4 2189 2190 mulx 8*3+128($a_ptr), $t0, $t1 2191 mov $acc1, %rdx 2192 adcx $t0, $acc4 2193 shlx $poly1, $acc1, $t0 2194 adox $t1, $acc5 2195 shrx $poly1, $acc1, $t1 2196 2197 adcx $acc0, $acc5 2198 adox $acc0, $acc0 2199 adc \$0, $acc0 2200 2201 ######################################################################## 2202 # Second reduction step 2203 add $t0, $acc2 2204 adc $t1, $acc3 2205 2206 mulx $poly3, $t0, $t1 2207 mov 8*2($b_ptr), %rdx 2208 adc $t0, $acc4 2209 adc $t1, $acc5 2210 adc \$0, $acc0 2211 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 2212 2213 ######################################################################## 2214 # Multiply by b[2] 2215 mulx 8*0+128($a_ptr), $t0, $t1 2216 adcx $t0, $acc2 2217 adox $t1, $acc3 2218 2219 mulx 8*1+128($a_ptr), $t0, $t1 2220 adcx $t0, $acc3 2221 adox $t1, $acc4 2222 2223 mulx 8*2+128($a_ptr), $t0, $t1 2224 adcx $t0, $acc4 2225 adox $t1, $acc5 2226 2227 mulx 8*3+128($a_ptr), $t0, $t1 2228 mov $acc2, %rdx 2229 adcx $t0, $acc5 2230 shlx $poly1, $acc2, $t0 2231 adox $t1, $acc0 2232 shrx $poly1, $acc2, $t1 2233 2234 adcx $acc1, $acc0 2235 adox $acc1, $acc1 2236 adc \$0, $acc1 2237 2238 ######################################################################## 2239 # Third reduction step 2240 add $t0, $acc3 2241 adc $t1, $acc4 2242 2243 mulx $poly3, $t0, $t1 2244 mov 8*3($b_ptr), %rdx 2245 adc $t0, $acc5 2246 adc $t1, $acc0 2247 adc \$0, $acc1 2248 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 2249 2250 ######################################################################## 2251 # Multiply by b[3] 2252 mulx 8*0+128($a_ptr), $t0, $t1 2253 adcx $t0, $acc3 2254 adox $t1, $acc4 2255 2256 mulx 8*1+128($a_ptr), $t0, $t1 2257 adcx $t0, $acc4 2258 adox $t1, $acc5 2259 2260 mulx 8*2+128($a_ptr), $t0, $t1 2261 adcx $t0, $acc5 2262 adox $t1, $acc0 2263 2264 mulx 8*3+128($a_ptr), $t0, $t1 2265 mov $acc3, %rdx 2266 adcx $t0, $acc0 2267 shlx $poly1, $acc3, $t0 2268 adox $t1, $acc1 2269 shrx $poly1, $acc3, $t1 2270 2271 adcx $acc2, $acc1 2272 adox $acc2, $acc2 2273 adc \$0, $acc2 2274 2275 ######################################################################## 2276 # Fourth reduction step 2277 add $t0, $acc4 2278 adc $t1, $acc5 2279 2280 mulx $poly3, $t0, $t1 2281 mov $acc4, $t2 2282 mov .Lpoly+8*1(%rip), $poly1 2283 adc $t0, $acc0 2284 mov $acc5, $t3 2285 adc $t1, $acc1 2286 adc \$0, $acc2 2287 2288 ######################################################################## 2289 # Branch-less conditional subtraction of P 2290 xor %eax, %eax 2291 mov $acc0, $t0 2292 sbb \$-1, $acc4 # .Lpoly[0] 2293 sbb $poly1, $acc5 # .Lpoly[1] 2294 sbb \$0, $acc0 # .Lpoly[2] 2295 mov $acc1, $t1 2296 sbb $poly3, $acc1 # .Lpoly[3] 2297 sbb \$0, $acc2 2298 2299 cmovc $t2, $acc4 2300 cmovc $t3, $acc5 2301 mov $acc4, 8*0($r_ptr) 2302 cmovc $t0, $acc0 2303 mov $acc5, 8*1($r_ptr) 2304 cmovc $t1, $acc1 2305 mov $acc0, 8*2($r_ptr) 2306 mov $acc1, 8*3($r_ptr) 2307 2308 ret 2309.cfi_endproc 2310.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx 2311 2312.type __ecp_nistz256_sqr_montx,\@abi-omnipotent 2313.align 32 2314__ecp_nistz256_sqr_montx: 2315.cfi_startproc 2316 mulx $acc6, $acc1, $acc2 # a[0]*a[1] 2317 mulx $acc7, $t0, $acc3 # a[0]*a[2] 2318 xor %eax, %eax 2319 adc $t0, $acc2 2320 mulx $acc0, $t1, $acc4 # a[0]*a[3] 2321 mov $acc6, %rdx 2322 adc $t1, $acc3 2323 adc \$0, $acc4 2324 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 2325 2326 ################################# 2327 mulx $acc7, $t0, $t1 # a[1]*a[2] 2328 adcx $t0, $acc3 2329 adox $t1, $acc4 2330 2331 mulx $acc0, $t0, $t1 # a[1]*a[3] 2332 mov $acc7, %rdx 2333 adcx $t0, $acc4 2334 adox $t1, $acc5 2335 adc \$0, $acc5 2336 2337 ################################# 2338 mulx $acc0, $t0, $acc6 # a[2]*a[3] 2339 mov 8*0+128($a_ptr), %rdx 2340 xor $acc7, $acc7 # $acc7=0,cf=0,of=0 2341 adcx $acc1, $acc1 # acc1:6<<1 2342 adox $t0, $acc5 2343 adcx $acc2, $acc2 2344 adox $acc7, $acc6 # of=0 2345 2346 mulx %rdx, $acc0, $t1 2347 mov 8*1+128($a_ptr), %rdx 2348 adcx $acc3, $acc3 2349 adox $t1, $acc1 2350 adcx $acc4, $acc4 2351 mulx %rdx, $t0, $t4 2352 mov 8*2+128($a_ptr), %rdx 2353 adcx $acc5, $acc5 2354 adox $t0, $acc2 2355 adcx $acc6, $acc6 2356 .byte 0x67 2357 mulx %rdx, $t0, $t1 2358 mov 8*3+128($a_ptr), %rdx 2359 adox $t4, $acc3 2360 adcx $acc7, $acc7 2361 adox $t0, $acc4 2362 mov \$32, $a_ptr 2363 adox $t1, $acc5 2364 .byte 0x67,0x67 2365 mulx %rdx, $t0, $t4 2366 mov .Lpoly+8*3(%rip), %rdx 2367 adox $t0, $acc6 2368 shlx $a_ptr, $acc0, $t0 2369 adox $t4, $acc7 2370 shrx $a_ptr, $acc0, $t4 2371 mov %rdx,$t1 2372 2373 # reduction step 1 2374 add $t0, $acc1 2375 adc $t4, $acc2 2376 2377 mulx $acc0, $t0, $acc0 2378 adc $t0, $acc3 2379 shlx $a_ptr, $acc1, $t0 2380 adc \$0, $acc0 2381 shrx $a_ptr, $acc1, $t4 2382 2383 # reduction step 2 2384 add $t0, $acc2 2385 adc $t4, $acc3 2386 2387 mulx $acc1, $t0, $acc1 2388 adc $t0, $acc0 2389 shlx $a_ptr, $acc2, $t0 2390 adc \$0, $acc1 2391 shrx $a_ptr, $acc2, $t4 2392 2393 # reduction step 3 2394 add $t0, $acc3 2395 adc $t4, $acc0 2396 2397 mulx $acc2, $t0, $acc2 2398 adc $t0, $acc1 2399 shlx $a_ptr, $acc3, $t0 2400 adc \$0, $acc2 2401 shrx $a_ptr, $acc3, $t4 2402 2403 # reduction step 4 2404 add $t0, $acc0 2405 adc $t4, $acc1 2406 2407 mulx $acc3, $t0, $acc3 2408 adc $t0, $acc2 2409 adc \$0, $acc3 2410 2411 xor $t3, $t3 2412 add $acc0, $acc4 # accumulate upper half 2413 mov .Lpoly+8*1(%rip), $a_ptr 2414 adc $acc1, $acc5 2415 mov $acc4, $acc0 2416 adc $acc2, $acc6 2417 adc $acc3, $acc7 2418 mov $acc5, $acc1 2419 adc \$0, $t3 2420 2421 sub \$-1, $acc4 # .Lpoly[0] 2422 mov $acc6, $acc2 2423 sbb $a_ptr, $acc5 # .Lpoly[1] 2424 sbb \$0, $acc6 # .Lpoly[2] 2425 mov $acc7, $acc3 2426 sbb $t1, $acc7 # .Lpoly[3] 2427 sbb \$0, $t3 2428 2429 cmovc $acc0, $acc4 2430 cmovc $acc1, $acc5 2431 mov $acc4, 8*0($r_ptr) 2432 cmovc $acc2, $acc6 2433 mov $acc5, 8*1($r_ptr) 2434 cmovc $acc3, $acc7 2435 mov $acc6, 8*2($r_ptr) 2436 mov $acc7, 8*3($r_ptr) 2437 2438 ret 2439.cfi_endproc 2440.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx 2441___ 2442} 2443} 2444{ 2445my ($r_ptr,$in_ptr)=("%rdi","%rsi"); 2446my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); 2447my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); 2448 2449$code.=<<___; 2450################################################################################ 2451# void ecp_nistz256_from_mont( 2452# uint64_t res[4], 2453# uint64_t in[4]); 2454# This one performs Montgomery multiplication by 1, so we only need the reduction 2455 2456.globl ecp_nistz256_from_mont 2457.type ecp_nistz256_from_mont,\@function,2 2458.align 32 2459ecp_nistz256_from_mont: 2460.cfi_startproc 2461 push %r12 2462.cfi_push %r12 2463 push %r13 2464.cfi_push %r13 2465.Lfrom_body: 2466 2467 mov 8*0($in_ptr), %rax 2468 mov .Lpoly+8*3(%rip), $t2 2469 mov 8*1($in_ptr), $acc1 2470 mov 8*2($in_ptr), $acc2 2471 mov 8*3($in_ptr), $acc3 2472 mov %rax, $acc0 2473 mov .Lpoly+8*1(%rip), $t1 2474 2475 ######################################### 2476 # First iteration 2477 mov %rax, $t0 2478 shl \$32, $acc0 2479 mulq $t2 2480 shr \$32, $t0 2481 add $acc0, $acc1 2482 adc $t0, $acc2 2483 adc %rax, $acc3 2484 mov $acc1, %rax 2485 adc \$0, %rdx 2486 2487 ######################################### 2488 # Second iteration 2489 mov $acc1, $t0 2490 shl \$32, $acc1 2491 mov %rdx, $acc0 2492 mulq $t2 2493 shr \$32, $t0 2494 add $acc1, $acc2 2495 adc $t0, $acc3 2496 adc %rax, $acc0 2497 mov $acc2, %rax 2498 adc \$0, %rdx 2499 2500 ########################################## 2501 # Third iteration 2502 mov $acc2, $t0 2503 shl \$32, $acc2 2504 mov %rdx, $acc1 2505 mulq $t2 2506 shr \$32, $t0 2507 add $acc2, $acc3 2508 adc $t0, $acc0 2509 adc %rax, $acc1 2510 mov $acc3, %rax 2511 adc \$0, %rdx 2512 2513 ########################################### 2514 # Last iteration 2515 mov $acc3, $t0 2516 shl \$32, $acc3 2517 mov %rdx, $acc2 2518 mulq $t2 2519 shr \$32, $t0 2520 add $acc3, $acc0 2521 adc $t0, $acc1 2522 mov $acc0, $t0 2523 adc %rax, $acc2 2524 mov $acc1, $in_ptr 2525 adc \$0, %rdx 2526 2527 ########################################### 2528 # Branch-less conditional subtraction 2529 sub \$-1, $acc0 2530 mov $acc2, %rax 2531 sbb $t1, $acc1 2532 sbb \$0, $acc2 2533 mov %rdx, $acc3 2534 sbb $t2, %rdx 2535 sbb $t2, $t2 2536 2537 cmovnz $t0, $acc0 2538 cmovnz $in_ptr, $acc1 2539 mov $acc0, 8*0($r_ptr) 2540 cmovnz %rax, $acc2 2541 mov $acc1, 8*1($r_ptr) 2542 cmovz %rdx, $acc3 2543 mov $acc2, 8*2($r_ptr) 2544 mov $acc3, 8*3($r_ptr) 2545 2546 mov 0(%rsp),%r13 2547.cfi_restore %r13 2548 mov 8(%rsp),%r12 2549.cfi_restore %r12 2550 lea 16(%rsp),%rsp 2551.cfi_adjust_cfa_offset -16 2552.Lfrom_epilogue: 2553 ret 2554.cfi_endproc 2555.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 2556___ 2557} 2558{ 2559my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2560my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); 2561my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); 2562my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); 2563 2564$code.=<<___; 2565################################################################################ 2566# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index); 2567.globl ecp_nistz256_scatter_w5 2568.type ecp_nistz256_scatter_w5,\@abi-omnipotent 2569.align 32 2570ecp_nistz256_scatter_w5: 2571.cfi_startproc 2572 lea -3($index,$index,2), $index 2573 movdqa 0x00($in_t), %xmm0 2574 shl \$5, $index 2575 movdqa 0x10($in_t), %xmm1 2576 movdqa 0x20($in_t), %xmm2 2577 movdqa 0x30($in_t), %xmm3 2578 movdqa 0x40($in_t), %xmm4 2579 movdqa 0x50($in_t), %xmm5 2580 movdqa %xmm0, 0x00($val,$index) 2581 movdqa %xmm1, 0x10($val,$index) 2582 movdqa %xmm2, 0x20($val,$index) 2583 movdqa %xmm3, 0x30($val,$index) 2584 movdqa %xmm4, 0x40($val,$index) 2585 movdqa %xmm5, 0x50($val,$index) 2586 2587 ret 2588.cfi_endproc 2589.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 2590 2591################################################################################ 2592# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index); 2593.globl ecp_nistz256_gather_w5 2594.type ecp_nistz256_gather_w5,\@abi-omnipotent 2595.align 32 2596ecp_nistz256_gather_w5: 2597.cfi_startproc 2598___ 2599$code.=<<___ if ($avx>1); 2600 mov OPENSSL_ia32cap_P+8(%rip), %eax 2601 test \$`1<<5`, %eax 2602 jnz .Lavx2_gather_w5 2603___ 2604$code.=<<___ if ($win64); 2605 lea -0x88(%rsp), %rax 2606.LSEH_begin_ecp_nistz256_gather_w5: 2607 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2608 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2609 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2610 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2611 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2612 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2613 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2614 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2615 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2616 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2617 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2618___ 2619$code.=<<___; 2620 movdqa .LOne(%rip), $ONE 2621 movd $index, $INDEX 2622 2623 pxor $Ra, $Ra 2624 pxor $Rb, $Rb 2625 pxor $Rc, $Rc 2626 pxor $Rd, $Rd 2627 pxor $Re, $Re 2628 pxor $Rf, $Rf 2629 2630 movdqa $ONE, $M0 2631 pshufd \$0, $INDEX, $INDEX 2632 2633 mov \$16, %rax 2634.Lselect_loop_sse_w5: 2635 2636 movdqa $M0, $TMP0 2637 paddd $ONE, $M0 2638 pcmpeqd $INDEX, $TMP0 2639 2640 movdqa 16*0($in_t), $T0a 2641 movdqa 16*1($in_t), $T0b 2642 movdqa 16*2($in_t), $T0c 2643 movdqa 16*3($in_t), $T0d 2644 movdqa 16*4($in_t), $T0e 2645 movdqa 16*5($in_t), $T0f 2646 lea 16*6($in_t), $in_t 2647 2648 pand $TMP0, $T0a 2649 pand $TMP0, $T0b 2650 por $T0a, $Ra 2651 pand $TMP0, $T0c 2652 por $T0b, $Rb 2653 pand $TMP0, $T0d 2654 por $T0c, $Rc 2655 pand $TMP0, $T0e 2656 por $T0d, $Rd 2657 pand $TMP0, $T0f 2658 por $T0e, $Re 2659 por $T0f, $Rf 2660 2661 dec %rax 2662 jnz .Lselect_loop_sse_w5 2663 2664 movdqu $Ra, 16*0($val) 2665 movdqu $Rb, 16*1($val) 2666 movdqu $Rc, 16*2($val) 2667 movdqu $Rd, 16*3($val) 2668 movdqu $Re, 16*4($val) 2669 movdqu $Rf, 16*5($val) 2670___ 2671$code.=<<___ if ($win64); 2672 movaps (%rsp), %xmm6 2673 movaps 0x10(%rsp), %xmm7 2674 movaps 0x20(%rsp), %xmm8 2675 movaps 0x30(%rsp), %xmm9 2676 movaps 0x40(%rsp), %xmm10 2677 movaps 0x50(%rsp), %xmm11 2678 movaps 0x60(%rsp), %xmm12 2679 movaps 0x70(%rsp), %xmm13 2680 movaps 0x80(%rsp), %xmm14 2681 movaps 0x90(%rsp), %xmm15 2682 lea 0xa8(%rsp), %rsp 2683___ 2684$code.=<<___; 2685 ret 2686.cfi_endproc 2687.LSEH_end_ecp_nistz256_gather_w5: 2688.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 2689 2690################################################################################ 2691# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); 2692.globl ecp_nistz256_scatter_w7 2693.type ecp_nistz256_scatter_w7,\@abi-omnipotent 2694.align 32 2695ecp_nistz256_scatter_w7: 2696.cfi_startproc 2697 movdqu 0x00($in_t), %xmm0 2698 shl \$6, $index 2699 movdqu 0x10($in_t), %xmm1 2700 movdqu 0x20($in_t), %xmm2 2701 movdqu 0x30($in_t), %xmm3 2702 movdqa %xmm0, 0x00($val,$index) 2703 movdqa %xmm1, 0x10($val,$index) 2704 movdqa %xmm2, 0x20($val,$index) 2705 movdqa %xmm3, 0x30($val,$index) 2706 2707 ret 2708.cfi_endproc 2709.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 2710 2711################################################################################ 2712# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); 2713.globl ecp_nistz256_gather_w7 2714.type ecp_nistz256_gather_w7,\@abi-omnipotent 2715.align 32 2716ecp_nistz256_gather_w7: 2717.cfi_startproc 2718___ 2719$code.=<<___ if ($avx>1); 2720 mov OPENSSL_ia32cap_P+8(%rip), %eax 2721 test \$`1<<5`, %eax 2722 jnz .Lavx2_gather_w7 2723___ 2724$code.=<<___ if ($win64); 2725 lea -0x88(%rsp), %rax 2726.LSEH_begin_ecp_nistz256_gather_w7: 2727 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp 2728 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) 2729 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) 2730 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) 2731 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) 2732 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) 2733 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) 2734 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) 2735 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) 2736 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) 2737 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) 2738___ 2739$code.=<<___; 2740 movdqa .LOne(%rip), $M0 2741 movd $index, $INDEX 2742 2743 pxor $Ra, $Ra 2744 pxor $Rb, $Rb 2745 pxor $Rc, $Rc 2746 pxor $Rd, $Rd 2747 2748 movdqa $M0, $ONE 2749 pshufd \$0, $INDEX, $INDEX 2750 mov \$64, %rax 2751 2752.Lselect_loop_sse_w7: 2753 movdqa $M0, $TMP0 2754 paddd $ONE, $M0 2755 movdqa 16*0($in_t), $T0a 2756 movdqa 16*1($in_t), $T0b 2757 pcmpeqd $INDEX, $TMP0 2758 movdqa 16*2($in_t), $T0c 2759 movdqa 16*3($in_t), $T0d 2760 lea 16*4($in_t), $in_t 2761 2762 pand $TMP0, $T0a 2763 pand $TMP0, $T0b 2764 por $T0a, $Ra 2765 pand $TMP0, $T0c 2766 por $T0b, $Rb 2767 pand $TMP0, $T0d 2768 por $T0c, $Rc 2769 prefetcht0 255($in_t) 2770 por $T0d, $Rd 2771 2772 dec %rax 2773 jnz .Lselect_loop_sse_w7 2774 2775 movdqu $Ra, 16*0($val) 2776 movdqu $Rb, 16*1($val) 2777 movdqu $Rc, 16*2($val) 2778 movdqu $Rd, 16*3($val) 2779___ 2780$code.=<<___ if ($win64); 2781 movaps (%rsp), %xmm6 2782 movaps 0x10(%rsp), %xmm7 2783 movaps 0x20(%rsp), %xmm8 2784 movaps 0x30(%rsp), %xmm9 2785 movaps 0x40(%rsp), %xmm10 2786 movaps 0x50(%rsp), %xmm11 2787 movaps 0x60(%rsp), %xmm12 2788 movaps 0x70(%rsp), %xmm13 2789 movaps 0x80(%rsp), %xmm14 2790 movaps 0x90(%rsp), %xmm15 2791 lea 0xa8(%rsp), %rsp 2792___ 2793$code.=<<___; 2794 ret 2795.cfi_endproc 2796.LSEH_end_ecp_nistz256_gather_w7: 2797.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 2798___ 2799} 2800if ($avx>1) { 2801my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2802my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); 2803my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); 2804my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); 2805 2806$code.=<<___; 2807################################################################################ 2808# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index); 2809.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent 2810.align 32 2811ecp_nistz256_avx2_gather_w5: 2812.cfi_startproc 2813.Lavx2_gather_w5: 2814 vzeroupper 2815___ 2816$code.=<<___ if ($win64); 2817 lea -0x88(%rsp), %rax 2818 mov %rsp,%r11 2819.LSEH_begin_ecp_nistz256_avx2_gather_w5: 2820 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2821 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2822 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2823 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2824 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2825 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2826 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2827 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2828 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2829 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2830 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2831___ 2832$code.=<<___; 2833 vmovdqa .LTwo(%rip), $TWO 2834 2835 vpxor $Ra, $Ra, $Ra 2836 vpxor $Rb, $Rb, $Rb 2837 vpxor $Rc, $Rc, $Rc 2838 2839 vmovdqa .LOne(%rip), $M0 2840 vmovdqa .LTwo(%rip), $M1 2841 2842 vmovd $index, %xmm1 2843 vpermd $INDEX, $Ra, $INDEX 2844 2845 mov \$8, %rax 2846.Lselect_loop_avx2_w5: 2847 2848 vmovdqa 32*0($in_t), $T0a 2849 vmovdqa 32*1($in_t), $T0b 2850 vmovdqa 32*2($in_t), $T0c 2851 2852 vmovdqa 32*3($in_t), $T1a 2853 vmovdqa 32*4($in_t), $T1b 2854 vmovdqa 32*5($in_t), $T1c 2855 2856 vpcmpeqd $INDEX, $M0, $TMP0 2857 vpcmpeqd $INDEX, $M1, $TMP1 2858 2859 vpaddd $TWO, $M0, $M0 2860 vpaddd $TWO, $M1, $M1 2861 lea 32*6($in_t), $in_t 2862 2863 vpand $TMP0, $T0a, $T0a 2864 vpand $TMP0, $T0b, $T0b 2865 vpand $TMP0, $T0c, $T0c 2866 vpand $TMP1, $T1a, $T1a 2867 vpand $TMP1, $T1b, $T1b 2868 vpand $TMP1, $T1c, $T1c 2869 2870 vpxor $T0a, $Ra, $Ra 2871 vpxor $T0b, $Rb, $Rb 2872 vpxor $T0c, $Rc, $Rc 2873 vpxor $T1a, $Ra, $Ra 2874 vpxor $T1b, $Rb, $Rb 2875 vpxor $T1c, $Rc, $Rc 2876 2877 dec %rax 2878 jnz .Lselect_loop_avx2_w5 2879 2880 vmovdqu $Ra, 32*0($val) 2881 vmovdqu $Rb, 32*1($val) 2882 vmovdqu $Rc, 32*2($val) 2883 vzeroupper 2884___ 2885$code.=<<___ if ($win64); 2886 movaps (%rsp), %xmm6 2887 movaps 0x10(%rsp), %xmm7 2888 movaps 0x20(%rsp), %xmm8 2889 movaps 0x30(%rsp), %xmm9 2890 movaps 0x40(%rsp), %xmm10 2891 movaps 0x50(%rsp), %xmm11 2892 movaps 0x60(%rsp), %xmm12 2893 movaps 0x70(%rsp), %xmm13 2894 movaps 0x80(%rsp), %xmm14 2895 movaps 0x90(%rsp), %xmm15 2896 lea (%r11), %rsp 2897___ 2898$code.=<<___; 2899 ret 2900.cfi_endproc 2901.LSEH_end_ecp_nistz256_avx2_gather_w5: 2902.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 2903___ 2904} 2905if ($avx>1) { 2906my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); 2907my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); 2908my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); 2909my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); 2910my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); 2911 2912$code.=<<___; 2913 2914################################################################################ 2915# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index); 2916.globl ecp_nistz256_avx2_gather_w7 2917.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent 2918.align 32 2919ecp_nistz256_avx2_gather_w7: 2920.cfi_startproc 2921.Lavx2_gather_w7: 2922 vzeroupper 2923___ 2924$code.=<<___ if ($win64); 2925 mov %rsp,%r11 2926 lea -0x88(%rsp), %rax 2927.LSEH_begin_ecp_nistz256_avx2_gather_w7: 2928 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp 2929 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 2930 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) 2931 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) 2932 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) 2933 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) 2934 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) 2935 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) 2936 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) 2937 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) 2938 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) 2939___ 2940$code.=<<___; 2941 vmovdqa .LThree(%rip), $THREE 2942 2943 vpxor $Ra, $Ra, $Ra 2944 vpxor $Rb, $Rb, $Rb 2945 2946 vmovdqa .LOne(%rip), $M0 2947 vmovdqa .LTwo(%rip), $M1 2948 vmovdqa .LThree(%rip), $M2 2949 2950 vmovd $index, %xmm1 2951 vpermd $INDEX, $Ra, $INDEX 2952 # Skip index = 0, because it is implicitly the point at infinity 2953 2954 mov \$21, %rax 2955.Lselect_loop_avx2_w7: 2956 2957 vmovdqa 32*0($in_t), $T0a 2958 vmovdqa 32*1($in_t), $T0b 2959 2960 vmovdqa 32*2($in_t), $T1a 2961 vmovdqa 32*3($in_t), $T1b 2962 2963 vmovdqa 32*4($in_t), $T2a 2964 vmovdqa 32*5($in_t), $T2b 2965 2966 vpcmpeqd $INDEX, $M0, $TMP0 2967 vpcmpeqd $INDEX, $M1, $TMP1 2968 vpcmpeqd $INDEX, $M2, $TMP2 2969 2970 vpaddd $THREE, $M0, $M0 2971 vpaddd $THREE, $M1, $M1 2972 vpaddd $THREE, $M2, $M2 2973 lea 32*6($in_t), $in_t 2974 2975 vpand $TMP0, $T0a, $T0a 2976 vpand $TMP0, $T0b, $T0b 2977 vpand $TMP1, $T1a, $T1a 2978 vpand $TMP1, $T1b, $T1b 2979 vpand $TMP2, $T2a, $T2a 2980 vpand $TMP2, $T2b, $T2b 2981 2982 vpxor $T0a, $Ra, $Ra 2983 vpxor $T0b, $Rb, $Rb 2984 vpxor $T1a, $Ra, $Ra 2985 vpxor $T1b, $Rb, $Rb 2986 vpxor $T2a, $Ra, $Ra 2987 vpxor $T2b, $Rb, $Rb 2988 2989 dec %rax 2990 jnz .Lselect_loop_avx2_w7 2991 2992 2993 vmovdqa 32*0($in_t), $T0a 2994 vmovdqa 32*1($in_t), $T0b 2995 2996 vpcmpeqd $INDEX, $M0, $TMP0 2997 2998 vpand $TMP0, $T0a, $T0a 2999 vpand $TMP0, $T0b, $T0b 3000 3001 vpxor $T0a, $Ra, $Ra 3002 vpxor $T0b, $Rb, $Rb 3003 3004 vmovdqu $Ra, 32*0($val) 3005 vmovdqu $Rb, 32*1($val) 3006 vzeroupper 3007___ 3008$code.=<<___ if ($win64); 3009 movaps (%rsp), %xmm6 3010 movaps 0x10(%rsp), %xmm7 3011 movaps 0x20(%rsp), %xmm8 3012 movaps 0x30(%rsp), %xmm9 3013 movaps 0x40(%rsp), %xmm10 3014 movaps 0x50(%rsp), %xmm11 3015 movaps 0x60(%rsp), %xmm12 3016 movaps 0x70(%rsp), %xmm13 3017 movaps 0x80(%rsp), %xmm14 3018 movaps 0x90(%rsp), %xmm15 3019 lea (%r11), %rsp 3020___ 3021$code.=<<___; 3022 ret 3023.cfi_endproc 3024.LSEH_end_ecp_nistz256_avx2_gather_w7: 3025.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 3026___ 3027} else { 3028$code.=<<___; 3029.globl ecp_nistz256_avx2_gather_w7 3030.type ecp_nistz256_avx2_gather_w7,\@function,3 3031.align 32 3032ecp_nistz256_avx2_gather_w7: 3033.cfi_startproc 3034 .byte 0x0f,0x0b # ud2 3035 ret 3036.cfi_endproc 3037.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 3038___ 3039} 3040{{{ 3041######################################################################## 3042# This block implements higher level point_double, point_add and 3043# point_add_affine. The key to performance in this case is to allow 3044# out-of-order execution logic to overlap computations from next step 3045# with tail processing from current step. By using tailored calling 3046# sequence we minimize inter-step overhead to give processor better 3047# shot at overlapping operations... 3048# 3049# You will notice that input data is copied to stack. Trouble is that 3050# there are no registers to spare for holding original pointers and 3051# reloading them, pointers, would create undesired dependencies on 3052# effective addresses calculation paths. In other words it's too done 3053# to favour out-of-order execution logic. 3054# <appro@openssl.org> 3055 3056my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); 3057my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); 3058my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); 3059my ($poly1,$poly3)=($acc6,$acc7); 3060 3061sub load_for_mul () { 3062my ($a,$b,$src0) = @_; 3063my $bias = $src0 eq "%rax" ? 0 : -128; 3064 3065" mov $b, $src0 3066 lea $b, $b_ptr 3067 mov 8*0+$a, $acc1 3068 mov 8*1+$a, $acc2 3069 lea $bias+$a, $a_ptr 3070 mov 8*2+$a, $acc3 3071 mov 8*3+$a, $acc4" 3072} 3073 3074sub load_for_sqr () { 3075my ($a,$src0) = @_; 3076my $bias = $src0 eq "%rax" ? 0 : -128; 3077 3078" mov 8*0+$a, $src0 3079 mov 8*1+$a, $acc6 3080 lea $bias+$a, $a_ptr 3081 mov 8*2+$a, $acc7 3082 mov 8*3+$a, $acc0" 3083} 3084 3085 { 3086######################################################################## 3087# operate in 4-5-0-1 "name space" that matches multiplication output 3088# 3089my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3090 3091$code.=<<___; 3092.type __ecp_nistz256_add_toq,\@abi-omnipotent 3093.align 32 3094__ecp_nistz256_add_toq: 3095.cfi_startproc 3096 xor $t4,$t4 3097 add 8*0($b_ptr), $a0 3098 adc 8*1($b_ptr), $a1 3099 mov $a0, $t0 3100 adc 8*2($b_ptr), $a2 3101 adc 8*3($b_ptr), $a3 3102 mov $a1, $t1 3103 adc \$0, $t4 3104 3105 sub \$-1, $a0 3106 mov $a2, $t2 3107 sbb $poly1, $a1 3108 sbb \$0, $a2 3109 mov $a3, $t3 3110 sbb $poly3, $a3 3111 sbb \$0, $t4 3112 3113 cmovc $t0, $a0 3114 cmovc $t1, $a1 3115 mov $a0, 8*0($r_ptr) 3116 cmovc $t2, $a2 3117 mov $a1, 8*1($r_ptr) 3118 cmovc $t3, $a3 3119 mov $a2, 8*2($r_ptr) 3120 mov $a3, 8*3($r_ptr) 3121 3122 ret 3123.cfi_endproc 3124.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq 3125 3126.type __ecp_nistz256_sub_fromq,\@abi-omnipotent 3127.align 32 3128__ecp_nistz256_sub_fromq: 3129.cfi_startproc 3130 sub 8*0($b_ptr), $a0 3131 sbb 8*1($b_ptr), $a1 3132 mov $a0, $t0 3133 sbb 8*2($b_ptr), $a2 3134 sbb 8*3($b_ptr), $a3 3135 mov $a1, $t1 3136 sbb $t4, $t4 3137 3138 add \$-1, $a0 3139 mov $a2, $t2 3140 adc $poly1, $a1 3141 adc \$0, $a2 3142 mov $a3, $t3 3143 adc $poly3, $a3 3144 test $t4, $t4 3145 3146 cmovz $t0, $a0 3147 cmovz $t1, $a1 3148 mov $a0, 8*0($r_ptr) 3149 cmovz $t2, $a2 3150 mov $a1, 8*1($r_ptr) 3151 cmovz $t3, $a3 3152 mov $a2, 8*2($r_ptr) 3153 mov $a3, 8*3($r_ptr) 3154 3155 ret 3156.cfi_endproc 3157.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq 3158 3159.type __ecp_nistz256_subq,\@abi-omnipotent 3160.align 32 3161__ecp_nistz256_subq: 3162.cfi_startproc 3163 sub $a0, $t0 3164 sbb $a1, $t1 3165 mov $t0, $a0 3166 sbb $a2, $t2 3167 sbb $a3, $t3 3168 mov $t1, $a1 3169 sbb $t4, $t4 3170 3171 add \$-1, $t0 3172 mov $t2, $a2 3173 adc $poly1, $t1 3174 adc \$0, $t2 3175 mov $t3, $a3 3176 adc $poly3, $t3 3177 test $t4, $t4 3178 3179 cmovnz $t0, $a0 3180 cmovnz $t1, $a1 3181 cmovnz $t2, $a2 3182 cmovnz $t3, $a3 3183 3184 ret 3185.cfi_endproc 3186.size __ecp_nistz256_subq,.-__ecp_nistz256_subq 3187 3188.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent 3189.align 32 3190__ecp_nistz256_mul_by_2q: 3191.cfi_startproc 3192 xor $t4, $t4 3193 add $a0, $a0 # a0:a3+a0:a3 3194 adc $a1, $a1 3195 mov $a0, $t0 3196 adc $a2, $a2 3197 adc $a3, $a3 3198 mov $a1, $t1 3199 adc \$0, $t4 3200 3201 sub \$-1, $a0 3202 mov $a2, $t2 3203 sbb $poly1, $a1 3204 sbb \$0, $a2 3205 mov $a3, $t3 3206 sbb $poly3, $a3 3207 sbb \$0, $t4 3208 3209 cmovc $t0, $a0 3210 cmovc $t1, $a1 3211 mov $a0, 8*0($r_ptr) 3212 cmovc $t2, $a2 3213 mov $a1, 8*1($r_ptr) 3214 cmovc $t3, $a3 3215 mov $a2, 8*2($r_ptr) 3216 mov $a3, 8*3($r_ptr) 3217 3218 ret 3219.cfi_endproc 3220.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q 3221___ 3222 } 3223sub gen_double () { 3224 my $x = shift; 3225 my ($src0,$sfx,$bias); 3226 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 3227 3228 if ($x ne "x") { 3229 $src0 = "%rax"; 3230 $sfx = ""; 3231 $bias = 0; 3232 3233$code.=<<___; 3234.globl ecp_nistz256_point_double 3235.type ecp_nistz256_point_double,\@function,2 3236.align 32 3237ecp_nistz256_point_double: 3238.cfi_startproc 3239___ 3240$code.=<<___ if ($addx); 3241 mov \$0x80100, %ecx 3242 and OPENSSL_ia32cap_P+8(%rip), %ecx 3243 cmp \$0x80100, %ecx 3244 je .Lpoint_doublex 3245___ 3246 } else { 3247 $src0 = "%rdx"; 3248 $sfx = "x"; 3249 $bias = 128; 3250 3251$code.=<<___; 3252.type ecp_nistz256_point_doublex,\@function,2 3253.align 32 3254ecp_nistz256_point_doublex: 3255.cfi_startproc 3256.Lpoint_doublex: 3257___ 3258 } 3259$code.=<<___; 3260 push %rbp 3261.cfi_push %rbp 3262 push %rbx 3263.cfi_push %rbx 3264 push %r12 3265.cfi_push %r12 3266 push %r13 3267.cfi_push %r13 3268 push %r14 3269.cfi_push %r14 3270 push %r15 3271.cfi_push %r15 3272 sub \$32*5+8, %rsp 3273.cfi_adjust_cfa_offset 32*5+8 3274.Lpoint_double${x}_body: 3275 3276.Lpoint_double_shortcut$x: 3277 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x 3278 mov $a_ptr, $b_ptr # backup copy 3279 movdqu 0x10($a_ptr), %xmm1 3280 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order 3281 mov 0x20+8*1($a_ptr), $acc5 3282 mov 0x20+8*2($a_ptr), $acc0 3283 mov 0x20+8*3($a_ptr), $acc1 3284 mov .Lpoly+8*1(%rip), $poly1 3285 mov .Lpoly+8*3(%rip), $poly3 3286 movdqa %xmm0, $in_x(%rsp) 3287 movdqa %xmm1, $in_x+0x10(%rsp) 3288 lea 0x20($r_ptr), $acc2 3289 lea 0x40($r_ptr), $acc3 3290 movq $r_ptr, %xmm0 3291 movq $acc2, %xmm1 3292 movq $acc3, %xmm2 3293 3294 lea $S(%rsp), $r_ptr 3295 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); 3296 3297 mov 0x40+8*0($a_ptr), $src0 3298 mov 0x40+8*1($a_ptr), $acc6 3299 mov 0x40+8*2($a_ptr), $acc7 3300 mov 0x40+8*3($a_ptr), $acc0 3301 lea 0x40-$bias($a_ptr), $a_ptr 3302 lea $Zsqr(%rsp), $r_ptr 3303 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); 3304 3305 `&load_for_sqr("$S(%rsp)", "$src0")` 3306 lea $S(%rsp), $r_ptr 3307 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); 3308 3309 mov 0x20($b_ptr), $src0 # $b_ptr is still valid 3310 mov 0x40+8*0($b_ptr), $acc1 3311 mov 0x40+8*1($b_ptr), $acc2 3312 mov 0x40+8*2($b_ptr), $acc3 3313 mov 0x40+8*3($b_ptr), $acc4 3314 lea 0x40-$bias($b_ptr), $a_ptr 3315 lea 0x20($b_ptr), $b_ptr 3316 movq %xmm2, $r_ptr 3317 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); 3318 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); 3319 3320 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 3321 mov $in_x+8*1(%rsp), $acc5 3322 lea $Zsqr(%rsp), $b_ptr 3323 mov $in_x+8*2(%rsp), $acc0 3324 mov $in_x+8*3(%rsp), $acc1 3325 lea $M(%rsp), $r_ptr 3326 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); 3327 3328 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order 3329 mov $in_x+8*1(%rsp), $acc5 3330 lea $Zsqr(%rsp), $b_ptr 3331 mov $in_x+8*2(%rsp), $acc0 3332 mov $in_x+8*3(%rsp), $acc1 3333 lea $Zsqr(%rsp), $r_ptr 3334 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); 3335 3336 `&load_for_sqr("$S(%rsp)", "$src0")` 3337 movq %xmm1, $r_ptr 3338 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); 3339___ 3340{ 3341######## ecp_nistz256_div_by_2(res_y, res_y); ########################## 3342# operate in 4-5-6-7 "name space" that matches squaring output 3343# 3344my ($poly1,$poly3)=($a_ptr,$t1); 3345my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); 3346 3347$code.=<<___; 3348 xor $t4, $t4 3349 mov $a0, $t0 3350 add \$-1, $a0 3351 mov $a1, $t1 3352 adc $poly1, $a1 3353 mov $a2, $t2 3354 adc \$0, $a2 3355 mov $a3, $t3 3356 adc $poly3, $a3 3357 adc \$0, $t4 3358 xor $a_ptr, $a_ptr # borrow $a_ptr 3359 test \$1, $t0 3360 3361 cmovz $t0, $a0 3362 cmovz $t1, $a1 3363 cmovz $t2, $a2 3364 cmovz $t3, $a3 3365 cmovz $a_ptr, $t4 3366 3367 mov $a1, $t0 # a0:a3>>1 3368 shr \$1, $a0 3369 shl \$63, $t0 3370 mov $a2, $t1 3371 shr \$1, $a1 3372 or $t0, $a0 3373 shl \$63, $t1 3374 mov $a3, $t2 3375 shr \$1, $a2 3376 or $t1, $a1 3377 shl \$63, $t2 3378 mov $a0, 8*0($r_ptr) 3379 shr \$1, $a3 3380 mov $a1, 8*1($r_ptr) 3381 shl \$63, $t4 3382 or $t2, $a2 3383 or $t4, $a3 3384 mov $a2, 8*2($r_ptr) 3385 mov $a3, 8*3($r_ptr) 3386___ 3387} 3388$code.=<<___; 3389 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` 3390 lea $M(%rsp), $r_ptr 3391 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); 3392 3393 lea $tmp0(%rsp), $r_ptr 3394 call __ecp_nistz256_mul_by_2$x 3395 3396 lea $M(%rsp), $b_ptr 3397 lea $M(%rsp), $r_ptr 3398 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); 3399 3400 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` 3401 lea $S(%rsp), $r_ptr 3402 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); 3403 3404 lea $tmp0(%rsp), $r_ptr 3405 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); 3406 3407 `&load_for_sqr("$M(%rsp)", "$src0")` 3408 movq %xmm0, $r_ptr 3409 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); 3410 3411 lea $tmp0(%rsp), $b_ptr 3412 mov $acc6, $acc0 # harmonize sqr output and sub input 3413 mov $acc7, $acc1 3414 mov $a_ptr, $poly1 3415 mov $t1, $poly3 3416 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); 3417 3418 mov $S+8*0(%rsp), $t0 3419 mov $S+8*1(%rsp), $t1 3420 mov $S+8*2(%rsp), $t2 3421 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order 3422 lea $S(%rsp), $r_ptr 3423 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); 3424 3425 mov $M(%rsp), $src0 3426 lea $M(%rsp), $b_ptr 3427 mov $acc4, $acc6 # harmonize sub output and mul input 3428 xor %ecx, %ecx 3429 mov $acc4, $S+8*0(%rsp) # have to save:-( 3430 mov $acc5, $acc2 3431 mov $acc5, $S+8*1(%rsp) 3432 cmovz $acc0, $acc3 3433 mov $acc0, $S+8*2(%rsp) 3434 lea $S-$bias(%rsp), $a_ptr 3435 cmovz $acc1, $acc4 3436 mov $acc1, $S+8*3(%rsp) 3437 mov $acc6, $acc1 3438 lea $S(%rsp), $r_ptr 3439 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); 3440 3441 movq %xmm1, $b_ptr 3442 movq %xmm1, $r_ptr 3443 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); 3444 3445 lea 32*5+56(%rsp), %rsi 3446.cfi_def_cfa %rsi,8 3447 mov -48(%rsi),%r15 3448.cfi_restore %r15 3449 mov -40(%rsi),%r14 3450.cfi_restore %r14 3451 mov -32(%rsi),%r13 3452.cfi_restore %r13 3453 mov -24(%rsi),%r12 3454.cfi_restore %r12 3455 mov -16(%rsi),%rbx 3456.cfi_restore %rbx 3457 mov -8(%rsi),%rbp 3458.cfi_restore %rbp 3459 lea (%rsi),%rsp 3460.cfi_def_cfa_register %rsp 3461.Lpoint_double${x}_epilogue: 3462 ret 3463.cfi_endproc 3464.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx 3465___ 3466} 3467&gen_double("q"); 3468 3469sub gen_add () { 3470 my $x = shift; 3471 my ($src0,$sfx,$bias); 3472 my ($H,$Hsqr,$R,$Rsqr,$Hcub, 3473 $U1,$U2,$S1,$S2, 3474 $res_x,$res_y,$res_z, 3475 $in1_x,$in1_y,$in1_z, 3476 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); 3477 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 3478 3479 if ($x ne "x") { 3480 $src0 = "%rax"; 3481 $sfx = ""; 3482 $bias = 0; 3483 3484$code.=<<___; 3485.globl ecp_nistz256_point_add 3486.type ecp_nistz256_point_add,\@function,3 3487.align 32 3488ecp_nistz256_point_add: 3489.cfi_startproc 3490___ 3491$code.=<<___ if ($addx); 3492 mov \$0x80100, %ecx 3493 and OPENSSL_ia32cap_P+8(%rip), %ecx 3494 cmp \$0x80100, %ecx 3495 je .Lpoint_addx 3496___ 3497 } else { 3498 $src0 = "%rdx"; 3499 $sfx = "x"; 3500 $bias = 128; 3501 3502$code.=<<___; 3503.type ecp_nistz256_point_addx,\@function,3 3504.align 32 3505ecp_nistz256_point_addx: 3506.cfi_startproc 3507.Lpoint_addx: 3508___ 3509 } 3510$code.=<<___; 3511 push %rbp 3512.cfi_push %rbp 3513 push %rbx 3514.cfi_push %rbx 3515 push %r12 3516.cfi_push %r12 3517 push %r13 3518.cfi_push %r13 3519 push %r14 3520.cfi_push %r14 3521 push %r15 3522.cfi_push %r15 3523 sub \$32*18+8, %rsp 3524.cfi_adjust_cfa_offset 32*18+8 3525.Lpoint_add${x}_body: 3526 3527 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3528 movdqu 0x10($a_ptr), %xmm1 3529 movdqu 0x20($a_ptr), %xmm2 3530 movdqu 0x30($a_ptr), %xmm3 3531 movdqu 0x40($a_ptr), %xmm4 3532 movdqu 0x50($a_ptr), %xmm5 3533 mov $a_ptr, $b_ptr # reassign 3534 mov $b_org, $a_ptr # reassign 3535 movdqa %xmm0, $in1_x(%rsp) 3536 movdqa %xmm1, $in1_x+0x10(%rsp) 3537 movdqa %xmm2, $in1_y(%rsp) 3538 movdqa %xmm3, $in1_y+0x10(%rsp) 3539 movdqa %xmm4, $in1_z(%rsp) 3540 movdqa %xmm5, $in1_z+0x10(%rsp) 3541 por %xmm4, %xmm5 3542 3543 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr 3544 pshufd \$0xb1, %xmm5, %xmm3 3545 movdqu 0x10($a_ptr), %xmm1 3546 movdqu 0x20($a_ptr), %xmm2 3547 por %xmm3, %xmm5 3548 movdqu 0x30($a_ptr), %xmm3 3549 mov 0x40+8*0($a_ptr), $src0 # load original in2_z 3550 mov 0x40+8*1($a_ptr), $acc6 3551 mov 0x40+8*2($a_ptr), $acc7 3552 mov 0x40+8*3($a_ptr), $acc0 3553 movdqa %xmm0, $in2_x(%rsp) 3554 pshufd \$0x1e, %xmm5, %xmm4 3555 movdqa %xmm1, $in2_x+0x10(%rsp) 3556 movdqu 0x40($a_ptr),%xmm0 # in2_z again 3557 movdqu 0x50($a_ptr),%xmm1 3558 movdqa %xmm2, $in2_y(%rsp) 3559 movdqa %xmm3, $in2_y+0x10(%rsp) 3560 por %xmm4, %xmm5 3561 pxor %xmm4, %xmm4 3562 por %xmm0, %xmm1 3563 movq $r_ptr, %xmm0 # save $r_ptr 3564 3565 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3566 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy 3567 mov $acc6, $in2_z+8*1(%rsp) 3568 mov $acc7, $in2_z+8*2(%rsp) 3569 mov $acc0, $in2_z+8*3(%rsp) 3570 lea $Z2sqr(%rsp), $r_ptr # Z2^2 3571 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); 3572 3573 pcmpeqd %xmm4, %xmm5 3574 pshufd \$0xb1, %xmm1, %xmm4 3575 por %xmm1, %xmm4 3576 pshufd \$0, %xmm5, %xmm5 # in1infty 3577 pshufd \$0x1e, %xmm4, %xmm3 3578 por %xmm3, %xmm4 3579 pxor %xmm3, %xmm3 3580 pcmpeqd %xmm3, %xmm4 3581 pshufd \$0, %xmm4, %xmm4 # in2infty 3582 mov 0x40+8*0($b_ptr), $src0 # load original in1_z 3583 mov 0x40+8*1($b_ptr), $acc6 3584 mov 0x40+8*2($b_ptr), $acc7 3585 mov 0x40+8*3($b_ptr), $acc0 3586 movq $b_ptr, %xmm1 3587 3588 lea 0x40-$bias($b_ptr), $a_ptr 3589 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3590 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3591 3592 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` 3593 lea $S1(%rsp), $r_ptr # S1 = Z2^3 3594 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); 3595 3596 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3597 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3598 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3599 3600 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` 3601 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 3602 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); 3603 3604 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3605 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3606 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3607 3608 lea $S1(%rsp), $b_ptr 3609 lea $R(%rsp), $r_ptr # R = S2 - S1 3610 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); 3611 3612 or $acc5, $acc4 # see if result is zero 3613 movdqa %xmm4, %xmm2 3614 or $acc0, $acc4 3615 or $acc1, $acc4 3616 por %xmm5, %xmm2 # in1infty || in2infty 3617 movq $acc4, %xmm3 3618 3619 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` 3620 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 3621 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); 3622 3623 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` 3624 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3625 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); 3626 3627 lea $U1(%rsp), $b_ptr 3628 lea $H(%rsp), $r_ptr # H = U2 - U1 3629 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); 3630 3631 or $acc5, $acc4 # see if result is zero 3632 or $acc0, $acc4 3633 or $acc1, $acc4 # !is_equal(U1, U2) 3634 3635 movq %xmm2, $acc0 # in1infty | in2infty 3636 movq %xmm3, $acc1 # !is_equal(S1, S2) 3637 3638 or $acc0, $acc4 3639 or $acc1, $acc4 3640 3641 # if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2)) 3642 .byte 0x3e # predict taken 3643 jnz .Ladd_proceed$x 3644 3645.Ladd_double$x: 3646 movq %xmm1, $a_ptr # restore $a_ptr 3647 movq %xmm0, $r_ptr # restore $r_ptr 3648 add \$`32*(18-5)`, %rsp # difference in frame sizes 3649.cfi_adjust_cfa_offset `-32*(18-5)` 3650 jmp .Lpoint_double_shortcut$x 3651.cfi_adjust_cfa_offset `32*(18-5)` 3652 3653.align 32 3654.Ladd_proceed$x: 3655 `&load_for_sqr("$R(%rsp)", "$src0")` 3656 lea $Rsqr(%rsp), $r_ptr # R^2 3657 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3658 3659 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3660 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3661 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3662 3663 `&load_for_sqr("$H(%rsp)", "$src0")` 3664 lea $Hsqr(%rsp), $r_ptr # H^2 3665 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3666 3667 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` 3668 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3669 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); 3670 3671 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` 3672 lea $Hcub(%rsp), $r_ptr # H^3 3673 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3674 3675 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` 3676 lea $U2(%rsp), $r_ptr # U1*H^2 3677 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); 3678___ 3679{ 3680####################################################################### 3681# operate in 4-5-0-1 "name space" that matches multiplication output 3682# 3683my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 3684my ($poly1, $poly3)=($acc6,$acc7); 3685 3686$code.=<<___; 3687 #lea $U2(%rsp), $a_ptr 3688 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 3689 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 3690 3691 xor $t4, $t4 3692 add $acc0, $acc0 # a0:a3+a0:a3 3693 lea $Rsqr(%rsp), $a_ptr 3694 adc $acc1, $acc1 3695 mov $acc0, $t0 3696 adc $acc2, $acc2 3697 adc $acc3, $acc3 3698 mov $acc1, $t1 3699 adc \$0, $t4 3700 3701 sub \$-1, $acc0 3702 mov $acc2, $t2 3703 sbb $poly1, $acc1 3704 sbb \$0, $acc2 3705 mov $acc3, $t3 3706 sbb $poly3, $acc3 3707 sbb \$0, $t4 3708 3709 cmovc $t0, $acc0 3710 mov 8*0($a_ptr), $t0 3711 cmovc $t1, $acc1 3712 mov 8*1($a_ptr), $t1 3713 cmovc $t2, $acc2 3714 mov 8*2($a_ptr), $t2 3715 cmovc $t3, $acc3 3716 mov 8*3($a_ptr), $t3 3717 3718 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 3719 3720 lea $Hcub(%rsp), $b_ptr 3721 lea $res_x(%rsp), $r_ptr 3722 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 3723 3724 mov $U2+8*0(%rsp), $t0 3725 mov $U2+8*1(%rsp), $t1 3726 mov $U2+8*2(%rsp), $t2 3727 mov $U2+8*3(%rsp), $t3 3728 lea $res_y(%rsp), $r_ptr 3729 3730 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); 3731 3732 mov $acc0, 8*0($r_ptr) # save the result, as 3733 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 3734 mov $acc2, 8*2($r_ptr) 3735 mov $acc3, 8*3($r_ptr) 3736___ 3737} 3738$code.=<<___; 3739 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` 3740 lea $S2(%rsp), $r_ptr 3741 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); 3742 3743 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` 3744 lea $res_y(%rsp), $r_ptr 3745 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); 3746 3747 lea $S2(%rsp), $b_ptr 3748 lea $res_y(%rsp), $r_ptr 3749 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); 3750 3751 movq %xmm0, $r_ptr # restore $r_ptr 3752 3753 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); 3754 movdqa %xmm5, %xmm1 3755 pandn $res_z(%rsp), %xmm0 3756 movdqa %xmm5, %xmm2 3757 pandn $res_z+0x10(%rsp), %xmm1 3758 movdqa %xmm5, %xmm3 3759 pand $in2_z(%rsp), %xmm2 3760 pand $in2_z+0x10(%rsp), %xmm3 3761 por %xmm0, %xmm2 3762 por %xmm1, %xmm3 3763 3764 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 3765 movdqa %xmm4, %xmm1 3766 pandn %xmm2, %xmm0 3767 movdqa %xmm4, %xmm2 3768 pandn %xmm3, %xmm1 3769 movdqa %xmm4, %xmm3 3770 pand $in1_z(%rsp), %xmm2 3771 pand $in1_z+0x10(%rsp), %xmm3 3772 por %xmm0, %xmm2 3773 por %xmm1, %xmm3 3774 movdqu %xmm2, 0x40($r_ptr) 3775 movdqu %xmm3, 0x50($r_ptr) 3776 3777 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 3778 movdqa %xmm5, %xmm1 3779 pandn $res_x(%rsp), %xmm0 3780 movdqa %xmm5, %xmm2 3781 pandn $res_x+0x10(%rsp), %xmm1 3782 movdqa %xmm5, %xmm3 3783 pand $in2_x(%rsp), %xmm2 3784 pand $in2_x+0x10(%rsp), %xmm3 3785 por %xmm0, %xmm2 3786 por %xmm1, %xmm3 3787 3788 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 3789 movdqa %xmm4, %xmm1 3790 pandn %xmm2, %xmm0 3791 movdqa %xmm4, %xmm2 3792 pandn %xmm3, %xmm1 3793 movdqa %xmm4, %xmm3 3794 pand $in1_x(%rsp), %xmm2 3795 pand $in1_x+0x10(%rsp), %xmm3 3796 por %xmm0, %xmm2 3797 por %xmm1, %xmm3 3798 movdqu %xmm2, 0x00($r_ptr) 3799 movdqu %xmm3, 0x10($r_ptr) 3800 3801 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 3802 movdqa %xmm5, %xmm1 3803 pandn $res_y(%rsp), %xmm0 3804 movdqa %xmm5, %xmm2 3805 pandn $res_y+0x10(%rsp), %xmm1 3806 movdqa %xmm5, %xmm3 3807 pand $in2_y(%rsp), %xmm2 3808 pand $in2_y+0x10(%rsp), %xmm3 3809 por %xmm0, %xmm2 3810 por %xmm1, %xmm3 3811 3812 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 3813 movdqa %xmm4, %xmm1 3814 pandn %xmm2, %xmm0 3815 movdqa %xmm4, %xmm2 3816 pandn %xmm3, %xmm1 3817 movdqa %xmm4, %xmm3 3818 pand $in1_y(%rsp), %xmm2 3819 pand $in1_y+0x10(%rsp), %xmm3 3820 por %xmm0, %xmm2 3821 por %xmm1, %xmm3 3822 movdqu %xmm2, 0x20($r_ptr) 3823 movdqu %xmm3, 0x30($r_ptr) 3824 3825.Ladd_done$x: 3826 lea 32*18+56(%rsp), %rsi 3827.cfi_def_cfa %rsi,8 3828 mov -48(%rsi),%r15 3829.cfi_restore %r15 3830 mov -40(%rsi),%r14 3831.cfi_restore %r14 3832 mov -32(%rsi),%r13 3833.cfi_restore %r13 3834 mov -24(%rsi),%r12 3835.cfi_restore %r12 3836 mov -16(%rsi),%rbx 3837.cfi_restore %rbx 3838 mov -8(%rsi),%rbp 3839.cfi_restore %rbp 3840 lea (%rsi),%rsp 3841.cfi_def_cfa_register %rsp 3842.Lpoint_add${x}_epilogue: 3843 ret 3844.cfi_endproc 3845.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx 3846___ 3847} 3848&gen_add("q"); 3849 3850sub gen_add_affine () { 3851 my $x = shift; 3852 my ($src0,$sfx,$bias); 3853 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, 3854 $res_x,$res_y,$res_z, 3855 $in1_x,$in1_y,$in1_z, 3856 $in2_x,$in2_y)=map(32*$_,(0..14)); 3857 my $Z1sqr = $S2; 3858 3859 if ($x ne "x") { 3860 $src0 = "%rax"; 3861 $sfx = ""; 3862 $bias = 0; 3863 3864$code.=<<___; 3865.globl ecp_nistz256_point_add_affine 3866.type ecp_nistz256_point_add_affine,\@function,3 3867.align 32 3868ecp_nistz256_point_add_affine: 3869.cfi_startproc 3870___ 3871$code.=<<___ if ($addx); 3872 mov \$0x80100, %ecx 3873 and OPENSSL_ia32cap_P+8(%rip), %ecx 3874 cmp \$0x80100, %ecx 3875 je .Lpoint_add_affinex 3876___ 3877 } else { 3878 $src0 = "%rdx"; 3879 $sfx = "x"; 3880 $bias = 128; 3881 3882$code.=<<___; 3883.type ecp_nistz256_point_add_affinex,\@function,3 3884.align 32 3885ecp_nistz256_point_add_affinex: 3886.cfi_startproc 3887.Lpoint_add_affinex: 3888___ 3889 } 3890$code.=<<___; 3891 push %rbp 3892.cfi_push %rbp 3893 push %rbx 3894.cfi_push %rbx 3895 push %r12 3896.cfi_push %r12 3897 push %r13 3898.cfi_push %r13 3899 push %r14 3900.cfi_push %r14 3901 push %r15 3902.cfi_push %r15 3903 sub \$32*15+8, %rsp 3904.cfi_adjust_cfa_offset 32*15+8 3905.Ladd_affine${x}_body: 3906 3907 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr 3908 mov $b_org, $b_ptr # reassign 3909 movdqu 0x10($a_ptr), %xmm1 3910 movdqu 0x20($a_ptr), %xmm2 3911 movdqu 0x30($a_ptr), %xmm3 3912 movdqu 0x40($a_ptr), %xmm4 3913 movdqu 0x50($a_ptr), %xmm5 3914 mov 0x40+8*0($a_ptr), $src0 # load original in1_z 3915 mov 0x40+8*1($a_ptr), $acc6 3916 mov 0x40+8*2($a_ptr), $acc7 3917 mov 0x40+8*3($a_ptr), $acc0 3918 movdqa %xmm0, $in1_x(%rsp) 3919 movdqa %xmm1, $in1_x+0x10(%rsp) 3920 movdqa %xmm2, $in1_y(%rsp) 3921 movdqa %xmm3, $in1_y+0x10(%rsp) 3922 movdqa %xmm4, $in1_z(%rsp) 3923 movdqa %xmm5, $in1_z+0x10(%rsp) 3924 por %xmm4, %xmm5 3925 3926 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr 3927 pshufd \$0xb1, %xmm5, %xmm3 3928 movdqu 0x10($b_ptr), %xmm1 3929 movdqu 0x20($b_ptr), %xmm2 3930 por %xmm3, %xmm5 3931 movdqu 0x30($b_ptr), %xmm3 3932 movdqa %xmm0, $in2_x(%rsp) 3933 pshufd \$0x1e, %xmm5, %xmm4 3934 movdqa %xmm1, $in2_x+0x10(%rsp) 3935 por %xmm0, %xmm1 3936 movq $r_ptr, %xmm0 # save $r_ptr 3937 movdqa %xmm2, $in2_y(%rsp) 3938 movdqa %xmm3, $in2_y+0x10(%rsp) 3939 por %xmm2, %xmm3 3940 por %xmm4, %xmm5 3941 pxor %xmm4, %xmm4 3942 por %xmm1, %xmm3 3943 3944 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid 3945 lea $Z1sqr(%rsp), $r_ptr # Z1^2 3946 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); 3947 3948 pcmpeqd %xmm4, %xmm5 3949 pshufd \$0xb1, %xmm3, %xmm4 3950 mov 0x00($b_ptr), $src0 # $b_ptr is still valid 3951 #lea 0x00($b_ptr), $b_ptr 3952 mov $acc4, $acc1 # harmonize sqr output and mul input 3953 por %xmm3, %xmm4 3954 pshufd \$0, %xmm5, %xmm5 # in1infty 3955 pshufd \$0x1e, %xmm4, %xmm3 3956 mov $acc5, $acc2 3957 por %xmm3, %xmm4 3958 pxor %xmm3, %xmm3 3959 mov $acc6, $acc3 3960 pcmpeqd %xmm3, %xmm4 3961 pshufd \$0, %xmm4, %xmm4 # in2infty 3962 3963 lea $Z1sqr-$bias(%rsp), $a_ptr 3964 mov $acc7, $acc4 3965 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 3966 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); 3967 3968 lea $in1_x(%rsp), $b_ptr 3969 lea $H(%rsp), $r_ptr # H = U2 - U1 3970 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); 3971 3972 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` 3973 lea $S2(%rsp), $r_ptr # S2 = Z1^3 3974 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); 3975 3976 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` 3977 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 3978 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); 3979 3980 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` 3981 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 3982 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); 3983 3984 lea $in1_y(%rsp), $b_ptr 3985 lea $R(%rsp), $r_ptr # R = S2 - S1 3986 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); 3987 3988 `&load_for_sqr("$H(%rsp)", "$src0")` 3989 lea $Hsqr(%rsp), $r_ptr # H^2 3990 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); 3991 3992 `&load_for_sqr("$R(%rsp)", "$src0")` 3993 lea $Rsqr(%rsp), $r_ptr # R^2 3994 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); 3995 3996 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` 3997 lea $Hcub(%rsp), $r_ptr # H^3 3998 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 3999 4000 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` 4001 lea $U2(%rsp), $r_ptr # U1*H^2 4002 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); 4003___ 4004{ 4005####################################################################### 4006# operate in 4-5-0-1 "name space" that matches multiplication output 4007# 4008my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 4009my ($poly1, $poly3)=($acc6,$acc7); 4010 4011$code.=<<___; 4012 #lea $U2(%rsp), $a_ptr 4013 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 4014 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); 4015 4016 xor $t4, $t4 4017 add $acc0, $acc0 # a0:a3+a0:a3 4018 lea $Rsqr(%rsp), $a_ptr 4019 adc $acc1, $acc1 4020 mov $acc0, $t0 4021 adc $acc2, $acc2 4022 adc $acc3, $acc3 4023 mov $acc1, $t1 4024 adc \$0, $t4 4025 4026 sub \$-1, $acc0 4027 mov $acc2, $t2 4028 sbb $poly1, $acc1 4029 sbb \$0, $acc2 4030 mov $acc3, $t3 4031 sbb $poly3, $acc3 4032 sbb \$0, $t4 4033 4034 cmovc $t0, $acc0 4035 mov 8*0($a_ptr), $t0 4036 cmovc $t1, $acc1 4037 mov 8*1($a_ptr), $t1 4038 cmovc $t2, $acc2 4039 mov 8*2($a_ptr), $t2 4040 cmovc $t3, $acc3 4041 mov 8*3($a_ptr), $t3 4042 4043 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); 4044 4045 lea $Hcub(%rsp), $b_ptr 4046 lea $res_x(%rsp), $r_ptr 4047 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); 4048 4049 mov $U2+8*0(%rsp), $t0 4050 mov $U2+8*1(%rsp), $t1 4051 mov $U2+8*2(%rsp), $t2 4052 mov $U2+8*3(%rsp), $t3 4053 lea $H(%rsp), $r_ptr 4054 4055 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); 4056 4057 mov $acc0, 8*0($r_ptr) # save the result, as 4058 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't 4059 mov $acc2, 8*2($r_ptr) 4060 mov $acc3, 8*3($r_ptr) 4061___ 4062} 4063$code.=<<___; 4064 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` 4065 lea $S2(%rsp), $r_ptr 4066 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); 4067 4068 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` 4069 lea $H(%rsp), $r_ptr 4070 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); 4071 4072 lea $S2(%rsp), $b_ptr 4073 lea $res_y(%rsp), $r_ptr 4074 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); 4075 4076 movq %xmm0, $r_ptr # restore $r_ptr 4077 4078 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); 4079 movdqa %xmm5, %xmm1 4080 pandn $res_z(%rsp), %xmm0 4081 movdqa %xmm5, %xmm2 4082 pandn $res_z+0x10(%rsp), %xmm1 4083 movdqa %xmm5, %xmm3 4084 pand .LONE_mont(%rip), %xmm2 4085 pand .LONE_mont+0x10(%rip), %xmm3 4086 por %xmm0, %xmm2 4087 por %xmm1, %xmm3 4088 4089 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); 4090 movdqa %xmm4, %xmm1 4091 pandn %xmm2, %xmm0 4092 movdqa %xmm4, %xmm2 4093 pandn %xmm3, %xmm1 4094 movdqa %xmm4, %xmm3 4095 pand $in1_z(%rsp), %xmm2 4096 pand $in1_z+0x10(%rsp), %xmm3 4097 por %xmm0, %xmm2 4098 por %xmm1, %xmm3 4099 movdqu %xmm2, 0x40($r_ptr) 4100 movdqu %xmm3, 0x50($r_ptr) 4101 4102 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); 4103 movdqa %xmm5, %xmm1 4104 pandn $res_x(%rsp), %xmm0 4105 movdqa %xmm5, %xmm2 4106 pandn $res_x+0x10(%rsp), %xmm1 4107 movdqa %xmm5, %xmm3 4108 pand $in2_x(%rsp), %xmm2 4109 pand $in2_x+0x10(%rsp), %xmm3 4110 por %xmm0, %xmm2 4111 por %xmm1, %xmm3 4112 4113 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); 4114 movdqa %xmm4, %xmm1 4115 pandn %xmm2, %xmm0 4116 movdqa %xmm4, %xmm2 4117 pandn %xmm3, %xmm1 4118 movdqa %xmm4, %xmm3 4119 pand $in1_x(%rsp), %xmm2 4120 pand $in1_x+0x10(%rsp), %xmm3 4121 por %xmm0, %xmm2 4122 por %xmm1, %xmm3 4123 movdqu %xmm2, 0x00($r_ptr) 4124 movdqu %xmm3, 0x10($r_ptr) 4125 4126 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); 4127 movdqa %xmm5, %xmm1 4128 pandn $res_y(%rsp), %xmm0 4129 movdqa %xmm5, %xmm2 4130 pandn $res_y+0x10(%rsp), %xmm1 4131 movdqa %xmm5, %xmm3 4132 pand $in2_y(%rsp), %xmm2 4133 pand $in2_y+0x10(%rsp), %xmm3 4134 por %xmm0, %xmm2 4135 por %xmm1, %xmm3 4136 4137 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); 4138 movdqa %xmm4, %xmm1 4139 pandn %xmm2, %xmm0 4140 movdqa %xmm4, %xmm2 4141 pandn %xmm3, %xmm1 4142 movdqa %xmm4, %xmm3 4143 pand $in1_y(%rsp), %xmm2 4144 pand $in1_y+0x10(%rsp), %xmm3 4145 por %xmm0, %xmm2 4146 por %xmm1, %xmm3 4147 movdqu %xmm2, 0x20($r_ptr) 4148 movdqu %xmm3, 0x30($r_ptr) 4149 4150 lea 32*15+56(%rsp), %rsi 4151.cfi_def_cfa %rsi,8 4152 mov -48(%rsi),%r15 4153.cfi_restore %r15 4154 mov -40(%rsi),%r14 4155.cfi_restore %r14 4156 mov -32(%rsi),%r13 4157.cfi_restore %r13 4158 mov -24(%rsi),%r12 4159.cfi_restore %r12 4160 mov -16(%rsi),%rbx 4161.cfi_restore %rbx 4162 mov -8(%rsi),%rbp 4163.cfi_restore %rbp 4164 lea (%rsi),%rsp 4165.cfi_def_cfa_register %rsp 4166.Ladd_affine${x}_epilogue: 4167 ret 4168.cfi_endproc 4169.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx 4170___ 4171} 4172&gen_add_affine("q"); 4173 4174######################################################################## 4175# AD*X magic 4176# 4177if ($addx) { { 4178######################################################################## 4179# operate in 4-5-0-1 "name space" that matches multiplication output 4180# 4181my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); 4182 4183$code.=<<___; 4184.type __ecp_nistz256_add_tox,\@abi-omnipotent 4185.align 32 4186__ecp_nistz256_add_tox: 4187.cfi_startproc 4188 xor $t4, $t4 4189 adc 8*0($b_ptr), $a0 4190 adc 8*1($b_ptr), $a1 4191 mov $a0, $t0 4192 adc 8*2($b_ptr), $a2 4193 adc 8*3($b_ptr), $a3 4194 mov $a1, $t1 4195 adc \$0, $t4 4196 4197 xor $t3, $t3 4198 sbb \$-1, $a0 4199 mov $a2, $t2 4200 sbb $poly1, $a1 4201 sbb \$0, $a2 4202 mov $a3, $t3 4203 sbb $poly3, $a3 4204 sbb \$0, $t4 4205 4206 cmovc $t0, $a0 4207 cmovc $t1, $a1 4208 mov $a0, 8*0($r_ptr) 4209 cmovc $t2, $a2 4210 mov $a1, 8*1($r_ptr) 4211 cmovc $t3, $a3 4212 mov $a2, 8*2($r_ptr) 4213 mov $a3, 8*3($r_ptr) 4214 4215 ret 4216.cfi_endproc 4217.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox 4218 4219.type __ecp_nistz256_sub_fromx,\@abi-omnipotent 4220.align 32 4221__ecp_nistz256_sub_fromx: 4222.cfi_startproc 4223 xor $t4, $t4 4224 sbb 8*0($b_ptr), $a0 4225 sbb 8*1($b_ptr), $a1 4226 mov $a0, $t0 4227 sbb 8*2($b_ptr), $a2 4228 sbb 8*3($b_ptr), $a3 4229 mov $a1, $t1 4230 sbb \$0, $t4 4231 4232 xor $t3, $t3 4233 adc \$-1, $a0 4234 mov $a2, $t2 4235 adc $poly1, $a1 4236 adc \$0, $a2 4237 mov $a3, $t3 4238 adc $poly3, $a3 4239 4240 bt \$0, $t4 4241 cmovnc $t0, $a0 4242 cmovnc $t1, $a1 4243 mov $a0, 8*0($r_ptr) 4244 cmovnc $t2, $a2 4245 mov $a1, 8*1($r_ptr) 4246 cmovnc $t3, $a3 4247 mov $a2, 8*2($r_ptr) 4248 mov $a3, 8*3($r_ptr) 4249 4250 ret 4251.cfi_endproc 4252.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx 4253 4254.type __ecp_nistz256_subx,\@abi-omnipotent 4255.align 32 4256__ecp_nistz256_subx: 4257.cfi_startproc 4258 xor $t4, $t4 4259 sbb $a0, $t0 4260 sbb $a1, $t1 4261 mov $t0, $a0 4262 sbb $a2, $t2 4263 sbb $a3, $t3 4264 mov $t1, $a1 4265 sbb \$0, $t4 4266 4267 xor $a3 ,$a3 4268 adc \$-1, $t0 4269 mov $t2, $a2 4270 adc $poly1, $t1 4271 adc \$0, $t2 4272 mov $t3, $a3 4273 adc $poly3, $t3 4274 4275 bt \$0, $t4 4276 cmovc $t0, $a0 4277 cmovc $t1, $a1 4278 cmovc $t2, $a2 4279 cmovc $t3, $a3 4280 4281 ret 4282.cfi_endproc 4283.size __ecp_nistz256_subx,.-__ecp_nistz256_subx 4284 4285.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent 4286.align 32 4287__ecp_nistz256_mul_by_2x: 4288.cfi_startproc 4289 xor $t4, $t4 4290 adc $a0, $a0 # a0:a3+a0:a3 4291 adc $a1, $a1 4292 mov $a0, $t0 4293 adc $a2, $a2 4294 adc $a3, $a3 4295 mov $a1, $t1 4296 adc \$0, $t4 4297 4298 xor $t3, $t3 4299 sbb \$-1, $a0 4300 mov $a2, $t2 4301 sbb $poly1, $a1 4302 sbb \$0, $a2 4303 mov $a3, $t3 4304 sbb $poly3, $a3 4305 sbb \$0, $t4 4306 4307 cmovc $t0, $a0 4308 cmovc $t1, $a1 4309 mov $a0, 8*0($r_ptr) 4310 cmovc $t2, $a2 4311 mov $a1, 8*1($r_ptr) 4312 cmovc $t3, $a3 4313 mov $a2, 8*2($r_ptr) 4314 mov $a3, 8*3($r_ptr) 4315 4316 ret 4317.cfi_endproc 4318.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x 4319___ 4320 } 4321&gen_double("x"); 4322&gen_add("x"); 4323&gen_add_affine("x"); 4324} 4325}}} 4326 4327# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4328# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4329if ($win64) { 4330$rec="%rcx"; 4331$frame="%rdx"; 4332$context="%r8"; 4333$disp="%r9"; 4334 4335$code.=<<___; 4336.extern __imp_RtlVirtualUnwind 4337 4338.type short_handler,\@abi-omnipotent 4339.align 16 4340short_handler: 4341 push %rsi 4342 push %rdi 4343 push %rbx 4344 push %rbp 4345 push %r12 4346 push %r13 4347 push %r14 4348 push %r15 4349 pushfq 4350 sub \$64,%rsp 4351 4352 mov 120($context),%rax # pull context->Rax 4353 mov 248($context),%rbx # pull context->Rip 4354 4355 mov 8($disp),%rsi # disp->ImageBase 4356 mov 56($disp),%r11 # disp->HandlerData 4357 4358 mov 0(%r11),%r10d # HandlerData[0] 4359 lea (%rsi,%r10),%r10 # end of prologue label 4360 cmp %r10,%rbx # context->Rip<end of prologue label 4361 jb .Lcommon_seh_tail 4362 4363 mov 152($context),%rax # pull context->Rsp 4364 4365 mov 4(%r11),%r10d # HandlerData[1] 4366 lea (%rsi,%r10),%r10 # epilogue label 4367 cmp %r10,%rbx # context->Rip>=epilogue label 4368 jae .Lcommon_seh_tail 4369 4370 lea 16(%rax),%rax 4371 4372 mov -8(%rax),%r12 4373 mov -16(%rax),%r13 4374 mov %r12,216($context) # restore context->R12 4375 mov %r13,224($context) # restore context->R13 4376 4377 jmp .Lcommon_seh_tail 4378.size short_handler,.-short_handler 4379 4380.type full_handler,\@abi-omnipotent 4381.align 16 4382full_handler: 4383 push %rsi 4384 push %rdi 4385 push %rbx 4386 push %rbp 4387 push %r12 4388 push %r13 4389 push %r14 4390 push %r15 4391 pushfq 4392 sub \$64,%rsp 4393 4394 mov 120($context),%rax # pull context->Rax 4395 mov 248($context),%rbx # pull context->Rip 4396 4397 mov 8($disp),%rsi # disp->ImageBase 4398 mov 56($disp),%r11 # disp->HandlerData 4399 4400 mov 0(%r11),%r10d # HandlerData[0] 4401 lea (%rsi,%r10),%r10 # end of prologue label 4402 cmp %r10,%rbx # context->Rip<end of prologue label 4403 jb .Lcommon_seh_tail 4404 4405 mov 152($context),%rax # pull context->Rsp 4406 4407 mov 4(%r11),%r10d # HandlerData[1] 4408 lea (%rsi,%r10),%r10 # epilogue label 4409 cmp %r10,%rbx # context->Rip>=epilogue label 4410 jae .Lcommon_seh_tail 4411 4412 mov 8(%r11),%r10d # HandlerData[2] 4413 lea (%rax,%r10),%rax 4414 4415 mov -8(%rax),%rbp 4416 mov -16(%rax),%rbx 4417 mov -24(%rax),%r12 4418 mov -32(%rax),%r13 4419 mov -40(%rax),%r14 4420 mov -48(%rax),%r15 4421 mov %rbx,144($context) # restore context->Rbx 4422 mov %rbp,160($context) # restore context->Rbp 4423 mov %r12,216($context) # restore context->R12 4424 mov %r13,224($context) # restore context->R13 4425 mov %r14,232($context) # restore context->R14 4426 mov %r15,240($context) # restore context->R15 4427 4428.Lcommon_seh_tail: 4429 mov 8(%rax),%rdi 4430 mov 16(%rax),%rsi 4431 mov %rax,152($context) # restore context->Rsp 4432 mov %rsi,168($context) # restore context->Rsi 4433 mov %rdi,176($context) # restore context->Rdi 4434 4435 mov 40($disp),%rdi # disp->ContextRecord 4436 mov $context,%rsi # context 4437 mov \$154,%ecx # sizeof(CONTEXT) 4438 .long 0xa548f3fc # cld; rep movsq 4439 4440 mov $disp,%rsi 4441 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4442 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4443 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4444 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4445 mov 40(%rsi),%r10 # disp->ContextRecord 4446 lea 56(%rsi),%r11 # &disp->HandlerData 4447 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4448 mov %r10,32(%rsp) # arg5 4449 mov %r11,40(%rsp) # arg6 4450 mov %r12,48(%rsp) # arg7 4451 mov %rcx,56(%rsp) # arg8, (NULL) 4452 call *__imp_RtlVirtualUnwind(%rip) 4453 4454 mov \$1,%eax # ExceptionContinueSearch 4455 add \$64,%rsp 4456 popfq 4457 pop %r15 4458 pop %r14 4459 pop %r13 4460 pop %r12 4461 pop %rbp 4462 pop %rbx 4463 pop %rdi 4464 pop %rsi 4465 ret 4466.size full_handler,.-full_handler 4467 4468.section .pdata 4469.align 4 4470 .rva .LSEH_begin_ecp_nistz256_mul_by_2 4471 .rva .LSEH_end_ecp_nistz256_mul_by_2 4472 .rva .LSEH_info_ecp_nistz256_mul_by_2 4473 4474 .rva .LSEH_begin_ecp_nistz256_div_by_2 4475 .rva .LSEH_end_ecp_nistz256_div_by_2 4476 .rva .LSEH_info_ecp_nistz256_div_by_2 4477 4478 .rva .LSEH_begin_ecp_nistz256_mul_by_3 4479 .rva .LSEH_end_ecp_nistz256_mul_by_3 4480 .rva .LSEH_info_ecp_nistz256_mul_by_3 4481 4482 .rva .LSEH_begin_ecp_nistz256_add 4483 .rva .LSEH_end_ecp_nistz256_add 4484 .rva .LSEH_info_ecp_nistz256_add 4485 4486 .rva .LSEH_begin_ecp_nistz256_sub 4487 .rva .LSEH_end_ecp_nistz256_sub 4488 .rva .LSEH_info_ecp_nistz256_sub 4489 4490 .rva .LSEH_begin_ecp_nistz256_neg 4491 .rva .LSEH_end_ecp_nistz256_neg 4492 .rva .LSEH_info_ecp_nistz256_neg 4493 4494 .rva .LSEH_begin_ecp_nistz256_ord_mul_mont 4495 .rva .LSEH_end_ecp_nistz256_ord_mul_mont 4496 .rva .LSEH_info_ecp_nistz256_ord_mul_mont 4497 4498 .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont 4499 .rva .LSEH_end_ecp_nistz256_ord_sqr_mont 4500 .rva .LSEH_info_ecp_nistz256_ord_sqr_mont 4501___ 4502$code.=<<___ if ($addx); 4503 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx 4504 .rva .LSEH_end_ecp_nistz256_ord_mul_montx 4505 .rva .LSEH_info_ecp_nistz256_ord_mul_montx 4506 4507 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx 4508 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx 4509 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx 4510___ 4511$code.=<<___; 4512 .rva .LSEH_begin_ecp_nistz256_to_mont 4513 .rva .LSEH_end_ecp_nistz256_to_mont 4514 .rva .LSEH_info_ecp_nistz256_to_mont 4515 4516 .rva .LSEH_begin_ecp_nistz256_mul_mont 4517 .rva .LSEH_end_ecp_nistz256_mul_mont 4518 .rva .LSEH_info_ecp_nistz256_mul_mont 4519 4520 .rva .LSEH_begin_ecp_nistz256_sqr_mont 4521 .rva .LSEH_end_ecp_nistz256_sqr_mont 4522 .rva .LSEH_info_ecp_nistz256_sqr_mont 4523 4524 .rva .LSEH_begin_ecp_nistz256_from_mont 4525 .rva .LSEH_end_ecp_nistz256_from_mont 4526 .rva .LSEH_info_ecp_nistz256_from_mont 4527 4528 .rva .LSEH_begin_ecp_nistz256_gather_w5 4529 .rva .LSEH_end_ecp_nistz256_gather_w5 4530 .rva .LSEH_info_ecp_nistz256_gather_wX 4531 4532 .rva .LSEH_begin_ecp_nistz256_gather_w7 4533 .rva .LSEH_end_ecp_nistz256_gather_w7 4534 .rva .LSEH_info_ecp_nistz256_gather_wX 4535___ 4536$code.=<<___ if ($avx>1); 4537 .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5 4538 .rva .LSEH_end_ecp_nistz256_avx2_gather_w5 4539 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX 4540 4541 .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7 4542 .rva .LSEH_end_ecp_nistz256_avx2_gather_w7 4543 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX 4544___ 4545$code.=<<___; 4546 .rva .LSEH_begin_ecp_nistz256_point_double 4547 .rva .LSEH_end_ecp_nistz256_point_double 4548 .rva .LSEH_info_ecp_nistz256_point_double 4549 4550 .rva .LSEH_begin_ecp_nistz256_point_add 4551 .rva .LSEH_end_ecp_nistz256_point_add 4552 .rva .LSEH_info_ecp_nistz256_point_add 4553 4554 .rva .LSEH_begin_ecp_nistz256_point_add_affine 4555 .rva .LSEH_end_ecp_nistz256_point_add_affine 4556 .rva .LSEH_info_ecp_nistz256_point_add_affine 4557___ 4558$code.=<<___ if ($addx); 4559 .rva .LSEH_begin_ecp_nistz256_point_doublex 4560 .rva .LSEH_end_ecp_nistz256_point_doublex 4561 .rva .LSEH_info_ecp_nistz256_point_doublex 4562 4563 .rva .LSEH_begin_ecp_nistz256_point_addx 4564 .rva .LSEH_end_ecp_nistz256_point_addx 4565 .rva .LSEH_info_ecp_nistz256_point_addx 4566 4567 .rva .LSEH_begin_ecp_nistz256_point_add_affinex 4568 .rva .LSEH_end_ecp_nistz256_point_add_affinex 4569 .rva .LSEH_info_ecp_nistz256_point_add_affinex 4570___ 4571$code.=<<___; 4572 4573.section .xdata 4574.align 8 4575.LSEH_info_ecp_nistz256_mul_by_2: 4576 .byte 9,0,0,0 4577 .rva short_handler 4578 .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[] 4579.LSEH_info_ecp_nistz256_div_by_2: 4580 .byte 9,0,0,0 4581 .rva short_handler 4582 .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[] 4583.LSEH_info_ecp_nistz256_mul_by_3: 4584 .byte 9,0,0,0 4585 .rva short_handler 4586 .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[] 4587.LSEH_info_ecp_nistz256_add: 4588 .byte 9,0,0,0 4589 .rva short_handler 4590 .rva .Ladd_body,.Ladd_epilogue # HandlerData[] 4591.LSEH_info_ecp_nistz256_sub: 4592 .byte 9,0,0,0 4593 .rva short_handler 4594 .rva .Lsub_body,.Lsub_epilogue # HandlerData[] 4595.LSEH_info_ecp_nistz256_neg: 4596 .byte 9,0,0,0 4597 .rva short_handler 4598 .rva .Lneg_body,.Lneg_epilogue # HandlerData[] 4599.LSEH_info_ecp_nistz256_ord_mul_mont: 4600 .byte 9,0,0,0 4601 .rva full_handler 4602 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] 4603 .long 48,0 4604.LSEH_info_ecp_nistz256_ord_sqr_mont: 4605 .byte 9,0,0,0 4606 .rva full_handler 4607 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] 4608 .long 48,0 4609___ 4610$code.=<<___ if ($addx); 4611.LSEH_info_ecp_nistz256_ord_mul_montx: 4612 .byte 9,0,0,0 4613 .rva full_handler 4614 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] 4615 .long 48,0 4616.LSEH_info_ecp_nistz256_ord_sqr_montx: 4617 .byte 9,0,0,0 4618 .rva full_handler 4619 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] 4620 .long 48,0 4621___ 4622$code.=<<___; 4623.LSEH_info_ecp_nistz256_to_mont: 4624 .byte 9,0,0,0 4625 .rva full_handler 4626 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 4627 .long 48,0 4628.LSEH_info_ecp_nistz256_mul_mont: 4629 .byte 9,0,0,0 4630 .rva full_handler 4631 .rva .Lmul_body,.Lmul_epilogue # HandlerData[] 4632 .long 48,0 4633.LSEH_info_ecp_nistz256_sqr_mont: 4634 .byte 9,0,0,0 4635 .rva full_handler 4636 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] 4637 .long 48,0 4638.LSEH_info_ecp_nistz256_from_mont: 4639 .byte 9,0,0,0 4640 .rva short_handler 4641 .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] 4642.LSEH_info_ecp_nistz256_gather_wX: 4643 .byte 0x01,0x33,0x16,0x00 4644 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 4645 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 4646 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 4647 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 4648 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 4649 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 4650 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 4651 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 4652 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 4653 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 4654 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 4655 .align 8 4656___ 4657$code.=<<___ if ($avx>1); 4658.LSEH_info_ecp_nistz256_avx2_gather_wX: 4659 .byte 0x01,0x36,0x17,0x0b 4660 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 4661 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 4662 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 4663 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 4664 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 4665 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 4666 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 4667 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 4668 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 4669 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 4670 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 4671 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 4672 .align 8 4673___ 4674$code.=<<___; 4675.LSEH_info_ecp_nistz256_point_double: 4676 .byte 9,0,0,0 4677 .rva full_handler 4678 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] 4679 .long 32*5+56,0 4680.LSEH_info_ecp_nistz256_point_add: 4681 .byte 9,0,0,0 4682 .rva full_handler 4683 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] 4684 .long 32*18+56,0 4685.LSEH_info_ecp_nistz256_point_add_affine: 4686 .byte 9,0,0,0 4687 .rva full_handler 4688 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] 4689 .long 32*15+56,0 4690___ 4691$code.=<<___ if ($addx); 4692.align 8 4693.LSEH_info_ecp_nistz256_point_doublex: 4694 .byte 9,0,0,0 4695 .rva full_handler 4696 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] 4697 .long 32*5+56,0 4698.LSEH_info_ecp_nistz256_point_addx: 4699 .byte 9,0,0,0 4700 .rva full_handler 4701 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] 4702 .long 32*18+56,0 4703.LSEH_info_ecp_nistz256_point_add_affinex: 4704 .byte 9,0,0,0 4705 .rva full_handler 4706 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] 4707 .long 32*15+56,0 4708___ 4709} 4710 4711######################################################################## 4712# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 4713# 4714open TABLE,"<ecp_nistz256_table.c" or 4715open TABLE,"<${dir}../ecp_nistz256_table.c" or 4716die "failed to open ecp_nistz256_table.c:",$!; 4717 4718use integer; 4719 4720foreach(<TABLE>) { 4721 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; 4722} 4723close TABLE; 4724 4725die "insane number of elements" if ($#arr != 64*16*37-1); 4726 4727print <<___; 4728.section .rodata align=4096 4729.globl ecp_nistz256_precomputed 4730.type ecp_nistz256_precomputed,\@object 4731.align 4096 4732ecp_nistz256_precomputed: 4733___ 4734while (@line=splice(@arr,0,16)) { 4735 print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n"; 4736} 4737print <<___; 4738.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed 4739___ 4740 4741$code =~ s/\`([^\`]*)\`/eval $1/gem; 4742print $code; 4743close STDOUT or die "error closing STDOUT: $!"; 4744