#! /usr/bin/env perl
# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice as fast. The most common rsa1024 sign is improved by a
# respectable 50%. It remains to be seen whether loop unrolling and a
# dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add a dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that
# they are "fallen through" for an input length of 8, which is
# critical for 1024-bit RSA *sign*. Average performance improvement
# in comparison to the *initial* 2005 version of this module is
# ~0%/30%/40%/45% for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks
# respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit
# RSA sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farthest one can be punished with SEGV. But page walking
	# can do good even on other OSes, because it guarantees that
	# a villain thread hits the guard page before it can do
	# damage to an innocent one...
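	#
	# Editorial note: a rough C sketch of the page walk performed
	# below, for illustration only (assumes 4096-byte pages; the
	# actual code keeps everything in registers and reads, rather
	# than writes, each page):
	#
	#	while (sp > target) { sp -= 4096; (void)*(volatile char *)sp; }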
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	mov	$num,$j			# j=num

.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8(%rsp,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	mov	\$-1,%rbx
	xor	%rax,%rbx		# not %rax
	xor	$i,$i
	mov	$num,$j			# j=num

.Lcopy:					# conditional copy
	mov	($rp,$i,8),%rcx
	mov	(%rsp,$i,8),%rdx
	and	%rbx,%rcx
	and	%rax,%rdx
	mov	$num,(%rsp,$i,8)	# zap temporary vector
	or	%rcx,%rdx
	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]

.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	pxor	%xmm0,%xmm0
	movq	@ri[0],%xmm4
	pcmpeqd	%xmm5,%xmm5
	pshufd	\$0,%xmm4,%xmm4
	mov	$num,$j
	pxor	%xmm4,%xmm5
	shr	\$2,$j			# j=num/4
	xor	%eax,%eax		# i=0

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# conditional copy
	movdqa	(%rsp,%rax),%xmm1
	movdqu	($rp,%rax),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax),%xmm3
	movdqa	%xmm0,(%rsp,%rax)
	por	%xmm2,%xmm1
	movdqu	16($rp,%rax),%xmm2
	movdqu	%xmm1,($rp,%rax)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16($rp,%rax)
	lea	32(%rax),%rax
	dec	$j
	jnz	.Lcopy4x
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi, 8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___ if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
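	# (Editorial note: i.e. the frame is nudged so that loads from
	# $aptr and stores to the temporary vector do not land on the
	# same addresses modulo 4096, where they could be mistaken for
	# dependent accesses.)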
	#
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0, 32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	.byte	0x67,0x67
	mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT or die "error closing STDOUT: $!";