#! /usr/bin/env perl
# Copyright 2013-2024 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov, "Speeding up Big-Numbers Squaring",
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While the original submission covers both 512- and 1024-bit
# exponentiation, this module is limited to the 512-bit version only
# (and as such accelerates RSA1024 sign). This is because the
# improvement for longer keys is not high enough to justify the effort;
# the highest measured was ~5% on Westmere. [This is relative to
# OpenSSL 1.0.2, upcoming at the moment of this writing!] Nor does this
# module implement a "monolithic" complete-exponentiation
# jumbo-subroutine; it adheres to a more modular mixture of C and
# assembly. And it's optimized even for processors other than the
# Intel Core family (see the table below for improvement coefficients).
#						<appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

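# The squaring code below exploits the identity
#
#	a^2 = sum_{i<j} 2*a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i)
#
# computing each cross product a[i]*a[j] (i<j) only once; iteration i
# accumulates the cross products of a[i], doubles the two limbs that
# become complete after that round, and folds in the diagonal square
# a[i]*a[i]. A hedged pseudocode sketch of the shape (illustrative
# only, carry handling elided):
#
#	for (i = 0; i < 8; i++) {
#		for (j = i + 1; j < 8; j++)
#			acc[i+j] += (uint128)a[i] * a[j];  /* cross terms */
#		acc[2*i+1 : 2*i] = 2*acc[2*i+1 : 2*i]
#				 + (uint128)a[i]*a[i] + pending_carry;
#	}
#
# The recurring "rcx <= 2 and rax <= 0xFFFF..F9" comments record why
# the deferred doubling carry (at most 2) can be added to the low limb
# of a[i]^2 (at most 0xFFFF..F9) without generating a further carry.
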
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
	movq	$mod, %xmm1		# common off-load
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx		# 0($inp)
	mov	%rax, %rbp		# 8($inp)
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	adcq	\$0, %rdx

	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
	addq	%r8, %r8
	movq	%rdx, %r15
	adcq	\$0, %rcx

	mulq	%rax
	addq	%r8, %rdx
	adcq	\$0, %rcx

	movq	%rax, (%rsp)
	movq	%rdx, 8(%rsp)

#second iteration
	movq	16($inp), %rax
	mulq	%rbp
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r15
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r8
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	movq	16($inp), %rbp
	addq	%rax, %r9
	movq	24($inp), %rax
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)

#third iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r8
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r9
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	movq	24($inp), %r10
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)

#fourth iteration
	mov	%rax, %r11		# 32($inp)
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %r12		# 40($inp)
	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %rbp		# 48($inp)
	mulq	%r10
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r13:r14 = r13:r14 << 1
	addq	%r13, %r13
	movq	%rdx, %r10
	adcq	%r14, %r14
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r13
	movq	%r12, %rax		# 40($inp)
	adcq	%rdx, %r14
	adcq	\$0, %rbx

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)

#fifth iteration
	mulq	%r11
	addq	%rax, %r8
	movq	%rbp, %rax		# 48($inp)
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mov	%rax, %r14		# 56($inp)
	mulq	%r11
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
	addq	%r15, %r15
	movq	%rdx, %r11
	adcq	%r8, %r8
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r15
	movq	%rbp, %rax		# 48($inp)
	adcq	%rdx, %r8
	adcq	\$0, %rcx

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)

#sixth iteration
	mulq	%r12
	addq	%rax, %r10
	movq	%r14, %rax		# 56($inp)
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r12
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r9
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	%rbp, %rax
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r13
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r11
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
	addq	%r13, %r13
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%r13, %rax
	adcq	%rbx, %rdx

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15
	movq	%xmm1, %rbp

	movq	%rax, 112(%rsp)
	movq	%rdx, 120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
#first iteration
	mulx	%rax, %r8, %r9
	mov	%rax, %rbx

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mulx	%rdx, %rax, $out
	mov	%rbx, %rdx		# 8($inp)
	xor	%rcx, %rcx
	adox	%r8, %r8
	adcx	$out, %r8
	adox	%rbp, %rcx
	adcx	%rbp, %rcx

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	mulx	24($inp), $out, %r8
	adox	$out, %r11
	.byte	0x66
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	mulx	%rdx, %rax, $out
	adox	%rbp, %r8
	.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov	16($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adox	%rbp, %rbx
	adcx	$out, %r10
	adcx	%rbp, %rbx

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	mulx	%rdx, %rax, $out
	adox	%rbp, %r9
	mov	24($inp), %rdx

	xor	%rcx, %rcx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	mov	%r11, 32(%rsp)
	mov	%r12, 40(%rsp)

#fourth iteration
	mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	mulx	%rdx, %rax, $out
	adox	%rbp, %r10
	mov	32($inp), %rdx

	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r14, %r14
	adcx	%rax, %r13
	adox	%rbp, %rbx
	adcx	$out, %r14
	adcx	%rbp, %rbx

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	mulx	%rdx, %rax, $out
	mov	40($inp), %rdx
	adox	%rbp, %r11

	xor	%rcx, %rcx
	adox	%r15, %r15
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r8, %r8
	adcx	%rax, %r15
	adox	%rbp, %rcx
	adcx	$out, %r8
	adcx	%rbp, %rcx

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	mulx	%rdx, %rax, $out
	adox	%rbp, %r12
	mov	48($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adcx	$out, %r10
	adox	%rbp, %rbx
	adcx	%rbp, %rbx

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	mulx	%rdx, %rax, $out
	xor	%rcx, %rcx
	mov	56($inp), %rdx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%rbp, %rbx
	adcx	%r13, %rax
	adcx	%rdx, %rbx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	movq	%rax, 112(%rsp)
	movq	%rbx, 120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
.cfi_adjust_cfa_offset	`128+24+($win64?0xb0:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
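########################################################################
# The mask computation and gather loads below implement a constant-time
# table lookup: every 128-byte table line holds one 64-bit limb of all
# 16 power-table entries, all 16 words are read unconditionally, and
# pcmpeqd-generated masks select the wanted one. A hedged C sketch of
# what one limb extraction amounts to (illustrative only):
#
#	uint64_t limb = 0;
#	for (k = 0; k < 16; k++)		/* touch every entry */
#		limb |= line[k] & (0 - (uint64_t)(k == power));
#
# so the memory access pattern is independent of $power.
#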
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
	.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
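# rsaz_512_mul_scatter4 multiplies and then scatters the eight result
# limbs into the power table with the 128-byte stride expected by the
# gather code above: limb L of table entry $pwr lands at
# $tbl + $pwr*8 + L*128. A hedged C sketch of the store pattern
# (illustrative only; 'tbl' viewed as uint64_t[], 16 words per line):
#
#	for (L = 0; L < 8; L++)
#		tbl[pwr + 16*L] = out[L];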
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp		# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADCX/ADOX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
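# __rsaz_512_reduce below performs eight word-by-word Montgomery
# reduction steps on the low half of the 16-limb product: each round
# derives m = t[0]*n0 mod 2^64, adds m*mod (which zeroes the lowest
# live limb) and shifts the 8-limb window down by one limb; the caller
# then adds the upper product half and calls __rsaz_512_subtract. A
# hedged pseudocode sketch (illustrative only; n0 = -mod^-1 mod 2^64):
#
#	for (i = 0; i < 8; i++) {
#		uint64_t m = t[i] * n0;		/* mod 2^64 */
#		t += m * mod << (64*i);		/* limb i becomes 0 */
#	}
#	/* Montgomery result is t >> 512, up to one final subtraction */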
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
.cfi_startproc
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.cfi_endproc
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
.cfi_startproc
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.cfi_endproc
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
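# __rsaz_512_subtract finishes the reduction in constant time: %rcx
# arrives as an all-ones/all-zero mask (sbbq %rcx,%rcx after the final
# addition), and the routine adds the masked two's complement of the
# modulus, i.e. subtracts mod exactly when the addition carried out,
# without a data-dependent branch. A hedged C sketch (illustrative
# only; inter-limb carry propagation elided):
#
#	uint64_t mask = carried_out ? ~(uint64_t)0 : 0;
#	/* out += (2^512 - mod) & mask, as one 512-bit addition */
#	out[0] += (0 - mod[0]) & mask;		/* negq */
#	for (i = 1; i < 8; i++)
#		out[i] += ~mod[i] & mask;	/* notq + adcq chain */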
{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers: everything but %rdi, %rsi and %rbp
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
.cfi_startproc
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.cfi_endproc
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
.cfi_startproc
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.cfi_endproc
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
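# The __rsaz_512_mulx flavour below keeps two carry chains in flight:
# mulx writes a 128-bit product without touching flags, adcx adds
# through CF only and adox through OF only, so the low and high product
# halves can be folded into the accumulator in a single interleaved
# pass. A hedged pseudocode sketch of one .Loop_mulx round
# (illustrative only; hi/lo are the two mulx outputs):
#
#	for (j = 0; j < 8; j++) {
#		(hi, lo) = mulx(b[i], a[j]);	/* flags preserved */
#		acc[j]   += lo + CF;		/* adcx chain */
#		acc[j+1] += hi + OF;		/* adox chain */
#	}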
if ($addx) {
	# __rsaz_512_mulx
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
.cfi_startproc
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.cfi_endproc
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
.cfi_startproc
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.cfi_endproc
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
.cfi_startproc
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4

.section	.rodata align=64
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
.previous
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";