#!/usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for x86_64.
#
# February 2018.
#
# This module implements radix 2^51 multiplication and squaring, and
# radix 2^64 multiplication, squaring, addition, subtraction and final
# reduction. Latter radix is used on ADCX/ADOX-capable processors such
# as Broadwell. On related note one should mention that there are
# vector implementations that provide significantly better performance
# on some processors(*), but they are large and overly complex. Which
# in combination with them being effectively processor-specific makes
# the undertaking hard to justify. The goal for this implementation
# is rather versatility and simplicity [and ultimately formal
# verification].
#
# (*)	For example sandy2x should provide ~30% improvement on Sandy
#	Bridge, but only nominal ~5% on Haswell [and big loss on
#	Broadwell and successors].
#
######################################################################
# Improvement coefficients:
#
#			amd64-51(*)	gcc-5.x(**)
#
# P4			+22%		+40%
# Sandy Bridge		-3%		+11%
# Haswell		-1%		+13%
# Broadwell(***)	+30%		+35%
# Skylake(***)		+33%		+47%
# Silvermont		+20%		+26%
# Goldmont		+40%		+50%
# Bulldozer		+20%		+9%
# Ryzen(***)		+43%		+40%
# VIA			+170%		+120%
#
# (*)	amd64-51 is popular assembly implementation with 2^51 radix,
#	only multiplication and squaring subroutines were linked
#	for comparison, but not complete ladder step; gain on most
#	processors is because this module refrains from shld, and
#	minor regression on others is because this does result in
#	higher instruction count;
# (**)	compiler is free to inline functions, in assembly one would
#	need to implement ladder step to do that, and it will improve
#	performance by several percent;
# (***)	ADCX/ADOX result for 2^64 radix, there is no corresponding
#	C implementation, so that comparison is always against
#	2^51 radix;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Probe the toolchain for ADCX/ADOX (ADX) support; $addx gates the
# radix-2^64 code path emitted below.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

########################################################################
# Base 2^51 subroutines. h0..h4 are accumulated in %rbx:%rcx, %r8:%r9,
# %r10:%r11, %r12:%r13, %r14:%r15 and folded by .Lreduce51.
$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function,3
.align	32
x25519_fe51_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul_body:

	mov	8*0(%rsi),%rax		# f[0]
	mov	8*0(%rdx),%r11		# load g[0-4]
	mov	8*1(%rdx),%r12
	mov	8*2(%rdx),%r13
	mov	8*3(%rdx),%rbp
	mov	8*4(%rdx),%r14

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	mov	%rax,%rdi
	mulq	%r11			# f[0]*g[0]
	mov	%r11,8*0(%rsp)		# offload g[0]
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	%rdi,%rax
	mov	%rdx,%rcx
	mulq	%r12			# f[0]*g[1]
	mov	%r12,8*1(%rsp)		# offload g[1]
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	%rdi,%rax
	lea	(%r14,%r14,8),%r15
	mov	%rdx,%r9
	mulq	%r13			# f[0]*g[2]
	mov	%r13,8*2(%rsp)		# offload g[2]
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	%rdi,%rax
	lea	(%r14,%r15,2),%rdi	# g[4]*19
	mov	%rdx,%r11
	mulq	%rbp			# f[0]*g[3]
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	8*0(%rsi),%rax		# f[0]
	mov	%rdx,%r13
	mulq	%r14			# f[0]*g[4]
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	8*1(%rsi),%rax		# f[1]
	mov	%rdx,%r15

	mulq	%rdi			# f[1]*g[4]*19
	add	%rax,%rbx
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%rcx
	mulq	%rdi			# f[2]*g[4]*19
	add	%rax,%r8
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r9
	mulq	%rdi			# f[3]*g[4]*19
	add	%rax,%r10
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r11
	mulq	%rdi			# f[4]*g[4]*19
	imulq	\$19,%rbp,%rdi		# g[3]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r13
	mulq	%rbp			# f[1]*g[3]
	mov	8*2(%rsp),%rbp		# g[2]
	add	%rax,%r14
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r15

	mulq	%rdi			# f[2]*g[3]*19
	add	%rax,%rbx
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%rcx
	mulq	%rdi			# f[3]*g[3]*19
	add	%rax,%r8
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r9
	mulq	%rdi			# f[4]*g[3]*19
	imulq	\$19,%rbp,%rdi		# g[2]*19
	add	%rax,%r10
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r11
	mulq	%rbp			# f[1]*g[2]
	add	%rax,%r12
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r13
	mulq	%rbp			# f[2]*g[2]
	mov	8*1(%rsp),%rbp		# g[1]
	add	%rax,%r14
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r15

	mulq	%rdi			# f[3]*g[2]*19
	add	%rax,%rbx
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%rcx
	mulq	%rdi			# f[4]*g[2]*19
	add	%rax,%r8
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%r9
	mulq	%rbp			# f[1]*g[1]
	imulq	\$19,%rbp,%rdi
	add	%rax,%r10
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r11
	mulq	%rbp			# f[2]*g[1]
	add	%rax,%r12
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r13
	mulq	%rbp			# f[3]*g[1]
	mov	8*0(%rsp),%rbp		# g[0]
	add	%rax,%r14
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r15

	mulq	%rdi			# f[4]*g[1]*19
	add	%rax,%rbx
	mov	8*1(%rsi),%rax		# f[1]
	adc	%rdx,%rcx
	mulq	%rbp			# f[1]*g[0]
	add	%rax,%r8
	mov	8*2(%rsi),%rax		# f[2]
	adc	%rdx,%r9
	mulq	%rbp			# f[2]*g[0]
	add	%rax,%r10
	mov	8*3(%rsi),%rax		# f[3]
	adc	%rdx,%r11
	mulq	%rbp			# f[3]*g[0]
	add	%rax,%r12
	mov	8*4(%rsi),%rax		# f[4]
	adc	%rdx,%r13
	mulq	%rbp			# f[4]*g[0]
	add	%rax,%r14
	adc	%rdx,%r15

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51
.Lfe51_mul_epilogue:
.cfi_endproc
.size	x25519_fe51_mul,.-x25519_fe51_mul

.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function,2
.align	32
x25519_fe51_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_sqr_body:

	mov	8*0(%rsi),%rax		# g[0]
	mov	8*2(%rsi),%r15		# g[2]
	mov	8*4(%rsi),%rbp		# g[4]

	mov	%rdi,8*4(%rsp)		# offload 1st argument
	lea	(%rax,%rax),%r14
	mulq	%rax			# g[0]*g[0]
	mov	%rax,%rbx
	mov	8*1(%rsi),%rax		# g[1]
	mov	%rdx,%rcx
	mulq	%r14			# 2*g[0]*g[1]
	mov	%rax,%r8
	mov	%r15,%rax
	mov	%r15,8*0(%rsp)		# offload g[2]
	mov	%rdx,%r9
	mulq	%r14			# 2*g[0]*g[2]
	mov	%rax,%r10
	mov	8*3(%rsi),%rax
	mov	%rdx,%r11
	imulq	\$19,%rbp,%rdi		# g[4]*19
	mulq	%r14			# 2*g[0]*g[3]
	mov	%rax,%r12
	mov	%rbp,%rax
	mov	%rdx,%r13
	mulq	%r14			# 2*g[0]*g[4]
	mov	%rax,%r14
	mov	%rbp,%rax
	mov	%rdx,%r15

	mulq	%rdi			# g[4]*g[4]*19
	add	%rax,%r12
	mov	8*1(%rsi),%rax		# g[1]
	adc	%rdx,%r13

	mov	8*3(%rsi),%rsi		# g[3]
	lea	(%rax,%rax),%rbp
	mulq	%rax			# g[1]*g[1]
	add	%rax,%r10
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r11
	mulq	%rbp			# 2*g[1]*g[2]
	add	%rax,%r12
	mov	%rbp,%rax
	adc	%rdx,%r13
	mulq	%rsi			# 2*g[1]*g[3]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	imulq	\$19,%rsi,%rbp		# g[3]*19
	mulq	%rdi			# 2*g[1]*g[4]*19
	add	%rax,%rbx
	lea	(%rsi,%rsi),%rax
	adc	%rdx,%rcx

	mulq	%rdi			# 2*g[3]*g[4]*19
	add	%rax,%r10
	mov	%rsi,%rax
	adc	%rdx,%r11
	mulq	%rbp			# g[3]*g[3]*19
	add	%rax,%r8
	mov	8*0(%rsp),%rax		# g[2]
	adc	%rdx,%r9

	lea	(%rax,%rax),%rsi
	mulq	%rax			# g[2]*g[2]
	add	%rax,%r14
	mov	%rbp,%rax
	adc	%rdx,%r15
	mulq	%rsi			# 2*g[2]*g[3]*19
	add	%rax,%rbx
	mov	%rsi,%rax
	adc	%rdx,%rcx
	mulq	%rdi			# 2*g[2]*g[4]*19
	add	%rax,%r8
	adc	%rdx,%r9

	mov	8*4(%rsp),%rdi		# restore 1st argument
	jmp	.Lreduce51

.align	32
.Lreduce51:
	mov	\$0x7ffffffffffff,%rbp	# 2^51-1, limb mask

	mov	%r10,%rdx
	shr	\$51,%r10
	shl	\$13,%r11
	and	%rbp,%rdx		# %rdx = g2 = h2 & mask
	or	%r10,%r11		# h2>>51
	add	%r11,%r12
	adc	\$0,%r13		# h3 += h2>>51

	mov	%rbx,%rax
	shr	\$51,%rbx
	shl	\$13,%rcx
	and	%rbp,%rax		# %rax = g0 = h0 & mask
	or	%rbx,%rcx		# h0>>51
	add	%rcx,%r8		# h1 += h0>>51
	adc	\$0,%r9

	mov	%r12,%rbx
	shr	\$51,%r12
	shl	\$13,%r13
	and	%rbp,%rbx		# %rbx = g3 = h3 & mask
	or	%r12,%r13		# h3>>51
	add	%r13,%r14		# h4 += h3>>51
	adc	\$0,%r15

	mov	%r8,%rcx
	shr	\$51,%r8
	shl	\$13,%r9
	and	%rbp,%rcx		# %rcx = g1 = h1 & mask
	or	%r8,%r9
	add	%r9,%rdx		# g2 += h1>>51

	mov	%r14,%r10
	shr	\$51,%r14
	shl	\$13,%r15
	and	%rbp,%r10		# %r10 = g4 = h4 & mask
	or	%r14,%r15		# h4>>51

	lea	(%r15,%r15,8),%r14
	lea	(%r15,%r14,2),%r15
	add	%r15,%rax		# g0 += (h4>>51)*19

	mov	%rdx,%r8
	and	%rbp,%rdx		# g2 &= mask
	shr	\$51,%r8
	add	%r8,%rbx		# g3 += g2>>51

	mov	%rax,%r9
	and	%rbp,%rax		# g0 &= mask
	shr	\$51,%r9
	add	%r9,%rcx		# g1 += g0>>51

	mov	%rax,8*0(%rdi)		# save the result
	mov	%rcx,8*1(%rdi)
	mov	%rdx,8*2(%rdi)
	mov	%rbx,8*3(%rdi)
	mov	%r10,8*4(%rdi)

	mov	8*5(%rsp),%r15
.cfi_restore	%r15
	mov	8*6(%rsp),%r14
.cfi_restore	%r14
	mov	8*7(%rsp),%r13
.cfi_restore	%r13
	mov	8*8(%rsp),%r12
.cfi_restore	%r12
	mov	8*9(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*10(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*11(%rsp),%rsp
.cfi_adjust_cfa_offset	-88
.Lfe51_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe51_sqr,.-x25519_fe51_sqr

.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function,2
.align	32
x25519_fe51_mul121666:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	-8*5(%rsp),%rsp
.cfi_adjust_cfa_offset	40
.Lfe51_mul121666_body:
	mov	\$121666,%eax

	mulq	8*0(%rsi)
	mov	%rax,%rbx		# %rbx:%rcx = h0
	mov	\$121666,%eax
	mov	%rdx,%rcx
	mulq	8*1(%rsi)
	mov	%rax,%r8		# %r8:%r9 = h1
	mov	\$121666,%eax
	mov	%rdx,%r9
	mulq	8*2(%rsi)
	mov	%rax,%r10		# %r10:%r11 = h2
	mov	\$121666,%eax
	mov	%rdx,%r11
	mulq	8*3(%rsi)
	mov	%rax,%r12		# %r12:%r13 = h3
	mov	\$121666,%eax
	mov	%rdx,%r13
	mulq	8*4(%rsi)
	mov	%rax,%r14		# %r14:%r15 = h4
	mov	%rdx,%r15

	jmp	.Lreduce51
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
########################################################################
# Base 2^64 subroutines modulo 2*(2^255-19)
#
if ($addx) {
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+8(%rip),%ecx
	xor	%eax,%eax
	and	\$0x80100,%ecx		# BMI2 and ADX feature bits
	cmp	\$0x80100,%ecx
	cmove	%ecx,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function,3
.align	32
x25519_fe64_mul:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_mul_body:

	mov	%rdx,%rax
	mov	8*0(%rdx),%rbp		# b[0]
	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rax),%rcx		# b[1]
	mov	8*2(%rax),$acc6		# b[2]
	mov	8*3(%rax),$acc7		# b[3]

	mulx	%rbp,$acc0,%rax		# a[0]*b[0]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rcx,$acc1,%rbx		# a[0]*b[1]
	adcx	%rax,$acc1
	mulx	$acc6,$acc2,%rax	# a[0]*b[2]
	adcx	%rbx,$acc2
	mulx	$acc7,$acc3,$acc4	# a[0]*b[3]
	mov	8*1(%rsi),%rdx		# a[1]
	adcx	%rax,$acc3
	mov	$acc6,(%rsp)		# offload b[2]
	adcx	%rdi,$acc4		# cf=0

	mulx	%rbp,%rax,%rbx		# a[1]*b[0]
	adox	%rax,$acc1
	adcx	%rbx,$acc2
	mulx	%rcx,%rax,%rbx		# a[1]*b[1]
	adox	%rax,$acc2
	adcx	%rbx,$acc3
	mulx	$acc6,%rax,%rbx		# a[1]*b[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	$acc7,%rax,$acc5	# a[1]*b[3]
	mov	8*2(%rsi),%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5		# cf=0
	adox	%rdi,$acc5		# of=0

	mulx	%rbp,%rax,%rbx		# a[2]*b[0]
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	%rcx,%rax,%rbx		# a[2]*b[1]
	adcx	%rax,$acc3
	adox	%rbx,$acc4
	mulx	$acc6,%rax,%rbx		# a[2]*b[2]
	adcx	%rax,$acc4
	adox	%rbx,$acc5
	mulx	$acc7,%rax,$acc6	# a[2]*b[3]
	mov	8*3(%rsi),%rdx		# a[3]
	adcx	%rax,$acc5
	adox	%rdi,$acc6		# of=0
	adcx	%rdi,$acc6		# cf=0

	mulx	%rbp,%rax,%rbx		# a[3]*b[0]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rcx,%rax,%rbx		# a[3]*b[1]
	adox	%rax,$acc4
	adcx	%rbx,$acc5
	mulx	(%rsp),%rax,%rbx	# a[3]*b[2]
	adox	%rax,$acc5
	adcx	%rbx,$acc6
	mulx	$acc7,%rax,$acc7	# a[3]*b[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0

	jmp	.Lreduce64
.Lfe64_mul_epilogue:
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function,2
.align	32
x25519_fe64_sqr:
.cfi_startproc
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	%rdi			# offload dst
.cfi_push	%rdi
	lea	-8*2(%rsp),%rsp
.cfi_adjust_cfa_offset	16
.Lfe64_sqr_body:

	mov	8*0(%rsi),%rdx		# a[0]
	mov	8*1(%rsi),%rcx		# a[1]
	mov	8*2(%rsi),%rbp		# a[2]
	mov	8*3(%rsi),%rsi		# a[3]

	################################################################
	mulx	%rdx,$acc0,$acc7	# a[0]*a[0]
	mulx	%rcx,$acc1,%rax		# a[0]*a[1]
	xor	%edi,%edi		# cf=0,of=0
	mulx	%rbp,$acc2,%rbx		# a[0]*a[2]
	adcx	%rax,$acc2
	mulx	%rsi,$acc3,$acc4	# a[0]*a[3]
	mov	%rcx,%rdx		# a[1]
	adcx	%rbx,$acc3
	adcx	%rdi,$acc4		# cf=0

	################################################################
	mulx	%rbp,%rax,%rbx		# a[1]*a[2]
	adox	%rax,$acc3
	adcx	%rbx,$acc4
	mulx	%rsi,%rax,$acc5		# a[1]*a[3]
	mov	%rbp,%rdx		# a[2]
	adox	%rax,$acc4
	adcx	%rdi,$acc5

	################################################################
	mulx	%rsi,%rax,$acc6		# a[2]*a[3]
	mov	%rcx,%rdx		# a[1]
	adox	%rax,$acc5
	adcx	%rdi,$acc6		# cf=0
	adox	%rdi,$acc6		# of=0

	adcx	$acc1,$acc1		# acc1:6<<1
	adox	$acc7,$acc1
	adcx	$acc2,$acc2
	mulx	%rdx,%rax,%rbx		# a[1]*a[1]
	mov	%rbp,%rdx		# a[2]
	adcx	$acc3,$acc3
	adox	%rax,$acc2
	adcx	$acc4,$acc4
	adox	%rbx,$acc3
	mulx	%rdx,%rax,%rbx		# a[2]*a[2]
	mov	%rsi,%rdx		# a[3]
	adcx	$acc5,$acc5
	adox	%rax,$acc4
	adcx	$acc6,$acc6
	adox	%rbx,$acc5
	mulx	%rdx,%rax,$acc7		# a[3]*a[3]
	mov	\$38,%edx
	adox	%rax,$acc6
	adcx	%rdi,$acc7		# cf=0
	adox	%rdi,$acc7		# of=0
	jmp	.Lreduce64

.align	32
.Lreduce64:
	mulx	$acc4,%rax,%rbx
	adcx	%rax,$acc0
	adox	%rbx,$acc1
	mulx	$acc5,%rax,%rbx
	adcx	%rax,$acc1
	adox	%rbx,$acc2
	mulx	$acc6,%rax,%rbx
	adcx	%rax,$acc2
	adox	%rbx,$acc3
	mulx	$acc7,%rax,$acc4
	adcx	%rax,$acc3
	adox	%rdi,$acc4
	adcx	%rdi,$acc4

	mov	8*2(%rsp),%rdi		# restore dst
	imulq	%rdx,$acc4		# carry limb * 38

	add	$acc4,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

	mov	8*3(%rsp),%r15
.cfi_restore	%r15
	mov	8*4(%rsp),%r14
.cfi_restore	%r14
	mov	8*5(%rsp),%r13
.cfi_restore	%r13
	mov	8*6(%rsp),%r12
.cfi_restore	%r12
	mov	8*7(%rsp),%rbx
.cfi_restore	%rbx
	mov	8*8(%rsp),%rbp
.cfi_restore	%rbp
	lea	8*9(%rsp),%rsp
.cfi_adjust_cfa_offset	-72
.Lfe64_sqr_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function,2
.align	32
x25519_fe64_mul121666:
.Lfe64_mul121666_body:
.cfi_startproc
	mov	\$121666,%edx
	mulx	8*0(%rsi),$acc0,%rcx
	mulx	8*1(%rsi),$acc1,%rax
	add	%rcx,$acc1
	mulx	8*2(%rsi),$acc2,%rcx
	adc	%rax,$acc2
	mulx	8*3(%rsi),$acc3,%rax
	adc	%rcx,$acc3
	adc	\$0,%rax

	imulq	\$38,%rax,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)
	mov	$acc0,8*0(%rdi)

.Lfe64_mul121666_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function,3
.align	32
x25519_fe64_add:
.Lfe64_add_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	add	8*0(%rdx),$acc0
	adc	8*1(%rdx),$acc1
	adc	8*2(%rdx),$acc2
	adc	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	add	%rax,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	adc	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	add	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_add_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function,3
.align	32
x25519_fe64_sub:
.Lfe64_sub_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	sub	8*0(%rdx),$acc0
	sbb	8*1(%rdx),$acc1
	sbb	8*2(%rdx),$acc2
	sbb	8*3(%rdx),$acc3

	sbb	%rax,%rax		# cf -> mask
	and	\$38,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	mov	$acc1,8*1(%rdi)
	sbb	\$0,$acc3
	mov	$acc2,8*2(%rdi)
	sbb	%rax,%rax		# cf -> mask
	mov	$acc3,8*3(%rdi)
	and	\$38,%rax

	sub	%rax,$acc0
	mov	$acc0,8*0(%rdi)

.Lfe64_sub_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function,2
.align	32
x25519_fe64_tobytes:
.Lfe64_to_body:
.cfi_startproc
	mov	8*0(%rsi),$acc0
	mov	8*1(%rsi),$acc1
	mov	8*2(%rsi),$acc2
	mov	8*3(%rsi),$acc3

	################################# reduction modulo 2^255-19
	lea	($acc3,$acc3),%rax
	sar	\$63,$acc3		# most significant bit -> mask
	shr	\$1,%rax		# most significant bit cleared
	and	\$19,$acc3
	add	\$19,$acc3		# compare to modulus in the same go

	add	$acc3,$acc0
	adc	\$0,$acc1
	adc	\$0,$acc2
	adc	\$0,%rax

	lea	(%rax,%rax),$acc3
	sar	\$63,%rax		# most significant bit -> mask
	shr	\$1,$acc3		# most significant bit cleared
	not	%rax
	and	\$19,%rax

	sub	%rax,$acc0
	sbb	\$0,$acc1
	sbb	\$0,$acc2
	sbb	\$0,$acc3

	mov	$acc0,8*0(%rdi)
	mov	$acc1,8*1(%rdi)
	mov	$acc2,8*2(%rdi)
	mov	$acc3,8*3(%rdi)

.Lfe64_to_epilogue:
	ret
.cfi_endproc
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
} else {
# No ADX support in the toolchain: emit stubs that trap (ud2) so any
# caller that ignores x25519_fe64_eligible()==0 fails loudly.
$code.=<<___;
.globl	x25519_fe64_eligible
.type	x25519_fe64_eligible,\@abi-omnipotent
.align	32
x25519_fe64_eligible:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	x25519_fe64_eligible,.-x25519_fe64_eligible

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@abi-omnipotent
.globl	x25519_fe64_sqr
.globl	x25519_fe64_mul121666
.globl	x25519_fe64_add
.globl	x25519_fe64_sub
.globl	x25519_fe64_tobytes
x25519_fe64_mul:
x25519_fe64_sqr:
x25519_fe64_mul121666:
x25519_fe64_add:
x25519_fe64_sub:
x25519_fe64_tobytes:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2
	ret
.cfi_endproc
.size	x25519_fe64_mul,.-x25519_fe64_mul
___
}
$code.=<<___;
.asciz	"X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

.type	short_handler,\@abi-omnipotent
.align	16
short_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp
	jmp	.Lcommon_seh_tail
.size	short_handler,.-short_handler

.type	full_handler,\@abi-omnipotent
.align	16
full_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rax,%r10),%rax	# stashed frame size

	mov	-8(%rax),%rbp
	mov	-16(%rax),%rbx
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	full_handler,.-full_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_x25519_fe51_mul
	.rva	.LSEH_end_x25519_fe51_mul
	.rva	.LSEH_info_x25519_fe51_mul

	.rva	.LSEH_begin_x25519_fe51_sqr
	.rva	.LSEH_end_x25519_fe51_sqr
	.rva	.LSEH_info_x25519_fe51_sqr

	.rva	.LSEH_begin_x25519_fe51_mul121666
	.rva	.LSEH_end_x25519_fe51_mul121666
	.rva	.LSEH_info_x25519_fe51_mul121666
___
$code.=<<___	if ($addx);
	.rva	.LSEH_begin_x25519_fe64_mul
	.rva	.LSEH_end_x25519_fe64_mul
	.rva	.LSEH_info_x25519_fe64_mul

	.rva	.LSEH_begin_x25519_fe64_sqr
	.rva	.LSEH_end_x25519_fe64_sqr
	.rva	.LSEH_info_x25519_fe64_sqr

	.rva	.LSEH_begin_x25519_fe64_mul121666
	.rva	.LSEH_end_x25519_fe64_mul121666
	.rva	.LSEH_info_x25519_fe64_mul121666

	.rva	.LSEH_begin_x25519_fe64_add
	.rva	.LSEH_end_x25519_fe64_add
	.rva	.LSEH_info_x25519_fe64_add

	.rva	.LSEH_begin_x25519_fe64_sub
	.rva	.LSEH_end_x25519_fe64_sub
	.rva	.LSEH_info_x25519_fe64_sub

	.rva	.LSEH_begin_x25519_fe64_tobytes
	.rva	.LSEH_end_x25519_fe64_tobytes
	.rva	.LSEH_info_x25519_fe64_tobytes
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_x25519_fe51_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul_body,.Lfe51_mul_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_sqr_body,.Lfe51_sqr_epilogue	# HandlerData[]
	.long	88,0
.LSEH_info_x25519_fe51_mul121666:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe51_mul121666_body,.Lfe51_mul121666_epilogue	# HandlerData[]
	.long	88,0
___
$code.=<<___	if ($addx);
.LSEH_info_x25519_fe64_mul:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_mul_body,.Lfe64_mul_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_sqr:
	.byte	9,0,0,0
	.rva	full_handler
	.rva	.Lfe64_sqr_body,.Lfe64_sqr_epilogue	# HandlerData[]
	.long	72,0
.LSEH_info_x25519_fe64_mul121666:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_mul121666_body,.Lfe64_mul121666_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_add:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_add_body,.Lfe64_add_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_sub:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_sub_body,.Lfe64_sub_epilogue	# HandlerData[]
.LSEH_info_x25519_fe64_tobytes:
	.byte	9,0,0,0
	.rva	short_handler
	.rva	.Lfe64_to_body,.Lfe64_to_epilogue	# HandlerData[]
___
}

# Expand backtick-quoted expressions and pipe the result through xlate.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
1090.LSEH_info_x25519_fe51_sqr: 1091 .byte 9,0,0,0 1092 .rva full_handler 1093 .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[] 1094 .long 88,0 1095.LSEH_info_x25519_fe51_mul121666: 1096 .byte 9,0,0,0 1097 .rva full_handler 1098 .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[] 1099 .long 88,0 1100___ 1101$code.=<<___ if ($addx); 1102.LSEH_info_x25519_fe64_mul: 1103 .byte 9,0,0,0 1104 .rva full_handler 1105 .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[] 1106 .long 72,0 1107.LSEH_info_x25519_fe64_sqr: 1108 .byte 9,0,0,0 1109 .rva full_handler 1110 .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[] 1111 .long 72,0 1112.LSEH_info_x25519_fe64_mul121666: 1113 .byte 9,0,0,0 1114 .rva short_handler 1115 .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[] 1116.LSEH_info_x25519_fe64_add: 1117 .byte 9,0,0,0 1118 .rva short_handler 1119 .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[] 1120.LSEH_info_x25519_fe64_sub: 1121 .byte 9,0,0,0 1122 .rva short_handler 1123 .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[] 1124.LSEH_info_x25519_fe64_tobytes: 1125 .byte 9,0,0,0 1126 .rva short_handler 1127 .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[] 1128___ 1129} 1130 1131$code =~ s/\`([^\`]*)\`/eval $1/gem; 1132print $code; 1133close STDOUT or die "error closing STDOUT: $!"; 1134